In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

In [None]:
df=pd.read_csv('worldhappiness.csv')
df

In [None]:
df.drop(['Country','Region'],axis=1,inplace=True)

In [None]:
df

In [None]:
df.columns

In [None]:
df.columns.size

In [None]:
df.shape

In [None]:
df.index

In [None]:
df.info()

In [None]:
df.dtypes

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
# Visual check for missing values: draw the boolean null mask as a heatmap.
null_mask=df.isnull()
ax=sns.heatmap(null_mask)
ax.set_title("Null values")
plt.show()

In [None]:
# Example of univariate analysis: distribution of a single column.
# The title and axis labels now describe what is plotted (the previous
# title "1 vs 5" did not relate to this histogram).
ax=df["Happiness Rank"].hist(grid=True)
ax.set_title("Distribution of Happiness Rank")
ax.set_xlabel("Happiness Rank")
ax.set_ylabel("Number of rows")
plt.show()

In [None]:
# Example of bivariate analysis: relationship between two columns.
# color="black" replaces c=(0,0,0): a bare RGB tuple passed as `c` is
# ambiguous (it can be read as three per-point values) and makes matplotlib
# emit a warning; `color` states the intent unambiguously.
plt.scatter(df["Family"],df["Freedom"],alpha=0.55,color="black")
plt.xlabel("Family")
plt.ylabel("Freedom")
plt.title("Family vs Freedom")
plt.show()

# checking correlation

In [None]:
# Example of multivariate analysis: pairwise correlation between all columns.
corr_mat=df.corr()

# Draw the annotated matrix on an explicitly created 14x12 inch axes.
fig,ax=plt.subplots(figsize=(14,12))
sns.heatmap(corr_mat,annot=True,ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
df.isnull().sum()

In [None]:
df.dtypes

In [None]:
from sklearn.preprocessing import LabelEncoder

LE=LabelEncoder()
# Label encoding is meant for categorical/text columns. Applying it to a
# numeric column would replace every measurement with the index of its sorted
# unique value and throw away the actual magnitudes, so the encoding is only
# applied when "Family" is not already numeric.
if not pd.api.types.is_numeric_dtype(df["Family"]):
    df["Family"]=LE.fit_transform(df["Family"])


In [None]:
# Correlation of every column with "Family", largest value first.
corr_matrix=df.corr()
corr_matrix.loc[:,"Family"].sort_values(ascending=False)

# Let's check the data distribution across all the columns

In [None]:
# Another example of multivariate analysis: one density curve per column.
# The grid is sized from the frame itself (one row per numeric column) instead
# of a hard-coded 16x1 layout, each panel is titled with its column name, and
# the figure/font sizes are large enough to read; the previous 1-inch-wide
# figure with fontsize=1 was illegible and failed for more than 16 columns.
numeric_df=df.select_dtypes(include='number')
n_plots=numeric_df.shape[1]
numeric_df.plot(kind='density',subplots=True,layout=(n_plots,1),sharex=False,legend=False,
                title=list(numeric_df.columns),fontsize=8,figsize=(8,2*n_plots))
plt.tight_layout()
plt.show()

# We can see skewed data in multiple columns; we will handle the skewness in later steps

In [None]:
# One more example of multivariate analysis: a grid of pairwise plots.
pair_grid=sns.pairplot(data=df)
plt.show()

In [None]:
#splitting the independent and target variables in x and y
x=df.drop("Freedom",axis=1)
y=df["Freedom"]

In [None]:
df

In [None]:
#checking skewness
x.skew()

We can see skewness in the dataset. We will remove the skewness using the power_transform function.

In [None]:
# Split the frame into the target column (y) and all remaining columns (x).
# The target is looked up under two spellings: this notebook reads a
# "Happiness Rank" column from the same CSV, so the score may be stored as
# 'Happiness Score' rather than 'Score', in which case df['Score'] raises
# KeyError.
# NOTE(review): confirm the actual column name against worldhappiness.csv.
# NOTE(review): "Happiness Rank" stays in x; if it is derived from the score
# it leaks the target into the features - confirm.
target_col='Score' if 'Score' in df.columns else 'Happiness Score'
y=df[target_col]
x=df.drop(columns=[target_col])

In [None]:
scaler=MinMaxScaler()
x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns)

In [None]:
df.skew()

# checking outliers

There are 61 columns in the data set, so it is not possible to plot each column separately or to plot them all together. So we will plot them in 2 steps.

In [None]:
# Box plots for the first 15 columns of x (positions 0-14).
first_batch=x.iloc[:,:15]
first_batch.boxplot(figsize=[20,8])
plt.subplots_adjust(bottom=0.25)
plt.show()

In [None]:
# Box plots for the next batch of columns of x (positions 15-29).
# The slice starts at 15, directly after the first batch (0-14); starting at
# 16 silently skipped the column at position 15.
second_batch=x.iloc[:,15:30]
# Nothing to draw when x has 15 columns or fewer.
if second_batch.shape[1]>0:
    second_batch.boxplot(figsize=[20,8])
    plt.subplots_adjust(bottom=0.25)
    plt.show()

# We can see 1 or 2 values for a total of 3 columns, but those are very near to the whiskers, so these are not outliers

# Finding best random_state


In [None]:
from sklearn.linear_model import LogisticRegression

# Try seeds 1..199 for the 70/30 split and remember the seed whose
# LogisticRegression fit reaches the highest accuracy on its test part.
# NOTE(review): LogisticRegression and accuracy_score expect class labels; if
# y holds a continuous score this cell cannot run as a classifier - confirm.
# NOTE(review): picking the seed by test accuracy makes the reported number
# optimistic - confirm this is intended.
maxAccu=0
maxRS=0
for seed in range(1,200):
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.30,random_state=seed)
    LR=LogisticRegression()
    LR.fit(x_train,y_train)
    acc=accuracy_score(y_test,LR.predict(x_test))
    # Only a strictly better accuracy replaces the current best.
    if acc<=maxAccu:
        continue
    maxAccu,maxRS=acc,seed
print("Best accuracy is ",maxAccu,"on Random_state",maxRS)

# Creating train-test split

In [None]:
# 70/30 train/test split used by the model cells below.
# NOTE(review): 137 is hard-coded rather than taken from maxRS computed by the
# seed search - confirm it matches that search's result.
x_train,x_test,y_train,y_test=train_test_split(
    x,y,test_size=0.30,random_state=137)



In [None]:
from sklearn.linear_model import LogisticRegression

# Fit on the training part, predict the test part, then print accuracy,
# confusion matrix and classification report in that order.
LR=LogisticRegression()
LR.fit(x_train,y_train)
predlr=LR.predict(x_test)
for metric in (accuracy_score,confusion_matrix,classification_report):
    print(metric(y_test,predlr))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state pins the forest's internal randomness so that the scores below
# are the same on every Restart & Run All; without it each run printed
# different numbers.
rf=RandomForestClassifier(random_state=0)
rf.fit(x_train,y_train)
predrf=rf.predict(x_test)
# Accuracy, confusion matrix and classification report on the test part.
print(accuracy_score(y_test,predrf))
print(confusion_matrix(y_test,predrf))
print(classification_report(y_test,predrf))


In [None]:
from sklearn.svm import SVC

# Fit a default SVC on the training part and predict the test part.
svc=SVC()
svc.fit(x_train,y_train)

ad_pred=svc.predict(x_test)
# Print accuracy, confusion matrix and classification report in that order.
for metric in (accuracy_score,confusion_matrix,classification_report):
    print(metric(y_test,ad_pred))

# We are getting accuracy with ___, but it can be due to overfitting also, so we will check the cross validation scores

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of LR on the full x/y; report the mean fold score.
scr=cross_val_score(estimator=LR,X=x,y=y,cv=5)
mean_score=scr.mean()
print("cross validation score of LogisticRegression model :",mean_score)

In [None]:
from sklearn.model_selection import cross_val_score

# No earlier cell creates `dt`, so cross_val_score(dt, ...) raised NameError.
# The estimator is built here (DecisionTreeClassifier comes from the
# notebook's import cell); cross_val_score fits its own copy per fold, so no
# separate fit call is needed. random_state keeps the score reproducible.
dt=DecisionTreeClassifier(random_state=0)
scr=cross_val_score(dt,x,y,cv=5)
print("cross validation score of DecisionTree model :",scr.mean())

In [None]:
from sklearn.model_selection import cross_val_score

scr=cross_val_score(rf,x,y,cv=5)
print("cross validation score of RandomForest model :",scr.mean())

In [None]:
from sklearn.model_selection import cross_val_score

scr=cross_val_score(svc,x,y,cv=5)
print("cross validation score of SVC model :",scr.mean())

The minimum difference between accuracy and cross validation score is for DecisionTreeClassifier (___), so this is the best model.

# hyper parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Parameter grid passed to GridSearchCV: tree depths 2..14 and both split
# criteria. Two fixes: the assignment was missing its '=' (SyntaxError), and
# the key was misspelled 'critersion', which is not a DecisionTreeClassifier
# parameter and would make the grid search fail.

parameters={'max_depth': np.arange(2,15),
            'criterion':['gini','entropy']}

In [None]:
# 5-fold grid search over `parameters` for a decision tree.
# DecisionTreeClassifier is provided by the notebook's import cell (it was
# never imported before, so this line raised NameError). random_state keeps
# the fitted trees, and therefore the selected parameters, reproducible.
GCV=GridSearchCV(DecisionTreeClassifier(random_state=0),parameters,cv=5)


In [None]:
GCV.fit(x_train,y_train)

In [None]:
GCV.best_params_

In [None]:
GCV_pred=GCV.best_estimator_.predict(x_test)

In [None]:
accuracy_score(y_test,GCV_pred)

In [None]:
import joblib

# Persist the best estimator from the grid search to disk.
best_model=GCV.best_estimator_
joblib.dump(best_model,"WorldhappinessModelDTC.pkl")