Regression on Happiness Score

In [None]:
#set up a ColumnTransformer with StandardScaler for numerical features and OneHotEncoder for categorical features.
#set up and training a LinearRegression model using scikit-learn, including data preprocessing steps within a Pipeline.
#implement polynomial regression
#perform hyperparameter tuning for a polynomial regression model
#evaluate the performance of a regression model on test data
#use OneHotEncoder with handle_unknown='ignore' within a preprocessing pipeline to handle unseen categories during model training and evaluation
#set up and execute cross_val_score or GridSearchCV to perform cross-validation
#perform hyperparameter tuning for an SVM model using grid search
#calculate and display performance metrics
#integrate PolynomialFeatures in a pipeline before an SVM model
#Use different kernels such as linear, polynomial, and RBF

#Import Library

In [1]:
import pandas as pd
import numpy as np

# Import Dataset

In [19]:
df = pd.read_csv('world_happiness.csv')
df.sample(10)

Unnamed: 0.1,Unnamed: 0,country,social_support,freedom,corruption,generosity,gdp_per_cap,life_exp,happiness_score
22,23,Mexico,67.0,71.0,87.0,120.0,18000,75.6,133
112,113,Bangladesh,126.0,27.0,36.0,107.0,4140,73.7,31
114,115,Mali,112.0,110.0,107.0,138.0,2100,62.9,28
120,121,Ethiopia,119.0,106.0,53.0,99.0,1900,69.1,22
128,129,Comoros,143.0,148.0,81.0,62.0,2480,69.1,14
141,142,Central African Republic,155.0,133.0,122.0,113.0,794,52.9,1
86,87,Bhutan,68.0,59.0,25.0,13.0,9710,74.7,61
132,133,Zimbabwe,110.0,96.0,63.0,141.0,2390,62.0,10
45,46,Cyprus,90.0,81.0,115.0,39.0,34500,82.0,107
55,56,Honduras,84.0,39.0,79.0,51.0,4630,74.3,97


# EDA

In [4]:
df.columns

Index(['Unnamed: 0', 'country', 'social_support', 'freedom', 'corruption',
       'generosity', 'gdp_per_cap', 'life_exp', 'happiness_score'],
      dtype='object')

In [5]:
df.shape

(143, 9)

In [7]:
df['country'].unique()

array(['Finland', 'Denmark', 'Norway', 'Iceland', 'Netherlands',
       'Switzerland', 'Sweden', 'New Zealand', 'Canada', 'Austria',
       'Australia', 'Costa Rica', 'Israel', 'Luxembourg',
       'United Kingdom', 'Ireland', 'Germany', 'Belgium', 'United States',
       'Czech Republic', 'United Arab Emirates', 'Malta', 'Mexico',
       'France', 'Chile', 'Guatemala', 'Saudi Arabia', 'Qatar', 'Spain',
       'Panama', 'Brazil', 'Uruguay', 'Singapore', 'El Salvador', 'Italy',
       'Bahrain', 'Trinidad and Tobago', 'Poland', 'Uzbekistan',
       'Lithuania', 'Colombia', 'Slovenia', 'Nicaragua', 'Argentina',
       'Romania', 'Cyprus', 'Ecuador', 'Kuwait', 'Thailand', 'Latvia',
       'South Korea', 'Estonia', 'Jamaica', 'Mauritius', 'Japan',
       'Honduras', 'Kazakhstan', 'Bolivia', 'Hungary', 'Paraguay', 'Peru',
       'Portugal', 'Pakistan', 'Russia', 'Philippines', 'Serbia',
       'Moldova', 'Libya', 'Montenegro', 'Tajikistan', 'Croatia',
       'Dominican Republic', 'Turkey', 

# Check for null values and preprocessing

In [16]:
df.isna().sum()

Unnamed: 0         0
country            0
social_support     1
freedom            1
corruption         8
generosity         1
gdp_per_cap        0
life_exp           0
happiness_score    0
dtype: int64

In [24]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
numerical_features = ['social_support', 'freedom', 'corruption',  'generosity', 'gdp_per_cap', 'life_exp', 'happiness_score']
df[numerical_features] = imputer.fit_transform(df[numerical_features])

df.sample(5)

Unnamed: 0.1,Unnamed: 0,country,social_support,freedom,corruption,generosity,gdp_per_cap,life_exp,happiness_score
106,107,Georgia,147.0,104.0,28.0,153.0,10700.0,73.2,37.0
65,66,Serbia,57.0,124.0,118.0,84.0,16700.0,76.0,86.0
13,14,Luxembourg,27.0,28.0,9.0,30.0,94300.0,82.0,142.0
113,114,Iraq,124.0,130.0,66.0,73.0,15700.0,77.1,30.0
128,129,Comoros,143.0,148.0,81.0,62.0,2480.0,69.1,14.0


In [25]:
df.isna().sum()

Unnamed: 0         0
country            0
social_support     0
freedom            0
corruption         0
generosity         0
gdp_per_cap        0
life_exp           0
happiness_score    0
dtype: int64

# Set up a ColumnTransformer with StandardScaler for numerical features and OneHotEncoder for categorical features.

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

ct = ColumnTransformer(
    [('standard', StandardScaler(), numerical_features)]
)

X = ct.fit_transform(df)

print(X)

[[-1.65892081 -1.59118586 -1.69356661 ...  1.08669987  1.13246036
   1.67384369]
 [-1.61488964 -1.56908606 -1.7180244  ...  1.37452586  1.0192638
   1.65220568]
 [-1.63690523 -1.63538547 -1.59573542 ...  2.25263904  1.24565693
   1.63056768]
 ...
 [ 1.6214013   1.72378469  1.53486268 ... -0.89588457 -1.37201358
  -1.63677158]
 [ 1.70946363  1.23758901  1.19245351 ... -0.94300998 -2.95676545
  -1.65840959]
 [ 1.55535454  1.70168488 -0.29947214 ... -0.89100617 -1.99459467
  -1.6800476 ]]


# Set up and training a LinearRegression model using scikit-learn, including data preprocessing steps within a Pipeline.

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

lr = LinearRegression()

model = Pipeline([
    ('preprocessor', ct),
    ('linear_regression', lr)
])

X = df[numerical_features]
y = df['happiness_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_train)

print(y_pred)

[ 6.30000000e+01  1.26000000e+02  1.13000000e+02  6.10000000e+01
  1.30000000e+02  1.29000000e+02  1.32000000e+02  2.70000000e+01
  8.40000000e+01  1.08000000e+02  1.40000000e+02  4.90000000e+01
  1.14000000e+02  1.33000000e+02  8.30000000e+01  6.70000000e+01
  3.00000000e+00  1.05000000e+02  1.24000000e+02  3.10000000e+01
  7.30000000e+01  6.00000000e+00  1.21000000e+02  6.60000000e+01
  1.00000000e+00  8.90000000e+01  4.00000000e+00  8.00000000e+00
  9.90000000e+01  1.50000000e+02  6.20000000e+01  3.40000000e+01
  1.03000000e+02  1.19000000e+02  7.10000000e+01  1.20000000e+02
  1.20000000e+01  1.48000000e+02  1.09000000e+02  8.10000000e+01
  5.80000000e+01  6.40000000e+01  1.90000000e+01  1.30000000e+01
  1.47000000e+02  1.42000000e+02  3.50000000e+01  2.80000000e+01
  1.52000000e+02  1.38000000e+02  1.15000000e+02  7.70000000e+01
  2.20000000e+01  1.49000000e+02  4.20000000e+01  1.53000000e+02
  2.00000000e+01  9.80000000e+01  1.02000000e+02  3.90000000e+01
  1.06000000e+02  3.80000

# Evaluate the performance of a regression model on test data

In [29]:
from sklearn.metrics import mean_squared_error

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean squared error: {mse}")

Mean squared error: 4.907815566621513e-28


# Set up and execute cross_val_score to perform cross-validation

In [31]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

scores = cross_val_score(model, X, y, cv=5)

print(f"Cross-validation scores: {scores}")

Cross-validation scores: [1. 1. 1. 1. 1.]


# Calculate and display performance metrics
# integrate PolynomialFeatures in a pipeline before an SVM model
# Use different kernels such as linear, polynomial, and RBF

In [None]:
from sklearn.svm import SVR

kernels = ['linear', 'poly', 'rbf']
models = {}

for kernel in kernels:
    model = SVR(kernel=kernel)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    models[kernel] = model

    print(f"\nKernel: {kernel}")
    print("Training accuracy:", 100-train_mse)
    print("Testing accuracy:", 100-test_mse)
