## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option("display.max_columns",None)

## Importing the dataset over here

In [2]:
data=pd.read_csv("World-happiness-report-2024.csv")

In [3]:
data.head()

Unnamed: 0,Country name,Regional indicator,Ladder score,upperwhisker,lowerwhisker,Log GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Dystopia + residual
0,Finland,Western Europe,7.741,7.815,7.667,1.844,1.572,0.695,0.859,0.142,0.546,2.082
1,Denmark,Western Europe,7.583,7.665,7.5,1.908,1.52,0.699,0.823,0.204,0.548,1.881
2,Iceland,Western Europe,7.525,7.618,7.433,1.881,1.617,0.718,0.819,0.258,0.182,2.05
3,Sweden,Western Europe,7.344,7.422,7.267,1.878,1.501,0.724,0.838,0.221,0.524,1.658
4,Israel,Middle East and North Africa,7.341,7.405,7.277,1.803,1.513,0.74,0.641,0.153,0.193,2.298


## Taking care of duplicate observations if present over here

In [4]:
data.duplicated().sum()

0

## Taking care of missing values if present over here

In [5]:
data.isnull().sum()

Country name                    0
Regional indicator              0
Ladder score                    0
upperwhisker                    0
lowerwhisker                    0
Log GDP per capita              3
Social support                  3
Healthy life expectancy         3
Freedom to make life choices    3
Generosity                      3
Perceptions of corruption       3
Dystopia + residual             3
dtype: int64

## Dropping the missing values over here

In [6]:
data.dropna(inplace=True)

## Filtering all the numerical features over here

In [7]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!="O"]
for feature in numerical_features:
  print(feature)

Ladder score
upperwhisker
lowerwhisker
Log GDP per capita
Social support
Healthy life expectancy
Freedom to make life choices
Generosity
Perceptions of corruption
Dystopia + residual


In [8]:
data[numerical_features]

Unnamed: 0,Ladder score,upperwhisker,lowerwhisker,Log GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Dystopia + residual
0,7.741,7.815,7.667,1.844,1.572,0.695,0.859,0.142,0.546,2.082
1,7.583,7.665,7.500,1.908,1.520,0.699,0.823,0.204,0.548,1.881
2,7.525,7.618,7.433,1.881,1.617,0.718,0.819,0.258,0.182,2.050
3,7.344,7.422,7.267,1.878,1.501,0.724,0.838,0.221,0.524,1.658
4,7.341,7.405,7.277,1.803,1.513,0.740,0.641,0.153,0.193,2.298
...,...,...,...,...,...,...,...,...,...,...
138,3.295,3.462,3.128,0.534,0.665,0.262,0.473,0.189,0.072,1.102
139,3.245,3.366,3.124,0.654,0.566,0.253,0.469,0.181,0.053,1.068
140,3.186,3.469,2.904,0.771,0.851,0.000,0.523,0.082,0.085,0.875
141,2.707,2.797,2.616,1.377,0.577,0.556,0.173,0.068,0.029,-0.073


## Filtering all the categorical features over here

In [9]:
cat_features=[feature for feature in data.columns if data[feature].dtype=="O"]
for feature in cat_features:
  print(feature)

Country name
Regional indicator


In [10]:
data[cat_features]

Unnamed: 0,Country name,Regional indicator
0,Finland,Western Europe
1,Denmark,Western Europe
2,Iceland,Western Europe
3,Sweden,Western Europe
4,Israel,Middle East and North Africa
...,...,...
138,Congo (Kinshasa),Sub-Saharan Africa
139,Sierra Leone,Sub-Saharan Africa
140,Lesotho,Sub-Saharan Africa
141,Lebanon,Middle East and North Africa


In [11]:
for feature in cat_features:
  feature_mapping={category:index for index,category in enumerate(data[feature].unique())}
  data[feature]=data[feature].map(feature_mapping)

In [12]:
data

Unnamed: 0,Country name,Regional indicator,Ladder score,upperwhisker,lowerwhisker,Log GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Dystopia + residual
0,0,0,7.741,7.815,7.667,1.844,1.572,0.695,0.859,0.142,0.546,2.082
1,1,0,7.583,7.665,7.500,1.908,1.520,0.699,0.823,0.204,0.548,1.881
2,2,0,7.525,7.618,7.433,1.881,1.617,0.718,0.819,0.258,0.182,2.050
3,3,0,7.344,7.422,7.267,1.878,1.501,0.724,0.838,0.221,0.524,1.658
4,4,1,7.341,7.405,7.277,1.803,1.513,0.740,0.641,0.153,0.193,2.298
...,...,...,...,...,...,...,...,...,...,...,...,...
138,135,8,3.295,3.462,3.128,0.534,0.665,0.262,0.473,0.189,0.072,1.102
139,136,8,3.245,3.366,3.124,0.654,0.566,0.253,0.469,0.181,0.053,1.068
140,137,8,3.186,3.469,2.904,0.771,0.851,0.000,0.523,0.082,0.085,0.875
141,138,1,2.707,2.797,2.616,1.377,0.577,0.556,0.173,0.068,0.029,-0.073


In [13]:
data['Ladderscore']=data['Ladder score']

In [14]:
data.drop("Ladder score",axis=1,inplace=True)

## Creating the features and labels over here

In [15]:
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

## Splitting the dataset into training set and testing set over here to avoid the problem of overfitting

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the model on the training dataset over here

In [17]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
regressor=RandomForestRegressor(n_estimators=10,random_state=0)
regressor.fit(X_train,y_train)

In [18]:
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[6.27 6.23]
 [5.98 5.97]
 [7.15 7.12]
 [6.06 6.06]
 [5.11 5.11]
 [6.54 6.59]
 [3.39 3.5 ]
 [3.1  3.19]
 [4.11 4.05]
 [7.09 7.06]
 [4.43 4.42]
 [6.84 6.84]
 [5.89 5.88]
 [5.72 5.7 ]
 [3.91 3.98]
 [4.91 4.92]
 [4.59 4.66]
 [4.49 4.47]
 [6.48 6.45]
 [5.21 5.14]
 [5.19 5.16]
 [6.74 6.72]
 [4.89 4.83]
 [6.74 6.68]
 [7.4  7.53]
 [6.04 6.06]
 [6.58 6.61]
 [5.81 5.78]]


In [19]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.9980607828047589