In [52]:
#IMPORTING NECESSARY LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [91]:
#importing libraries for model building
from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error as MSE
import joblib
import xgboost as xgb

In [54]:
#imporing the data
data= pd.read_csv('data.csv')
test = pd.read_csv('test2.csv')
samp = pd.read_csv('sample_submission.csv')

In [55]:
#checking data info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2132 entries, 0 to 2131
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   year              2132 non-null   int64  
 1   status            2132 non-null   int64  
 2   life_expectancy   2125 non-null   float64
 3   adult_mortality   2125 non-null   float64
 4   infant_death      2132 non-null   int64  
 5   alcohol           2035 non-null   float64
 6   percentage_expen  2132 non-null   float64
 7   hepatitis_b       1850 non-null   float64
 8   measles           2132 non-null   int64  
 9   bmi               2115 non-null   float64
 10  under_5           2132 non-null   int64  
 11  polio             2124 non-null   float64
 12  total_expend      2014 non-null   float64
 13  diphtheria        2124 non-null   float64
 14  hiv/aids          2132 non-null   float64
 15  gdp               1911 non-null   float64
 16  population        1806 non-null   float64


In [56]:
#checking number of missing values
data.isna().sum()

year                  0
status                0
life_expectancy       7
adult_mortality       7
infant_death          0
alcohol              97
percentage_expen      0
hepatitis_b         282
measles               0
bmi                  17
under_5               0
polio                 8
total_expend        118
diphtheria            8
hiv/aids              0
gdp                 221
population          326
thin_10_19yrs        17
thin_5_9yrs          17
income_cor           87
schooling            84
dtype: int64

From above we can observe that there are a few rows with missing values in the respective columns.
After multiple iterations, the best performing model after evaluation I got was by dropping this missing values.

I will be dropping the rows with missing values.

In [57]:
#dropping missing values
data.dropna(axis =0,inplace=True)

After performing several iterations, dropping the year column improved the performance pf the model.

In [58]:
#dropping the year column
data.drop(columns=['year'],inplace=True)

Performing log transformation of the target variable.

In [59]:
#log transformation of target variable
data['life_expectancy'] = np.log1p(data['life_expectancy'])

In [85]:
#saving this dataset to csv
data.to_csv('data2.csv',index= False)

In [60]:
#creating the x variable
x=  data.drop(columns =['life_expectancy']).values

In [61]:
#creating the y variable
y = data['life_expectancy'].values.reshape(-1,1)

In [62]:
#splitting the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state= 42)

In [63]:
#fitting the model for Linear Regression model
linear_model = LR()
linear_model.fit(x_train,y_train)

In [64]:
#calculating the predicted values
predicted_lr = linear_model.predict(x_test)

In [65]:
#evaluating the RMSE
RMSE = np.sqrt(MSE(y_test, predicted_lr))
print(f'Root Mean Squared Error: {RMSE}')

Root Mean Squared Error: 0.06052737908572332


In [66]:
#setting lasso
lasso = make_pipeline(RobustScaler(),Lasso(alpha=0.0005,random_state =1))

In [67]:
##fitting the model for Lasso Regression model
lasso.fit(x_train,y_train)

  model = cd_fast.enet_coordinate_descent(


In [68]:
#calculating the predicted values
predicted_la = lasso.predict(x_test)

In [69]:
#evaluating the RMSE
RMSE = np.sqrt(MSE(y_test, predicted_la))
print(f'Root Mean Squared Error: {RMSE}')

Root Mean Squared Error: 0.060666395037390075


In [70]:
# setting XGB
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603,gamma=0.0468,learning_rate= 0.05,max_depth=3,min_child_weight=1.7817,n_estimators=2200,reg_alpha=0.4640,reg_lambda=0.8571,subsample=0.5213,silent=1,random_state= 7,nthread=-1)

In [71]:
# fitting the model for XGB Regressor
model_xgb.fit(x_train,y_train)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [72]:
#calculating the predicted values
predicted_xgb = model_xgb.predict(x_test)

In [73]:
#evaluating the RMSE
RMSE = np.sqrt(MSE(y_test, predicted_xgb))
print(f'Root Mean Squared Error: {RMSE}')

Root Mean Squared Error: 0.04622038961690339


In [87]:
#using gboost withut hyper parameter tuning
gboostt =GradientBoostingRegressor()

In [88]:
#fitting the model
gboostt = gboostt.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


In [89]:
#calculating the predicted values
predicted_gbs = gboostt.predict(x_test)

In [90]:
#evaluating the RMSE
RMSE = np.sqrt(MSE(y_test, predicted_gbs))
print(f'Root Mean Squared Error: {RMSE}')

Root Mean Squared Error: 0.0359940099583383


In [74]:
# Setting gradient boosting
GBoost = GradientBoostingRegressor(n_estimators=6000,learning_rate=0.05,max_depth=6,max_features='sqrt',min_samples_leaf=15,min_samples_split=10,loss='huber',random_state=5)

In [75]:
# fitting gradient boosting to the model
GBoost = GBoost.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


In [76]:
#calculating the predicted values
predicted_gb = GBoost.predict(x_test)

In [77]:
#evaluating the RMSE
RMSE = np.sqrt(MSE(y_test, predicted_gb))
print(f'Root Mean Squared Error: {RMSE}')

Root Mean Squared Error: 0.028025742725467324


Performing same changes done on the training set

In [78]:
#replacing the status column with 0 and 1
test['status'] = test['status'].replace('Developed','0').replace('Developing','1')

In [79]:
#changing the data type
test['status']= test['status'].apply(np.int64)

In [80]:
#dropping the year and id column to match training set
test.drop(columns=['year','id'],inplace=True)

In [86]:
#saving this as a csv to use in main notebook
test.to_csv('test3.csv',index = False)

In [92]:
#saving the model
fn = 'Life_expectancy_model.joblib'
joblib.dump(GBoost,fn)
print('Model saved as',fn)

Model saved as Life_expectancy_model.joblib


Using the Gradient boosting Model on the test Dataset since it performs best!

In [81]:
#directly imputing predictions into the life expectancy column of the submssion sample
samp['Life expectancy'] = GBoost.predict(test)



In [82]:
#reversing the log transformation
samp['Life expectancy'] = np.expm1(samp['Life expectancy'])

In [83]:
samp['Life expectancy']

0      77.919133
1      84.276816
2      68.317915
3      63.348102
4      74.432644
         ...    
801    75.203469
802    73.026961
803    52.216301
804    52.709036
805    75.780489
Name: Life expectancy, Length: 806, dtype: float64

In [94]:
#saving it into a csv file
samp.to_csv('samp_14.csv',index = False)