<a href="https://colab.research.google.com/github/KSR-16/Life-Expectancy-Model/blob/main/Group_6_Life_Expectancy_Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler
import statsmodels.api as sm

In [None]:
# Load the cleaned dataset; lift the column display limit so wide frames are shown in full
pd.options.display.max_columns = None
df = pd.read_csv('Clean_Life_Expectancy_Data.csv')

In [None]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,Region,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Life_expectancy
0,Middle East,11.1,13.0,105.824,1.32,97,65,27.8,97,97,0.08,11006,78.53,4.9,4.8,7.8,0,1,76.5
1,European Union,2.7,3.3,57.9025,10.35,97,94,26.0,97,97,0.09,25742,46.44,0.6,0.5,9.7,1,0,82.8
2,Asia,51.5,67.9,201.0765,1.57,60,35,21.2,67,64,0.13,1076,1183.21,27.1,28.0,5.0,0,1,65.4
3,South America,32.8,40.5,222.1965,5.68,93,74,25.3,92,93,0.79,4146,0.75,5.7,5.5,7.9,0,1,67.0
4,Middle East,3.4,4.3,57.951,2.89,97,89,27.0,94,94,0.08,33995,7.91,1.2,1.1,12.8,1,0,81.7


# Train/Test Split

In [None]:
# Predictors: every column of df except the target 'Life_expectancy'
feature_cols = df.columns.tolist()
feature_cols.remove('Life_expectancy')

# Feature matrix (X) and target vector (y)
X, y = df[feature_cols], df['Life_expectancy']

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preview the first five rows of the training features
X_train.head(n=5)

Unnamed: 0,Region,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing
2026,Asia,7.9,9.3,111.2825,2.45,99,99,22.9,99,99,0.01,3694,20.78,15.2,15.0,10.9,0,1
651,European Union,3.7,4.6,114.2985,13.42,98,98,26.6,96,98,0.08,14070,10.2,2.1,2.2,11.6,1,0
2225,South America,15.4,18.0,143.0785,6.6,78,83,26.6,79,78,0.4,16056,30.04,1.6,1.5,10.0,0,1
2357,Rest of Europe,11.8,13.3,80.9365,4.88,99,98,26.1,99,99,0.03,3577,2.91,1.4,1.5,9.3,0,1
670,Africa,43.3,74.4,495.7265,2.29,83,64,23.2,82,79,9.74,3298,1.88,14.2,14.3,5.8,0,1


# Feature Engineering

In [None]:
# Define feature engineering function
def feature_eng(df, state=None, fit=True):
    """One-hot encode, scale and add an intercept column to a feature frame.

    The input frame is copied first, so the caller's frame is not modified.

    Steps:
      1. One-hot encode 'Region' into 'Region_<name>' columns (integer 0/1).
      2. MinMax-scale 'BMI', 'Schooling' and 'Alcohol_consumption'.
      3. Robust-scale the columns listed in ``robust_list``.
      4. Add a 'const' column with ``sm.add_constant``.

    With the default arguments both scalers are fitted on ``df`` itself.
    To scale a test set with the statistics learned on the training set
    (instead of refitting on the test data), share a dict between two calls::

        fe_state = {}
        X_train_fe = feature_eng(X_train, state=fe_state)            # fit and store
        X_test_fe = feature_eng(X_test, state=fe_state, fit=False)   # reuse

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Region' plus every column named in the two scaling
        lists below.
    state : dict, optional
        Container for the fitted preprocessing state. When ``fit`` is True and
        a dict is given, it is updated in place with the keys 'minmax',
        'robust' (the fitted scalers) and 'columns' (the output column order).
        When ``fit`` is False it must already hold those keys.
    fit : bool, default True
        True: fit new scalers on ``df``. False: only transform ``df`` with the
        scalers stored in ``state`` and align the output columns to
        ``state['columns']``.

    Returns
    -------
    pandas.DataFrame
        The engineered copy of ``df``.

    Raises
    ------
    ValueError
        If ``fit`` is False and ``state`` is missing or incomplete.
    """
    state_keys = ('minmax', 'robust', 'columns')
    if not fit and (state is None or any(key not in state for key in state_keys)):
        raise ValueError(
            "fit=False requires a 'state' dict filled by an earlier call with fit=True"
        )

    df = df.copy() # Copy dataframe so the caller's frame is left untouched

    # --- One hot encoding ---
    # dtype=int keeps the dummies numeric (0/1); pandas >= 2.0 would otherwise
    # produce bool columns.
    df = pd.get_dummies(df, columns = ['Region'], prefix = 'Region', dtype = int)

    # Columns to be min-max scaled
    minmax_list = ['BMI', 'Schooling', 'Alcohol_consumption']
    # Columns to be robust scaled
    robust_list = ['Infant_deaths',
                   'Under_five_deaths',
                   'Adult_mortality',
                   'Hepatitis_B',
                   'Measles',
                   'Polio',
                   'Diphtheria',
                   'Incidents_HIV',
                   'GDP_per_capita',
                   'Population_mln',
                   'Thinness_ten_nineteen_years',
                   'Thinness_five_nine_years',]

    # --- MinMax and Robust scaling ---
    if fit:
        # Learn the scaling statistics from this frame
        minmax = MinMaxScaler()
        rob = RobustScaler()
        df[minmax_list] = minmax.fit_transform(df[minmax_list])
        df[robust_list] = rob.fit_transform(df[robust_list])
    else:
        # Reuse the statistics learned earlier; nothing is fitted on this frame
        minmax = state['minmax']
        rob = state['robust']
        df[minmax_list] = minmax.transform(df[minmax_list])
        df[robust_list] = rob.transform(df[robust_list])

    # --- Add constant ---
    # has_constant='add' always inserts 'const'; the default ('skip') leaves it
    # out whenever the frame already contains a constant column.
    df = sm.add_constant(df, has_constant = 'add')

    if fit:
        if state is not None:
            # Remember the fitted scalers and the column layout for later calls
            state.update(minmax = minmax, robust = rob, columns = list(df.columns))
    else:
        # Match the stored layout: dummy columns absent from this frame are
        # added as all-zero, columns not present in the stored layout are
        # dropped, and the column order follows state['columns'].
        df = df.reindex(columns = state['columns'], fill_value = 0)

    return df # Return engineered copy

In [None]:
# Run the feature engineering function on the training features and preview the result
X_train_fe = feature_eng(df=X_train)
X_train_fe.head(n=5)

Unnamed: 0,const,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Region_Africa,Region_Asia,Region_Central America and Caribbean,Region_European Union,Region_Middle East,Region_North America,Region_Oceania,Region_Rest of Europe,Region_South America
2026,1.0,-0.298246,-0.244969,-0.369456,0.137101,0.555556,0.551724,0.252033,0.375,0.375,-0.375,-0.045581,0.564994,2.051724,2.017241,0.75969,0,1,0,1,0,0,0,0,0,0,0
651,1.0,-0.403509,-0.327209,-0.34836,0.750979,0.5,0.517241,0.552846,0.1875,0.3125,-0.2,0.85527,0.0932,-0.206897,-0.189655,0.813953,1,0,0,0,0,1,0,0,0,0,0
2225,1.0,-0.110276,-0.092738,-0.147051,0.369334,-0.611111,0.0,0.552846,-0.875,-0.9375,0.6,1.027696,0.977926,-0.293103,-0.310345,0.689922,0,1,0,0,0,0,0,0,0,0,1
2357,1.0,-0.200501,-0.174978,-0.581719,0.273083,0.555556,0.517241,0.512195,0.375,0.375,-0.325,-0.055739,-0.231884,-0.327586,-0.310345,0.635659,0,1,0,0,0,0,0,0,0,1,0
670,1.0,0.588972,0.894138,2.319636,0.128148,-0.333333,-0.655172,0.276423,-0.6875,-0.875,23.95,-0.079962,-0.277815,1.87931,1.896552,0.364341,0,1,1,0,0,0,0,0,0,0,0


In [None]:
# Run the feature engineering function on the test features and preview the result
# NOTE(review): called like this, feature_eng fits its scalers on the frame it is
# given, so X_test is scaled with test-set statistics rather than the ones learned
# from X_train, and the Region dummy columns come only from regions present in
# X_test. Consider reusing the training-set scalers and column layout here.
X_test_fe = feature_eng(df=X_test)
X_test_fe.head(n=5)

Unnamed: 0,const,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Region_Africa,Region_Asia,Region_Central America and Caribbean,Region_European Union,Region_Middle East,Region_North America,Region_Oceania,Region_Rest of Europe,Region_South America
1590,1.0,-0.131868,-0.111969,0.102023,0.603565,0.4375,0.0,0.603306,0.125,0.1875,1.375,3.081819,-0.374263,-0.173077,-0.169811,0.751938,0,1,0,0,1,0,0,0,0,0,0
1752,1.0,-0.417582,-0.34749,-0.726046,0.580824,-0.0625,0.1,0.595041,0.1875,0.1875,-0.21875,3.930425,3.07185,-0.5,-0.54717,0.891473,1,0,0,0,0,0,0,0,0,1,0
772,1.0,0.793956,0.936293,0.415246,0.364474,-1.1875,-0.4,0.31405,0.0,0.0,2.90625,-0.297444,-0.383914,0.807692,0.792453,0.24031,0,1,0,0,1,0,0,0,0,0,0
1735,1.0,1.81044,2.206564,0.940366,0.088506,-1.0,0.0,0.181818,-0.875,-0.875,2.75,-0.315033,-0.023592,1.211538,1.169811,0.108527,0,1,1,0,0,0,0,0,0,0,0
387,1.0,-0.406593,-0.339768,-0.643476,0.559312,0.1875,-0.2,0.578512,0.1875,0.1875,-0.34375,1.746186,0.19571,-0.5,-0.490566,0.651163,1,0,0,0,0,1,0,0,0,0,0


In [None]:
# Check max and min of one of the robust-scaled columns
# (robust scaling does not bound the values to a fixed range)
print(f'Maximum of column Infant_deaths: {X_train_fe.Infant_deaths.max()}')
print(f'Minimum of column Infant_deaths: {X_train_fe.Infant_deaths.min()}')

Maximum of column Infant_deaths: 2.9022556390977443
Maximum of column Infant_deaths: -0.4511278195488722


In [None]:
# Check min and max of one of the minmax-scaled columns
# Both values are read from BMI so that each label matches the column it reports
print(f'Minimum of column BMI: {X_train_fe.BMI.min()}')
print(f'Maximum of column BMI: {X_train_fe.BMI.max()}')

Minimum of column BMI: 0.0
Maximum of column BMI: 0.9999999999999999


In [None]:
# Save train/test split data as csv files
# index=True writes the row index as the first column of each file
csv_outputs = {
    'X_train_fe.csv': X_train_fe,
    'X_test_fe.csv': X_test_fe,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}
for csv_name, csv_data in csv_outputs.items():
    csv_data.to_csv(csv_name, index=True)


## Save csv files on local machine to downloads
# google.colab only exists inside Google Colab; elsewhere the files written
# above stay in the working directory and the download step is skipped.
try:
    from google.colab import files
except ImportError:
    print('google.colab is not available - skipping browser download; '
          'the csv files were saved to the working directory.')
else:
    for csv_name in csv_outputs:
        files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>