In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.svm import SVC


In [3]:
df = pd.read_csv(r'..\data\raw\AgeDataset-V1.csv')
df = df[df.Gender.notna()].drop(['Id','Short description'],axis=1)


In [4]:
m= df[df['Gender']=='Male' ]
f= df[df['Gender']=='Female']
df=pd.concat([m,f],ignore_index=True)
df.head()

Unnamed: 0,Name,Gender,Country,Occupation,Birth year,Death year,Manner of death,Age of death
0,George Washington,Male,United States of America; Kingdom of Great Bri...,Politician,1732,1799.0,natural causes,67.0
1,Douglas Adams,Male,United Kingdom,Artist,1952,2001.0,natural causes,49.0
2,Abraham Lincoln,Male,United States of America,Politician,1809,1865.0,homicide,56.0
3,Wolfgang Amadeus Mozart,Male,Archduchy of Austria; Archbishopric of Salzburg,Artist,1756,1791.0,,35.0
4,Ludwig van Beethoven,Male,Holy Roman Empire; Austrian Empire,Artist,1770,1827.0,,57.0


In [5]:
df=df.dropna().reset_index().drop('index',axis=1)

In [6]:
df['Occupation']=[x.split(';')[0] for x in df.Occupation.values]


In [7]:
count=df.Occupation.value_counts()
count

Artist                18026
Politician             7282
Athlete                5656
Researcher             2719
Military personnel     2115
                      ...  
Hermit                    1
Seminarian                1
Grammarian                1
Orator                    1
Captain                   1
Name: Occupation, Length: 796, dtype: int64

In [8]:
lista=count.index[count.values>10]
lista

Index(['Artist', 'Politician', 'Athlete', 'Researcher', 'Military personnel',
       'Journalist', 'Businessperson', 'Religious figure', 'Teacher',
       'Engineer',
       ...
       'Philanthropist', 'Dentist', 'Impresario', 'Long-distance runner',
       'Mangaka', 'Bobsledder', 'Spree killer', 'Author', 'Stunt performer',
       'Political activist'],
      dtype='object', length=137)

In [9]:
df=df[df.Occupation.isin(lista)]
df

Unnamed: 0,Name,Gender,Country,Occupation,Birth year,Death year,Manner of death,Age of death
0,George Washington,Male,United States of America; Kingdom of Great Bri...,Politician,1732,1799.0,natural causes,67.0
1,Douglas Adams,Male,United Kingdom,Artist,1952,2001.0,natural causes,49.0
2,Abraham Lincoln,Male,United States of America,Politician,1809,1865.0,homicide,56.0
4,Claude Monet,Male,France,Artist,1840,1926.0,natural causes,86.0
5,Elvis Presley,Male,United States of America,Artist,1935,1977.0,natural causes,42.0
...,...,...,...,...,...,...,...,...
45822,Sahar Khodayari,Female,Iran,Athlete,1990,2019.0,suicide,29.0
45823,Shuping Wang,Female,United States of America; People's Republic of...,Researcher,1959,2019.0,natural causes,60.0
45824,Hevrin Khalaf,Female,Syria,Politician,1984,2019.0,homicide,35.0
45825,Sara Champion,Female,United Kingdom,Researcher,1946,2000.0,natural causes,54.0


In [10]:
df['Death year']=df['Death year'].astype(int)
df['Age of death']=df['Age of death'].astype(int)
df

Unnamed: 0,Name,Gender,Country,Occupation,Birth year,Death year,Manner of death,Age of death
0,George Washington,Male,United States of America; Kingdom of Great Bri...,Politician,1732,1799,natural causes,67
1,Douglas Adams,Male,United Kingdom,Artist,1952,2001,natural causes,49
2,Abraham Lincoln,Male,United States of America,Politician,1809,1865,homicide,56
4,Claude Monet,Male,France,Artist,1840,1926,natural causes,86
5,Elvis Presley,Male,United States of America,Artist,1935,1977,natural causes,42
...,...,...,...,...,...,...,...,...
45822,Sahar Khodayari,Female,Iran,Athlete,1990,2019,suicide,29
45823,Shuping Wang,Female,United States of America; People's Republic of...,Researcher,1959,2019,natural causes,60
45824,Hevrin Khalaf,Female,Syria,Politician,1984,2019,homicide,35
45825,Sara Champion,Female,United Kingdom,Researcher,1946,2000,natural causes,54


In [11]:

df['Country']=[x.split(';')[0]for x in df.Country.values]
df

Unnamed: 0,Name,Gender,Country,Occupation,Birth year,Death year,Manner of death,Age of death
0,George Washington,Male,United States of America,Politician,1732,1799,natural causes,67
1,Douglas Adams,Male,United Kingdom,Artist,1952,2001,natural causes,49
2,Abraham Lincoln,Male,United States of America,Politician,1809,1865,homicide,56
4,Claude Monet,Male,France,Artist,1840,1926,natural causes,86
5,Elvis Presley,Male,United States of America,Artist,1935,1977,natural causes,42
...,...,...,...,...,...,...,...,...
45822,Sahar Khodayari,Female,Iran,Athlete,1990,2019,suicide,29
45823,Shuping Wang,Female,United States of America,Researcher,1959,2019,natural causes,60
45824,Hevrin Khalaf,Female,Syria,Politician,1984,2019,homicide,35
45825,Sara Champion,Female,United Kingdom,Researcher,1946,2000,natural causes,54


In [12]:
X=df[['Gender','Country','Occupation','Birth year','Manner of death']]
y=df[['Age of death']]

In [13]:
X

Unnamed: 0,Gender,Country,Occupation,Birth year,Manner of death
0,Male,United States of America,Politician,1732,natural causes
1,Male,United Kingdom,Artist,1952,natural causes
2,Male,United States of America,Politician,1809,homicide
4,Male,France,Artist,1840,natural causes
5,Male,United States of America,Artist,1935,natural causes
...,...,...,...,...,...
45822,Female,Iran,Athlete,1990,suicide
45823,Female,United States of America,Researcher,1959,natural causes
45824,Female,Syria,Politician,1984,homicide
45825,Female,United Kingdom,Researcher,1946,natural causes


In [14]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [15]:
def data_preparatio(X):
    Gender_enc = OrdinalEncoder()
    Country_enc = OrdinalEncoder()
    Occupation_enc = OrdinalEncoder()
    death_enc = OrdinalEncoder()
    X['Gender'] = Gender_enc.fit_transform(X[['Gender']])
    X['Country'] =Country_enc.fit_transform(X[['Country']])
    X['Occupation'] = Occupation_enc.fit_transform(X[['Occupation']])
    X['Manner of death'] = death_enc.fit_transform(X[['Manner of death']])
    return X

In [16]:
X_train=data_preparatio(X_train)
X_test=data_preparatio(X_test)


### Regressione lineae sklear

In [17]:
linear_model= LinearRegression()
grid_linear = {
    'fit_intercept':[True,False]
}
grid_search_linear =GridSearchCV(estimator=linear_model,param_grid=grid_linear,cv=5,verbose=3,return_train_score=True)
grid_search_linear.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END fit_intercept=True;, score=(train=0.076, test=0.076) total time=   0.0s
[CV 2/5] END fit_intercept=True;, score=(train=0.076, test=0.076) total time=   0.0s
[CV 3/5] END fit_intercept=True;, score=(train=0.075, test=0.079) total time=   0.0s
[CV 4/5] END fit_intercept=True;, score=(train=0.074, test=0.085) total time=   0.0s
[CV 5/5] END fit_intercept=True;, score=(train=0.080, test=0.061) total time=   0.0s
[CV 1/5] END fit_intercept=False;, score=(train=0.062, test=0.059) total time=   0.0s
[CV 2/5] END fit_intercept=False;, score=(train=0.063, test=0.053) total time=   0.0s
[CV 3/5] END fit_intercept=False;, score=(train=0.059, test=0.069) total time=   0.0s
[CV 4/5] END fit_intercept=False;, score=(train=0.059, test=0.068) total time=   0.0s
[CV 5/5] END fit_intercept=False;, score=(train=0.063, test=0.054) total time=   0.0s


In [18]:
grid_search_linear.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_fit_intercept', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [19]:
pd.DataFrame.from_dict(grid_search_linear.cv_results_)[[
    'param_fit_intercept', 'std_test_score', 'mean_test_score','std_train_score', 'mean_train_score', 'rank_test_score']].sort_values(by='rank_test_score').head(15)

Unnamed: 0,param_fit_intercept,std_test_score,mean_test_score,std_train_score,mean_train_score,rank_test_score
0,True,0.007976,0.075302,0.001982,0.076003,1
1,False,0.006776,0.060626,0.001681,0.061232,2


In [20]:
grid_search_linear.best_params_

{'fit_intercept': True}

In [21]:
linear_model_grid= LinearRegression()
linear_model_grid.fit(X_train,y_train)
predict_linear =linear_model_grid.predict(X_test)

In [22]:
mse_linear=mean_squared_error(y_test,predict_linear)

np.sqrt(mse_linear)

18.428557613303724

In [39]:
Random_model= RandomForestRegressor()
grid_Random = {
    'n_estimators': [270],
    'max_depth': [11],
    'min_samples_leaf':[3],
    'max_leaf_nodes': [None]
}
grid_search_Random =GridSearchCV(estimator=Random_model,param_grid=grid_Random,cv=5,verbose=3,return_train_score=True,n_jobs=-1)
grid_search_Random.fit(X_train, y_train)

Fitting 5 folds for each of 21 candidates, totalling 105 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [37]:
grid_search_Random.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_leaf_nodes', 'param_min_samples_leaf', 'param_n_estimators', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [40]:
pd.DataFrame.from_dict(grid_search_Random.cv_results_)[[
    'param_max_leaf_nodes', 'std_test_score', 'mean_test_score','std_train_score', 'mean_train_score', 'rank_test_score']].sort_values(by='rank_test_score').head(15)

Unnamed: 0,param_max_leaf_nodes,std_test_score,mean_test_score,std_train_score,mean_train_score,rank_test_score
0,,0.007796,0.473938,0.001816,0.547803,1
20,49.0,0.009156,0.459029,0.002217,0.471025,2
19,48.0,0.008803,0.458707,0.002184,0.470501,3
18,47.0,0.009046,0.458231,0.002228,0.469689,4
17,46.0,0.008973,0.457866,0.002265,0.469121,5
16,45.0,0.008973,0.457496,0.002268,0.468515,6
15,44.0,0.008781,0.45718,0.00224,0.467828,7
14,43.0,0.008845,0.456716,0.00228,0.467065,8
13,42.0,0.008942,0.456412,0.002387,0.466589,9
12,41.0,0.008809,0.455846,0.002363,0.465754,10


In [41]:
grid_search_Random.best_params_

{'max_depth': 11,
 'max_leaf_nodes': None,
 'min_samples_leaf': 3,
 'n_estimators': 270}

In [42]:
Random_model_grid= RandomForestRegressor(n_estimators=270,max_leaf_nodes=None,min_samples_leaf=3,max_depth=11)
Random_model_grid.fit(X_train,y_train)
predict_Random =linear_model_grid.predict(X_test)

  Random_model_grid.fit(X_train,y_train)


In [44]:
mean_absolute_error(y_test,predict_Random)



15.319448209625524