In [1]:
import pandas as pd
import numpy as np

train_titanic = pd.read_csv('titanic_train.csv', index_col = 'PassengerId')
test_titanic = pd.read_csv('test-2.csv')

train_titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
train_titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [92]:
# pipelines return sparse matrix, with this function we reconver the dataframe
def dataframe_from_slice(X, ind, cols):
    df = pd.DataFrame( X, columns=list(cols), index=ind)
    return df

In [93]:
# pipeline that fills nan values and drops two useless columns

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

null_val = ['Age', 'Fare']
droppable = [ 'Ticket', 'Cabin']

pipeline_fillna = ColumnTransformer([
        ("imputer", SimpleImputer(strategy="median"), null_val),
        ('imputer_emb', SimpleImputer(strategy= 'most_frequent'), ['Embarked']),
        ('drop', 'drop', droppable),
    ], remainder='passthrough')  


train_titanic_fillna = pipeline_fillna.fit_transform(train_titanic)

train_titanic_fillna = dataframe_from_slice(train_titanic_fillna, 
                                                 train_titanic.index, list(train_titanic.columns.drop(droppable)))
train_titanic_fillna.info()  # GOOD!

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Survived  891 non-null    object
 1   Pclass    891 non-null    object
 2   Name      891 non-null    object
 3   Sex       891 non-null    object
 4   Age       891 non-null    object
 5   SibSp     891 non-null    object
 6   Parch     891 non-null    object
 7   Fare      891 non-null    object
 8   Embarked  891 non-null    object
dtypes: object(9)
memory usage: 69.6+ KB


In [96]:
# create a transformer that creates new attributes and bins age and fare

from sklearn.base import BaseEstimator, TransformerMixin

# column index
col_names = 'Pclass', 'Age', 'SibSp', 'Parch', 'Name'# , 'Fare'
 # get the column indices

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    # BaseEstimator gets the extra methods get_params() and set_params()
    # TransformerMixin gives the fit_transform() method for free
    
    def __init__(self, add_age_per_class=True): # no *args or **kargs
        self.add_age_per_class = add_age_per_class
        
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X):
        class_ix, age_ix, sibsp_ix, parch_ix,  name_ix = [train_titanic_fillna.columns.get_loc(c) for c in col_names]
        relatives = X[:, sibsp_ix] + X[:, parch_ix]
        # fare_per_person = X[:, fare_ix] / (relatives+1)
        title = pd.Series(X[:, name_ix]).str.extract(pat= ' ([A-Za-z]+)\.')
        
        
        ind = train_titanic.index
        
        if self.add_age_per_class:
            age_per_class = X[:, age_ix] * X[:, class_ix]
            Y =  np.c_[X, #fare_per_person,
                         relatives,  title, age_per_class]
            return dataframe_from_slice(Y, ind, ['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 
       'Fare',  'Embarked', 'relatives', 'title', 'age_per_class'])
                   
            # np.c_ translates slice objects to concatenation along the second axis.
        else:
            Y = np.c_[X, #fare_per_person, 
                         relatives, 
                         title]
            return dataframe_from_slice(Y, ind, ['Survived', 'Pclass', 'Name', 'Sex', 'Age', 
                                                 'SibSp', 'Parch',  'Fare',  'Embarked', 'relatives', 
                                                 'title'])
        


In [97]:
train_titanic_attr = attr_adder.transform(train_titanic_fillna.values)
train_titanic_attr.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,relatives,title,age_per_class
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,22.0,7.25,S,0,3,"Braund, Mr. Owen Harris",male,1,0,"Braund, Mr. Owen Harrismale",,21.75
2,38.0,71.2833,C,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,71.2833
3,26.0,7.925,S,1,3,"Heikkinen, Miss. Laina",female,0,0,"Heikkinen, Miss. Lainafemale",,23.775
4,35.0,53.1,S,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)fe...",,53.1
5,35.0,8.05,S,0,3,"Allen, Mr. William Henry",male,0,0,"Allen, Mr. William Henrymale",,24.15


In [87]:
# Now we add new attributes, bin age and fare and encode categorical attributes

from sklearn. preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer

attr = ['Pclass', 'Age', 
        'SibSp', 'Parch', #'Fare', 
        'Name']
cat_attr = ['Sex', 'Embarked']
droppable_2 = ['Name', 'SibSp', 'Parch',]


pipe = ColumnTransformer([           # (name, tranformer, list of columns that the trans should be applied to)
            ('fillna', pipeline_fillna, ['Age', 'Fare', 'Ticket', 'Cabin' , 'Embarked']),
           # ('attribs_adder', CombinedAttributesAdder(), attr),            # add new attr: 'age_per_class', 'n_relatives', 'title'
            ('bins', KBinsDiscretizer(encode = 'ordinal'), ['Fare']),                            # bin fare 
            ('cat', OneHotEncoder(sparse=False), cat_attr),                          # encode categorical ones
            ('drop', 'drop', droppable_2),
              ], remainder='passthrough' )


train_titanic_pipe = pipe.fit_transform(train_titanic)

train_titanic_pipe = dataframe_from_slice(train_titanic_pipe, train_titanic.index,
                                         np.arange(12))
train_titanic_pipe

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,22.0,7.25,S,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,3
2,38.0,71.2833,C,4.0,1.0,0.0,1.0,0.0,0.0,0.0,1,1
3,26.0,7.925,S,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1,3
4,35.0,53.1,S,4.0,1.0,0.0,0.0,0.0,1.0,0.0,1,1
5,35.0,8.05,S,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...
887,27.0,13.0,S,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0,2
888,19.0,30.0,S,3.0,1.0,0.0,0.0,0.0,1.0,0.0,1,1
889,28.0,23.45,S,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0,3
890,26.0,30.0,C,3.0,0.0,1.0,1.0,0.0,0.0,0.0,1,1


In [84]:
train_titanic.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

19

In [None]:
# let's see if it works: 


train_titanic_transformed 

train_titanic_tr = pd.DataFrame(
    train_titanic_transformed,
    columns=list(train_titanic.columns)+[#'age_per_class',
                                         #'fare_per_person', 'n_relatives', 
        'rel'],
    index=train_titanic.index)
train_titanic_tr.head()

In [None]:
def transformer(df, train= True):
    
    # fill nan:
    if train == True:
        global imputer_num 
        imputer_num = SimpleImputer(strategy = 'median')
        df[['Age', 'Fare']] = imputer_num.fit_transform(df[['Age', 'Fare']])
        
        global imputer_cat 
        imputer_cat = SimpleImputer(strategy = 'most_frequent')
        df[['Embarked']] = imputer_cat.fit_transform(df[['Embarked']])
    else:
        df[['Age', 'Fare']] = imputer_num.transform(df[['Age', 'Fare']])
        df[['Embarked']] = imputer_cat.transform(df[['Embarked']])
    
    # new attributes
    df["AgeBucket"] = df["Age"] // 15 * 15
    df['Relatives'] = df['SibSp']+df['Parch']
    df['Title']= df['Name'].str.extract(pat= ' ([A-Za-z]+)\.')
    df['Age_Class']= df['Age']* df['Pclass']
    df['Fare_Per_Person'] = df['Fare']/(df['Relatives']+1)
    
    # drop useless col
    df.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1, inplace= True)
    
    # encode categorical values
    df = pd.get_dummies(df,columns=['Title', 'Embarked', 'Sex'])
    
    # bin Fare
    if train == True:
        global fare_bin
        fare_bin = KBinsDiscretizer()
        df['fare_1','fare_2','fare_3','fare_4','fare_5'] = pd.DataFrame.sparse.from_spmatrix(KBinsDiscretizer().fit_transform(train_titanic[['Fare']]), 
                                  columns = ['fare_1','fare_2','fare_3','fare_4','fare_5'])
    else:
        df[['Fare']] = pd.DataFrame.sparse.from_spmatrix(KBinsDiscretizer().transform(train_titanic[['Fare']]), 
                                  columns = ['fare_1','fare_2','fare_3','fare_4','fare_5',])
    
    
    return df

In [None]:
X_train = transformer(train_titanic.copy())
y_train = X_train['Survived']
X_train.drop(['Survived'], axis=1, inplace= True)


X_train.info()

In [None]:
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_clf.fit(X_train, y_train) 

In [None]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators' : [20,30, 50], 'max_features': [3,5,8, 10]},    # first evaluate these 12 combinations
    #{'bootstrap': [False], 'n_estimators' : [3,10], 'max_features': [2,3,4]}    # and after try the 6 combinations with bootstrap False
]

forest = RandomForestClassifier()

grid_search = GridSearchCV(forest, param_grid, cv = 8)

grid_search.fit( X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
def display_scores(scores):
    print('Scores: ', scores)
    print('Mean: ', scores.mean())
    print('Scores std: ', scores.std())

In [None]:
scores = cross_val_score(grid_search.best_estimator_,  X_train, y_train, cv = 10) # recall cv means it takes 10 subsets

display_scores(scores)

In [None]:
fforest = grid_search.best_estimator_.fit( X_train, y_train)
feat = fforest.feature_importances_
feature_names = [f'feature {i}' for i in range(X_train.shape[1])]


sorted(zip(feat, X_train.columns), reverse = True)

In [None]:
less_impo = ['Title_Dr', 'Title_Major','Title_Rev','Title_Col','Title_Don','Title_Sir','Title_Capt','Title_Mlle',
'Title_Ms', 'Title_Mme','Title_Lady','Title_Jonkheer','Title_Countess']
X_train.drop(less_impo, axis=1, inplace=True)

In [None]:
y_test_real= transformer(test_titanic, train= False)