In [159]:
import pandas as pd
import numpy as np

train_titanic = pd.read_csv('titanic_train.csv', index_col = 'PassengerId')
test_titanic = pd.read_csv('test-2.csv')

train_titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [160]:
train_titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [82]:
# create a transformer that creates new attributes and bins age and fare

from sklearn.base import BaseEstimator, TransformerMixin

# column index
col_names = 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Name'
 # get the column indices

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    # BaseEstimator gets the extra methods get_params() and set_params()
    # TransformerMixin gives the fit_transform() method for free
    
    def __init__(self, add_age_per_class=True): # no *args or **kargs
        self.add_age_per_class = add_age_per_class
        
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X):
        class_ix, age_ix, sibsp_ix, parch_ix, fare_ix, name_ix = [train_titanic.columns.get_loc(c) for c in col_names]
        #relatives = X[:, sibsp_ix] + X[:, parch_ix]
        #fare_per_person = X[:, fare_ix] / (relatives+1)
        title = pd.Series(X[:, name_ix]).str.extract(pat= ' ([A-Za-z]+)\.')
        if self.add_age_per_class:
            #age_per_class = X[:, age_ix] * X[:, class_ix]
            return np.c_[X, #age_per_class, 
                         #fare_per_person,
                         #relatives, 
                         title]
            # np.c_ translates slice objects to concatenation along the second axis.
        else:
            return np.c_[X, #fare_per_person, relatives, 
                         title]

attr_adder = CombinedAttributesAdder()
train_titanic_attr = attr_adder.transform(train_titanic.values)

In [83]:
# train_titanic_attr is a NumPy array, we've lost the column names. To recover a DataFrame, run this:

train_titanic_attr = pd.DataFrame(
    train_titanic_attr,
    columns=list(train_titanic.columns)+[#'age_per_class',
                                         #'fare_per_person', 'n_relatives', 
        'title'],
    index=train_titanic.index)
train_titanic_attr.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S,Mr


In [88]:
from sklearn.compose import ColumnTransformer
from sklearn. preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer

attr = [#'Pclass', #'Age', 
        #'SibSp', 'Parch', 'Fare', 
        'Name']
null_val = ['Age', 'Fare']
cat_attr = ['Sex', 'Embarked']
droppable = ['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin']

pipe = ColumnTransformer([           # (name, tranformer, list of columns that the trans should be applied to)
            #('imputer', SimpleImputer(strategy = 'median'), null_val),    # fill null values with median
            #('imputer_emb', SimpleImputer(strategy= 'most_frequent'), ['Embarked']),
            ('attribs_adder', CombinedAttributesAdder(), attr),            # add new attr: 'age_per_class','fare_per_person', 'n_relatives', 'title'
            #('bins', KBinsDiscretizer(), ['Age', 'Fare']),                            # scale 
            #('cat', OneHotEncoder(), cat_attr),                          # encode categorical ones
            #('drop', 'drop', droppable),                                 # don't need explanation XD
        ], remainder='passthrough' )


train_titanic_transformed = pipe.fit_transform(train_titanic)



ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [80]:
# let's see if it works: 


train_titanic_transformed 

train_titanic_tr = pd.DataFrame(
    train_titanic_transformed,
    columns=list(train_titanic.columns)+[#'age_per_class',
                                         #'fare_per_person', 'n_relatives', 
        'rel'],
    index=train_titanic.index)
train_titanic_tr.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,rel
0,3,1,0,7.25,"Braund, Mr. Owen Harris",1,1,0,male,22,A/5 21171,,S
1,1,1,0,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2,1,female,38,PC 17599,C85,C
2,3,0,0,7.925,"Heikkinen, Miss. Laina",0,3,1,female,26,STON/O2. 3101282,,S
3,1,1,0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,4,1,female,35,113803,C123,S
4,3,0,0,8.05,"Allen, Mr. William Henry",0,5,0,male,35,373450,,S


In [213]:
def transformer(df, train= True):
    
    # fill nan:
    if train == True:
        global imputer_num 
        imputer_num = SimpleImputer(strategy = 'median')
        df[['Age', 'Fare']] = imputer_num.fit_transform(df[['Age', 'Fare']])
        
        global imputer_cat 
        imputer_cat = SimpleImputer(strategy = 'most_frequent')
        df[['Embarked']] = imputer_cat.fit_transform(df[['Embarked']])
    else:
        df[['Age', 'Fare']] = imputer_num.transform(df[['Age', 'Fare']])
        df[['Embarked']] = imputer_cat.transform(df[['Embarked']])
    
    # new attributes
    df["AgeBucket"] = df["Age"] // 15 * 15
    df['Relatives'] = df['SibSp']+df['Parch']
    df['Title']= df['Name'].str.extract(pat= ' ([A-Za-z]+)\.')
    df['Age_Class']= df['Age']* df['Pclass']
    df['Fare_Per_Person'] = df['Fare']/(df['Relatives']+1)
    
    # drop useless col
    df.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1, inplace= True)
    
    # encode categorical values
    df = pd.get_dummies(df,columns=['Title', 'Embarked', 'Sex'])
    
    # bin Fare
    if train == True:
        global fare_bin
        fare_bin = KBinsDiscretizer()
        df['fare_1','fare_2','fare_3','fare_4','fare_5'] = pd.DataFrame.sparse.from_spmatrix(KBinsDiscretizer().fit_transform(train_titanic[['Fare']]), 
                                  columns = ['fare_1','fare_2','fare_3','fare_4','fare_5'])
    else:
        df[['Fare']] = pd.DataFrame.sparse.from_spmatrix(KBinsDiscretizer().transform(train_titanic[['Fare']]), 
                                  columns = ['fare_1','fare_2','fare_3','fare_4','fare_5',])
    
    
    return df

In [214]:
X_train = transformer(train_titanic.copy())
y_train = X_train['Survived']
X_train.drop(['Survived'], axis=1, inplace= True)


X_train.info()

ValueError: Wrong number of items passed 5, placement implies 1

In [165]:
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_clf.fit(X_train, y_train) 

0.6981398252184768

In [166]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8081523096129837

In [189]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators' : [20,30, 50], 'max_features': [3,5,8, 10]},    # first evaluate these 12 combinations
    #{'bootstrap': [False], 'n_estimators' : [3,10], 'max_features': [2,3,4]}    # and after try the 6 combinations with bootstrap False
]

forest = RandomForestClassifier()

grid_search = GridSearchCV(forest, param_grid, cv = 8)

grid_search.fit( X_train, y_train)

GridSearchCV(cv=8, estimator=RandomForestClassifier(),
             param_grid=[{'max_features': [3, 5, 8, 10],
                          'n_estimators': [20, 30, 50]}])

In [190]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 50}

In [179]:
def display_scores(scores):
    print('Scores: ', scores)
    print('Mean: ', scores.mean())
    print('Scores std: ', scores.std())

In [191]:
scores = cross_val_score(grid_search.best_estimator_,  X_train, y_train, cv = 10) # recall cv means it takes 10 subsets

display_scores(scores)

Scores:  [0.75555556 0.85393258 0.75280899 0.87640449 0.87640449 0.86516854
 0.85393258 0.76404494 0.83146067 0.84269663]
Mean:  0.8272409488139825
Scores std:  0.04756766635385972


In [184]:
fforest = grid_search.best_estimator_.fit( X_train, y_train)
feat = fforest.feature_importances_
feature_names = [f'feature {i}' for i in range(X_train.shape[1])]


sorted(zip(feat, X_train.columns), reverse = True)

[(0.14116297883561413, 'Fare_Per_Person'),
 (0.1307440387358542, 'Age_Class'),
 (0.12750442509953802, 'Title_Mr'),
 (0.11461326357664008, 'Fare'),
 (0.1084961667591269, 'Age'),
 (0.10587828194128615, 'Sex_female'),
 (0.07252963591856768, 'Sex_male'),
 (0.05799812615450878, 'Relatives'),
 (0.04649944642027047, 'Pclass'),
 (0.02349948458131217, 'AgeBucket'),
 (0.01435475833706345, 'Title_Mrs'),
 (0.011897789574283075, 'Embarked_S'),
 (0.011008764257686437, 'Title_Master'),
 (0.009686376410403964, 'Title_Miss'),
 (0.008850994391261491, 'Embarked_C'),
 (0.00829382949412452, 'Embarked_Q'),
 (0.001901508647012435, 'Title_Dr'),
 (0.0018365224451934261, 'Title_Major'),
 (0.0017976296760664188, 'Title_Rev'),
 (0.0005787650070834961, 'Title_Col'),
 (0.0004348842725204064, 'Title_Don'),
 (0.00020176942589209256, 'Title_Sir'),
 (0.0001610555473982628, 'Title_Capt'),
 (6.909631164676382e-05, 'Title_Mlle'),
 (4.081796452215888e-07, 'Title_Ms'),
 (0.0, 'Title_Mme'),
 (0.0, 'Title_Lady'),
 (0.0, 'Titl

In [188]:
less_impo = ['Title_Dr', 'Title_Major','Title_Rev','Title_Col','Title_Don','Title_Sir','Title_Capt','Title_Mlle',
'Title_Ms', 'Title_Mme','Title_Lady','Title_Jonkheer','Title_Countess']
X_train.drop(less_impo, axis=1, inplace=True)

In [192]:
y_test_real= transformer(test_titanic, train= False)

UnboundLocalError: local variable 'imputer_num' referenced before assignment