In [130]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [131]:
df_train = pd.read_csv('data/titanic/train.csv')
df_test = pd.read_csv('data/titanic/test.csv')
df_all = pd.concat([df_train, df_test], sort=False)
df_all.sample(15)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
341,1233,,3,"Lundstrom, Mr. Thure Edvin",male,32.0,0,0,350403,7.5792,,S
658,659,0.0,2,"Eitemiller, Mr. George Floyd",male,23.0,0,0,29751,13.0,,S
216,1108,,3,"Mahon, Miss. Bridget Delia",female,,0,0,330924,7.8792,,Q
186,187,1.0,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q
605,606,0.0,3,"Lindell, Mr. Edvard Bengtsson",male,36.0,1,0,349910,15.55,,S
280,1172,,3,"Oreskovic, Miss. Jelka",female,23.0,0,0,315085,8.6625,,S
188,189,0.0,3,"Bourke, Mr. John",male,40.0,1,1,364849,15.5,,Q
303,1195,,3,"Pokrnic, Mr. Tome",male,24.0,0,0,315092,8.6625,,S
214,1106,,3,"Andersson, Miss. Ida Augusta Margareta",female,38.0,4,2,347091,7.775,,S
815,816,0.0,1,"Fry, Mr. Richard",male,,0,0,112058,0.0,B102,S


* **PassengerId** is the unique id of the row and it doesn't have any effect on target
* **Survived is** the target variable we are trying to predict (0 or 1):
    * 1 = Survived
    * 0 = Not Survived
* **Pclass** (Passenger Class) is the socio-economic status of the passenger and it is a categorical ordinal feature which has 3 unique values (1, 2 or 3):
    * 1 = Upper Class
    * 2 = Middle Class
    * 3 = Lower Class
* **Name, Sex and Age** are self-explanatory
* **SibSp** is the total number of the passengers' siblings and spouse
* **Parch** is the total number of the passengers' parents and children
* **Ticket** is the ticket number of the passenger
* **Fare** is the passenger fare
* **Cabin** is the cabin number of the passenger
* **Embarked** is port of embarkation and it is a categorical feature which has 3 unique values (C, Q or S):
    * C = Cherbourg
    * Q = Queenstown
    * S = Southampton

In [132]:
# missing values by highest to lowest
df_all.isnull().sum().sort_values(ascending=False)

Cabin          1014
Survived        418
Age             263
Embarked          2
Fare              1
PassengerId       0
Pclass            0
Name              0
Sex               0
SibSp             0
Parch             0
Ticket            0
dtype: int64

In [133]:
def add_and_drop_features(df):
    df['Deck'] = df.Cabin.apply(lambda s: s[0] if pd.notnull(s) else 'M')
    df['Title'] = df.Name.str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    df['Surname'] = df.Name.str.split(', ', expand=True)[0]
    df['Family_size'] = df.SibSp + df.Parch + 1

    # family id grouped by surname and family size
    df['Family_id'] = df.groupby(['Surname','Family_size']).ngroup()
    # single to -1
    df.Family_id = df.Family_id.where(df.Family_size > 1, -1)
    df.drop(['Cabin','Name','Ticket'], axis=1, inplace=True)

add_and_drop_features(df_all)

In [134]:
df_all

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Surname,Family_size,Family_id
0,1,0.0,3,male,22.0,1,0,7.2500,S,M,Mr,Braund,2,105
1,2,1.0,1,female,38.0,1,0,71.2833,C,C,Mrs,Cumings,2,190
2,3,1.0,3,female,26.0,0,0,7.9250,S,M,Miss,Heikkinen,1,-1
3,4,1.0,1,female,35.0,1,0,53.1000,S,C,Mrs,Futrelle,2,281
4,5,0.0,3,male,35.0,0,0,8.0500,S,M,Mr,Allen,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,male,,0,0,8.0500,S,M,Mr,Spector,1,-1
414,1306,,1,female,39.0,0,0,108.9000,C,C,Dona,Oliva y Ocana,1,-1
415,1307,,3,male,38.5,0,0,7.2500,S,M,Mr,Saether,1,-1
416,1308,,3,male,,0,0,8.0500,S,M,Mr,Ware,1,-1


In [135]:
df_all.Deck.replace('T', 'M', inplace=True)

In [136]:
def survival_rates(df, col):
    return df.groupby(col).Survived.mean().sort_values(ascending=False)

In [137]:
survival_rates(df_all, 'Deck')

Deck
D    0.757576
E    0.750000
B    0.744681
F    0.615385
C    0.593220
G    0.500000
A    0.466667
M    0.299419
Name: Survived, dtype: float64

In [138]:
def group_decks_by_survival_rate(df):
    df.Deck.replace(['B','D','E'], 'BDE', inplace=True)
    df.Deck.replace(['F','C'],'FC', inplace=True)
    df.Deck.replace(['G','A'],'GA', inplace=True)

In [139]:
group_decks_by_survival_rate(df_all)
survival_rates(df_all, 'Deck')

Deck
BDE    0.750000
FC     0.597222
GA     0.473684
M      0.299419
Name: Survived, dtype: float64

In [140]:
df_all.Title.value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Mlle              2
Major             2
Ms                2
Lady              1
Sir               1
Mme               1
Don               1
Capt              1
the Countess      1
Jonkheer          1
Dona              1
Name: Title, dtype: int64

In [141]:
def group_titles(df):
    df.Title.replace(['Capt','Col','Major','Dr','Rev'], 'Officer', inplace=True)
    df.Title.replace(['Jonkheer','Don','Sir','the Countess','Dona','Lady'], 'Royalty', inplace=True)
    df.Title.replace(['Mme','Ms','Mrs'], 'Married woman', inplace=True)
    df.Title.replace(['Mlle','Miss'], 'Unmarried woman', inplace=True)
    df.Title.replace(['Mr'], 'Adult man', inplace=True)
    df.Title.replace(['Master'], 'Young man', inplace=True)

In [142]:
# drop capt
group_titles(df_all)

In [143]:
print(df_all.Title.value_counts(), '\n')
print(survival_rates(df_all, 'Title'))

Adult man          757
Unmarried woman    262
Married woman      200
Young man           61
Officer             23
Royalty              6
Name: Title, dtype: int64 

Title
Married woman      0.795276
Unmarried woman    0.701087
Royalty            0.600000
Young man          0.575000
Officer            0.277778
Adult man          0.156673
Name: Survived, dtype: float64


In [144]:
# fill missing age by randomforest
def fill_missing_age(df):
    df_age = df[['Age','Pclass','Title','Family_size','Family_id']]
    df_age = pd.get_dummies(df_age, columns=['Title','Pclass','Family_size','Family_id'])
    known_age = df_age[df_age.Age.notnull()].values
    unknown_age = df_age[df_age.Age.isnull()].values
    X = known_age[:, 1:]
    y = known_age[:, 0]

    rfr = RandomForestRegressor(random_state=0, n_estimators=100, n_jobs=-1)
    rfr.fit(X, y)
    predicted_age = rfr.predict(unknown_age[:, 1:])
    df.loc[(df.Age.isnull()), 'Age'] = predicted_age
    df.Age = df.Age.astype(int)

# fill missing fare by median fare of pclass
def fill_missing_fare(df):
    df.Fare.fillna(df.groupby('Pclass').Fare.transform('median'), inplace=True)

# fill missing embarked by mode
def fill_missing_embarked(df):
    df.Embarked.fillna(df.Embarked.mode()[0], inplace=True)

fill_missing_age(df_all)
fill_missing_fare(df_all)
fill_missing_embarked(df_all)

In [145]:
# nan in all columns
df_all.isnull().sum().sort_values(ascending=False)

Survived       418
PassengerId      0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Embarked         0
Deck             0
Title            0
Surname          0
Family_size      0
Family_id        0
dtype: int64

In [146]:
df_all

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Surname,Family_size,Family_id
0,1,0.0,3,male,22,1,0,7.2500,S,M,Adult man,Braund,2,105
1,2,1.0,1,female,38,1,0,71.2833,C,FC,Married woman,Cumings,2,190
2,3,1.0,3,female,26,0,0,7.9250,S,M,Unmarried woman,Heikkinen,1,-1
3,4,1.0,1,female,35,1,0,53.1000,S,FC,Married woman,Futrelle,2,281
4,5,0.0,3,male,35,0,0,8.0500,S,M,Adult man,Allen,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,male,28,0,0,8.0500,S,M,Adult man,Spector,1,-1
414,1306,,1,female,39,0,0,108.9000,C,FC,Royalty,Oliva y Ocana,1,-1
415,1307,,3,male,38,0,0,7.2500,S,M,Adult man,Saether,1,-1
416,1308,,3,male,28,0,0,8.0500,S,M,Adult man,Ware,1,-1


In [147]:
# convert categorical to numerical
def convert_to_numerical(df):
    df.Deck = pd.Categorical(df.Deck, categories=['M','FC','BDE','GA']).codes
    df.Sex = pd.Categorical(df.Sex, categories=['male', 'female']).codes
    df.Embarked = pd.Categorical(df.Embarked, categories=['S','C','Q']).codes
    df.Title = pd.Categorical(df.Title, categories=['Adult man', 'Officer', 'Royalty', 'Married woman', 'Unmarried woman', 'Young man']).codes

convert_to_numerical(df_all)

In [148]:
# cross validation using XGBoost
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [149]:
# train split by nonan in survived
df = df_all[df_all.Survived.notnull()]

# drop columns
df.drop(['PassengerId','Surname'], axis=1, inplace=True)

# split df train and test
df_train = df.sample(frac=0.8, random_state=0)
df_test = df.drop(df_train.index)

# split df_train to X_train and y_train
X_train = df_train.drop('Survived', axis=1)
y_train = df_train.Survived

# split df_test to X_test and y_test
X_test = df_test.drop('Survived', axis=1)
y_test = df_test.Survived

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['PassengerId','Surname'], axis=1, inplace=True)


In [155]:
# grid search
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', XGBClassifier(random_state=0, n_jobs=-1))
])

# parameters
params = {
    'clf__n_estimators': [100,300, 500],
    'clf__learning_rate': [0.01, 0.1],
    'clf__max_depth': [3, 5, 7],
    'clf__min_child_weight': [1, 3, 5],
    'clf__gamma': [0, 0.2, 0.4],
    'clf__subsample': [0.6, 0.8, 1],
    'clf__colsample_bytree': [0.6, 0.8, 1],
    'clf__reg_alpha': [0, 0.2,  0.4],
    'clf__reg_lambda': [0.6, 0.8, 1]
}

# grid search
grid = GridSearchCV(pipeline, params, cv=5, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 13122 candidates, totalling 65610 fits


KeyboardInterrupt: 

In [152]:
# test set
y_pred = pipeline.predict(X_test)
print('accuracy score: ', accuracy_score(y_test, y_pred))

accuracy score:  0.7696629213483146


In [153]:
# get predictions on df_test
df_test = df_all[df_all.Survived.isnull()]

# drop columns
df_test.drop(['PassengerId','Surname','Survived'], axis=1, inplace=True)

# get predictions
y_pred = pipeline.predict(df_test)

# create submission file
submission = pd.DataFrame({'PassengerId': df_all[df_all.Survived.isnull()].PassengerId, 'Survived': y_pred.astype(int)})
submission.to_csv('submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.drop(['PassengerId','Surname','Survived'], axis=1, inplace=True)
