# 0 - Import operations

In [31]:
import pandas as pd

In [32]:
# Visualisation
pd.set_option('display.max_columns', None) # Show all columns
pd.set_option('display.max_rows', None) # Show all rows

# 1 - Data import 

In [33]:
dataTrain = pd.read_csv('../input/titanic/train.csv')
dataTest = pd.read_csv('../input/titanic/test.csv')

dataTrain.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [34]:
X_train = pd.concat([dataTrain, dataTest])

# 2 - Data transformation

## Null values

In [35]:
# Column with null values in X_train
X_train_sum = X_train.isnull().sum()
#Percentage of null values in X_train
X_train_percent = X_train.isnull().sum()/X_train.isnull().count()*100
# Datatype of X_train
X_train_dtypes = X_train.dtypes

#Concatenating X_train_sum, X_train_percent and X_train_dtypes
missing_data = pd.concat([X_train_sum, X_train_percent, X_train_dtypes], axis=1, keys=['Null values', '%', 'Types'])
missing_data

Unnamed: 0,Null values,%,Types
PassengerId,0,0.0,int64
Survived,418,31.932773,float64
Pclass,0,0.0,int64
Name,0,0.0,object
Sex,0,0.0,object
Age,263,20.091673,float64
SibSp,0,0.0,int64
Parch,0,0.0,int64
Ticket,0,0.0,object
Fare,1,0.076394,float64


## Unique values

In [36]:
# Unique values in X_train
X_train_unique = X_train.nunique()
X_train_unique

PassengerId    1309
Survived          2
Pclass            3
Name           1307
Sex               2
Age              98
SibSp             7
Parch             8
Ticket          929
Fare            281
Cabin           186
Embarked          3
dtype: int64

In [37]:
#Unique values in Survived, Pclass, Sex, SibSp, Parch, Embarked
print(X_train['Survived'].unique()) # Print unique values of Survived column
print(X_train['Pclass'].unique()) # Print unique values of Pclass column
print(X_train['Sex'].unique()) # Print unique values of Sex column
print(X_train['SibSp'].unique()) # Print unique values of SibSp column
print(X_train['Parch'].unique()) # Print unique values of Parch column
print(X_train['Embarked'].unique()) # Print unique values of Embarked column


[ 0.  1. nan]
[3 1 2]
['male' 'female']
[1 0 3 4 2 5 8]
[0 1 2 5 3 4 6 9]
['S' 'C' 'Q' nan]


## OHE for Sex Feature

In [38]:
#

In [39]:
X_train['Cabin'].str.replace(" ", "")
X_train = X_train.replace({'Cabin': r' \w*'}, {'Cabin': ''}, regex=True)
X_train.head(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [40]:
from sklearn.impute import SimpleImputer

NanImputer = SimpleImputer(strategy = 'constant', fill_value=1)

#Training Dataset
X_train = X_train.replace({'Cabin': r'A\w*'}, {'Cabin': 'A'}, regex=True)
X_train = X_train.replace({'Cabin': r'B\w*'}, {'Cabin': 'B'}, regex=True)
X_train = X_train.replace({'Cabin': r'C\w*'}, {'Cabin': 'C'}, regex=True)
X_train = X_train.replace({'Cabin': r'D\w*'}, {'Cabin': 'D'}, regex=True)
X_train = X_train.replace({'Cabin': r'E\w*'}, {'Cabin': 'E'}, regex=True)
X_train = X_train.replace({'Cabin': r'F\w*'}, {'Cabin': 'F'}, regex=True)
X_train = X_train.replace({'Cabin': r'G\w*'}, {'Cabin': 'G'}, regex=True)
X_train = X_train.replace({'Cabin': r'T\w*'}, {'Cabin': 'T'}, regex=True)

X_train[['Cabin']]=NanImputer.fit_transform(X_train[['Cabin']])
X_train.head(40)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,1,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,1,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,1,S
5,6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,1,Q
6,7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E,S
7,8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,1,S
8,9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,1,S
9,10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,1,C


In [5]:
#Training DATA
ohe_sex = pd.get_dummies(X_train['Sex'], prefix='ohe_sex')
X_train.drop('Sex', axis=1, inplace=True)
ohe_embarked = pd.get_dummies(X_train['Embarked'], prefix='ohe_embarked')
X_train.drop('Embarked', axis=1, inplace=True)
ohe_Cabin = pd.get_dummies(X_train['Cabin'], prefix='ohe_Cabin')
X_train.drop('Cabin', axis=1, inplace=True)
X_train = pd.concat([X_train, ohe_sex, ohe_embarked, ohe_Cabin], axis=1)


In [6]:
from sklearn.impute import SimpleImputer
mean_imputer= SimpleImputer(strategy='mean')

# Mean imputation for Age feature - DATA
X_train[['Age']]=mean_imputer.fit_transform(X_train[['Age']])

# Mean imputation for Fair feature - DATA
X_train[['Fare']]=mean_imputer.fit_transform(X_train[['Fare']])

X_train.head()
X_train.shape

(1309, 23)

In [7]:
#New Category for Nan values of Cabín feature

#X_train.drop('Cabin', axis=1, inplace=True)
#X_train.head()

print(" \nCount total NaN at each column in a DataFrame : \n\n",
      X_train.isnull().sum())
X_train.shape

 
Count total NaN at each column in a DataFrame : 

 PassengerId         0
Survived          418
Pclass              0
Name                0
Age                 0
SibSp               0
Parch               0
Ticket              0
Fare                0
ohe_sex_female      0
ohe_sex_male        0
ohe_embarked_C      0
ohe_embarked_Q      0
ohe_embarked_S      0
ohe_Cabin_1         0
ohe_Cabin_A         0
ohe_Cabin_B         0
ohe_Cabin_C         0
ohe_Cabin_D         0
ohe_Cabin_E         0
ohe_Cabin_F         0
ohe_Cabin_G         0
ohe_Cabin_T         0
dtype: int64


(1309, 23)

In [8]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,ohe_sex_female,...,ohe_embarked_S,ohe_Cabin_1,ohe_Cabin_A,ohe_Cabin_B,ohe_Cabin_C,ohe_Cabin_D,ohe_Cabin_E,ohe_Cabin_F,ohe_Cabin_G,ohe_Cabin_T
0,1,0.0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,0,...,1,1,0,0,0,0,0,0,0,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,...,0,0,0,0,1,0,0,0,0,0
2,3,1.0,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,1,...,1,1,0,0,0,0,0,0,0,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,1,...,1,0,0,0,1,0,0,0,0,0
4,5,0.0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,0,...,1,1,0,0,0,0,0,0,0,0


In [9]:
X_train.drop(['Name', 'Ticket'], axis=1, inplace=True)
X_train = X_train.astype({"Age":'int', "Fare":'int'}) 
display(X_train.dtypes)


PassengerId         int64
Survived          float64
Pclass              int64
Age                 int64
SibSp               int64
Parch               int64
Fare                int64
ohe_sex_female      uint8
ohe_sex_male        uint8
ohe_embarked_C      uint8
ohe_embarked_Q      uint8
ohe_embarked_S      uint8
ohe_Cabin_1         uint8
ohe_Cabin_A         uint8
ohe_Cabin_B         uint8
ohe_Cabin_C         uint8
ohe_Cabin_D         uint8
ohe_Cabin_E         uint8
ohe_Cabin_F         uint8
ohe_Cabin_G         uint8
ohe_Cabin_T         uint8
dtype: object

In [10]:
#Get the train and test data
X_trainF = X_train[X_train['PassengerId'].isin(dataTrain['PassengerId'])]
X_testF = X_train[X_train['PassengerId'].isin(dataTest['PassengerId'])]

In [11]:
#Dismiss same features for X_test
X_testF.drop(['Survived'], axis=1, inplace=True)
X_testF.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,ohe_sex_female,ohe_sex_male,ohe_embarked_C,ohe_embarked_Q,ohe_embarked_S,ohe_Cabin_1,ohe_Cabin_A,ohe_Cabin_B,ohe_Cabin_C,ohe_Cabin_D,ohe_Cabin_E,ohe_Cabin_F,ohe_Cabin_G,ohe_Cabin_T
0,892,3,34,0,0,7,0,1,0,1,0,1,0,0,0,0,0,0,0,0
1,893,3,47,1,0,7,1,0,0,0,1,1,0,0,0,0,0,0,0,0
2,894,2,62,0,0,9,0,1,0,1,0,1,0,0,0,0,0,0,0,0
3,895,3,27,0,0,8,0,1,0,0,1,1,0,0,0,0,0,0,0,0
4,896,3,22,1,1,12,1,0,0,0,1,1,0,0,0,0,0,0,0,0


In [12]:
#Dismiss same features for X_trainF
X_trainF.drop(['PassengerId'], axis=1, inplace=True)
X_trainF.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,ohe_sex_female,ohe_sex_male,ohe_embarked_C,ohe_embarked_Q,ohe_embarked_S,ohe_Cabin_1,ohe_Cabin_A,ohe_Cabin_B,ohe_Cabin_C,ohe_Cabin_D,ohe_Cabin_E,ohe_Cabin_F,ohe_Cabin_G,ohe_Cabin_T
0,0.0,3,22,1,0,7,0,1,0,0,1,1,0,0,0,0,0,0,0,0
1,1.0,1,38,1,0,71,1,0,1,0,0,0,0,0,1,0,0,0,0,0
2,1.0,3,26,0,0,7,1,0,0,0,1,1,0,0,0,0,0,0,0,0
3,1.0,1,35,1,0,53,1,0,0,0,1,0,0,0,1,0,0,0,0,0
4,0.0,3,35,0,0,8,0,1,0,0,1,1,0,0,0,0,0,0,0,0


In [13]:
y_trainF = X_trainF['Survived']
X_trainF.drop(['Survived'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [14]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot
from sklearn.metrics import accuracy_score

titanic_DMatrix = xgb.DMatrix(X_trainF, y_trainF)

params = {"objective":"reg:logistic",
         'max_depth':8}
xg_cl= xgb.train(params=params, dtrain=titanic_DMatrix, num_boost_round=2)


In [15]:
import xgboost as xgb
from hyperopt import hp, tpe, fmin
import numpy as np
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error
from matplotlib import pyplot
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_trainF, y_trainF , random_state=104,train_size=0.8, shuffle=True)

space={'max_depth':hp.quniform('max_depth', 1, 50, 1),
       'eta':hp.uniform ('eta', 0, 1),
       'subsample':hp.uniform ('subsample', 0, 1),
       'n_estimators':hp.quniform('n_estimators', 50, 500, 5)}


def objective(params):
    params={'max_depth':int(params['max_depth']),
            'eta':params['eta'],
            'subsample':params['subsample'],
            'n_estimators':int(params['n_estimators'])}
    
    model = xgb.XGBClassifier(objective='reg:logistic',
                              **params)

    evaluation = [(X_train, y_train), (X_test, y_test)]

    model.fit(X_train, y_train,
              eval_set = evaluation, eval_metric = 'error',
              early_stopping_rounds = 5, verbose = False)

    pred = model.predict(X_test)
    accuracy = 1-accuracy_score(y_test, pred>0.5)

    return accuracy

best = fmin(fn=objective, space=space, max_evals=20, rstate=np.random.default_rng(42), algo=tpe.suggest)
print(best)

 10%|█         | 2/20 [00:00<00:01, 15.43trial/s, best loss: 0.17318435754189943]











 30%|███       | 6/20 [00:00<00:00, 14.15trial/s, best loss: 0.16759776536312854]









 40%|████      | 8/20 [00:00<00:00, 13.42trial/s, best loss: 0.16759776536312854]









 60%|██████    | 12/20 [00:00<00:00, 12.19trial/s, best loss: 0.16201117318435754]









 70%|███████   | 14/20 [00:01<00:00, 11.86trial/s, best loss: 0.16201117318435754]









 90%|█████████ | 18/20 [00:01<00:00, 13.21trial/s, best loss: 0.16201117318435754]











100%|██████████| 20/20 [00:01<00:00, 13.19trial/s, best loss: 0.16201117318435754]
{'eta': 0.5734609694174653, 'max_depth': 4.0, 'n_estimators': 395.0, 'subsample': 0.7232984036743606}


In [16]:
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X_trainF, y_trainF , random_state=104,train_size=0.8, shuffle=True)
    
    evaluation = [(X_train, y_train), (X_test, y_test)]


    
    model = xgb.XGBClassifier(objective='reg:logistic', eta= 0.574, max_depth = 4, n_estimators= 395 , subsample= 0.723, early_stopping_rounds = 5)
    model.fit(X_train, y_train, eval_set = evaluation, eval_metric = 'error')
    
    pred = model.predict(X_test)
    accuracy = 1-accuracy_score(y_test, pred>0.5)
    print(accuracy)

[0]	validation_0-error:0.15449	validation_1-error:0.20112
[1]	validation_0-error:0.14185	validation_1-error:0.20670
[2]	validation_0-error:0.14045	validation_1-error:0.18994
[3]	validation_0-error:0.13202	validation_1-error:0.18994
[4]	validation_0-error:0.13062	validation_1-error:0.18436
[5]	validation_0-error:0.12921	validation_1-error:0.17877
[6]	validation_0-error:0.13202	validation_1-error:0.17318
[7]	validation_0-error:0.12921	validation_1-error:0.16201
[8]	validation_0-error:0.11938	validation_1-error:0.17877
[9]	validation_0-error:0.11938	validation_1-error:0.17318
[10]	validation_0-error:0.11376	validation_1-error:0.17318
[11]	validation_0-error:0.10815	validation_1-error:0.17318
0.16201117318435754




In [17]:
X_testF.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,ohe_sex_female,ohe_sex_male,ohe_embarked_C,ohe_embarked_Q,ohe_embarked_S,ohe_Cabin_1,ohe_Cabin_A,ohe_Cabin_B,ohe_Cabin_C,ohe_Cabin_D,ohe_Cabin_E,ohe_Cabin_F,ohe_Cabin_G,ohe_Cabin_T
0,892,3,34,0,0,7,0,1,0,1,0,1,0,0,0,0,0,0,0,0
1,893,3,47,1,0,7,1,0,0,0,1,1,0,0,0,0,0,0,0,0
2,894,2,62,0,0,9,0,1,0,1,0,1,0,0,0,0,0,0,0,0
3,895,3,27,0,0,8,0,1,0,0,1,1,0,0,0,0,0,0,0,0
4,896,3,22,1,1,12,1,0,0,0,1,1,0,0,0,0,0,0,0,0


In [18]:
X_testF.drop(['PassengerId'], axis=1, inplace=True)

In [19]:

y_pred = model.predict(X_testF)

In [20]:
output = pd.DataFrame({'PassengerID': dataTest['PassengerId'],
                       'Survived': y_pred})
output.to_csv('submission.csv', index=False) 