In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [3]:
import xgboost as xgb

In [4]:
df = pd.read_csv('train.csv')

In [5]:
print(df.head)

<bound method NDFrame.head of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                     

In [6]:
df_encoded = pd.get_dummies(df, columns=['Pclass','Sex', 'SibSp', 'Parch', 'Embarked'], drop_first=False)

In [7]:
print(df_encoded.head)

<bound method NDFrame.head of      PassengerId  Survived                                               Name  \
0              1         0                            Braund, Mr. Owen Harris   
1              2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2              3         1                             Heikkinen, Miss. Laina   
3              4         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4              5         0                           Allen, Mr. William Henry   
..           ...       ...                                                ...   
886          887         0                              Montvila, Rev. Juozas   
887          888         1                       Graham, Miss. Margaret Edith   
888          889         0           Johnston, Miss. Catherine Helen "Carrie"   
889          890         1                              Behr, Mr. Karl Howell   
890          891         0                                Dooley, Mr. Patrick  

In [8]:
X = df_encoded[['Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4','SibSp_5','SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]

In [9]:
y = df_encoded['Survived']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [12]:
params = {
    'objective' : 'binary:logistic',
    'max_depth' : 4,
    'eta' : 0.1,
    'eval_metric' : 'logloss'
}

In [13]:
num_boost_round = 100
bst = xgb.train(params, dtrain, num_boost_round)

In [14]:
y_pred_prob = bst.predict(dtest)
y_pred = [1 if prob>0.5 else 0 for prob in y_pred_prob]

In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83       105
           1       0.76      0.73      0.74        74

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.79       179
weighted avg       0.79      0.79      0.79       179



In [16]:
#applying the xgboost model on train.csv file from Kaggle

In [17]:
df_test = pd.read_csv('test.csv')

In [18]:
df_test['Survived'] = 0

In [19]:
print(df_test.head)

<bound method NDFrame.head of      PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Cabin Embarked  \
0

In [20]:
#encoding the test dataset

In [21]:
df_test_encoded = pd.get_dummies(df_test, columns=['Pclass','Sex', 'SibSp', 'Parch', 'Embarked'], drop_first=False)

In [22]:
print(df_test_encoded.head)

<bound method NDFrame.head of      PassengerId                                          Name   Age  \
0            892                              Kelly, Mr. James  34.5   
1            893              Wilkes, Mrs. James (Ellen Needs)  47.0   
2            894                     Myles, Mr. Thomas Francis  62.0   
3            895                              Wirz, Mr. Albert  27.0   
4            896  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  22.0   
..           ...                                           ...   ...   
413         1305                            Spector, Mr. Woolf   NaN   
414         1306                  Oliva y Ocana, Dona. Fermina  39.0   
415         1307                  Saether, Mr. Simon Sivertsen  38.5   
416         1308                           Ware, Mr. Frederick   NaN   
417         1309                      Peter, Master. Michael J   NaN   

                 Ticket      Fare Cabin  Survived  Pclass_1  Pclass_2  \
0                330911    7.829

In [23]:
#due to the presence of Parch_9 variable, I need to retrain the model 'bst' with this variable

In [24]:
df_encoded['Parch_9']=0

In [25]:
print(df_encoded.head)

<bound method NDFrame.head of      PassengerId  Survived                                               Name  \
0              1         0                            Braund, Mr. Owen Harris   
1              2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2              3         1                             Heikkinen, Miss. Laina   
3              4         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4              5         0                           Allen, Mr. William Henry   
..           ...       ...                                                ...   
886          887         0                              Montvila, Rev. Juozas   
887          888         1                       Graham, Miss. Margaret Edith   
888          889         0           Johnston, Miss. Catherine Helen "Carrie"   
889          890         1                              Behr, Mr. Karl Howell   
890          891         0                                Dooley, Mr. Patrick  

In [26]:
X = df_encoded[['Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4','SibSp_5','SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Parch_9', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]

In [27]:
y = df_encoded['Survived']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [30]:
params = {
    'objective' : 'binary:logistic',
    'max_depth' : 4,
    'eta' : 0.1,
    'eval_metric' : 'logloss'
}

In [31]:
num_boost_round = 100
bst = xgb.train(params, dtrain, num_boost_round)

In [None]:
#applying this retrained model on the test dataset on kaggle

In [32]:
X = df_test_encoded[['Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4','SibSp_5','SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Parch_9', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]

In [33]:
X_test = X
y_test = df_test_encoded['Survived']

In [34]:
dtest = xgb.DMatrix(X_test, label=y_test)

In [35]:
y_pred_prob = bst.predict(dtest)
y_pred = [1 if prob>0.5 else 0 for prob in y_pred_prob]

In [36]:
df_col = pd.DataFrame(y_pred)

In [37]:
df_col.to_csv('submission.csv')