# 🚢 **Titanic prediction project** 
---------------------------------------

# 📚 Import Libraries

In [98]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 📂 Initial settup

In [3]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# 🔍 Data Exploration

### Are there null values ?

In [18]:
# Lets look at null values
null = train.isnull().sum().reset_index()
null['Percentage'] = ( null[0] / len(null)).astype(float) 
null.columns = ['Index','Values','Percentage']
null

Unnamed: 0,Index,Values,Percentage
0,PassengerId,0,0.0
1,Survived,0,0.0
2,Pclass,0,0.0
3,Name,0,0.0
4,Sex,0,0.0
5,Age,177,14.75
6,SibSp,0,0.0
7,Parch,0,0.0
8,Ticket,0,0.0
9,Fare,0,0.0


### Looking at correlations 

In [30]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [27]:
train[['SibSp','Parch', 'Fare']].corr()

Unnamed: 0,SibSp,Parch,Fare
SibSp,1.0,0.414838,0.159651
Parch,0.414838,1.0,0.216225
Fare,0.159651,0.216225,1.0


In [46]:
corr1 = pd.get_dummies(train[['Pclass','Sex','Age','SibSp','Parch','Cabin_str']])
corr1.corr()

Unnamed: 0,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Cabin_str_A,Cabin_str_B,Cabin_str_C,Cabin_str_D,Cabin_str_E,Cabin_str_F,Cabin_str_G,Cabin_str_T
Pclass,1.0,-0.369226,0.083081,0.018443,-0.1319,0.1319,-0.204934,-0.369572,-0.417048,-0.27869,-0.230091,0.011063,0.055561,-0.052496
Age,-0.369226,1.0,-0.308247,-0.189119,-0.093254,0.093254,0.136309,0.093914,0.122041,0.136975,0.12144,-0.08397,-0.077296,0.039474
SibSp,0.083081,-0.308247,1.0,0.414838,0.114631,-0.114631,-0.046266,-0.034538,0.029251,-0.017575,-0.036865,0.001706,-0.001402,-0.015907
Parch,0.018443,-0.189119,0.414838,1.0,0.245489,-0.245489,-0.040325,0.056498,0.030736,-0.019125,-0.016554,0.023694,0.072388,-0.015878
Sex_female,-0.1319,-0.093254,0.114631,0.245489,1.0,-1.0,-0.078271,0.109689,0.058649,0.079248,0.047003,0.008202,0.091031,-0.024728
Sex_male,0.1319,0.093254,-0.114631,-0.245489,-1.0,1.0,0.078271,-0.109689,-0.058649,-0.079248,-0.047003,-0.008202,-0.091031,0.024728
Cabin_str_A,-0.204934,0.136309,-0.046266,-0.040325,-0.078271,0.078271,1.0,-0.03088,-0.034846,-0.025663,-0.025256,-0.015923,-0.008787,-0.004386
Cabin_str_B,-0.369572,0.093914,-0.034538,0.056498,0.109689,-0.109689,-0.03088,1.0,-0.062841,-0.04628,-0.045547,-0.028715,-0.015847,-0.00791
Cabin_str_C,-0.417048,0.122041,0.029251,0.030736,0.058649,-0.058649,-0.034846,-0.062841,1.0,-0.052225,-0.051398,-0.032403,-0.017883,-0.008926
Cabin_str_D,-0.27869,0.136975,-0.017575,-0.019125,0.079248,-0.079248,-0.025663,-0.04628,-0.052225,1.0,-0.037852,-0.023864,-0.01317,-0.006574


In [43]:
train.groupby('Cabin_str')['Pclass'].value_counts()

Cabin_str  Pclass
A          1         15
B          1         47
C          1         59
D          1         29
           2          4
E          1         25
           2          4
           3          3
F          2          8
           3          5
G          3          4
T          1          1
Name: count, dtype: int64

# Fixing null values

In [49]:
train['Cabin_str'].value_counts()

Cabin_str
C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: count, dtype: int64

# XGBoost as it is

In [50]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_num,Cabin_str
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,85.0,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,123.0,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,,


In [51]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [57]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  Cabin_num    204 non-null    object 
 13  Cabin_str    204 non-null    object 
dtypes: float64(2), int64(5), object(7)
memory usage: 97.6+ KB


In [66]:
continuous = train.select_dtypes(include='number')
continuous = continuous[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]

In [68]:
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
import matplotlib.pyplot as plt
scaler = StandardScaler().fit(continuous)
continuous = scaler.transform(continuous)

In [70]:
# numpy preprocessed continuous features
continuous

array([[ 0.82737724, -0.53037664,  0.43279337, -0.47367361, -0.50244517],
       [-1.56610693,  0.57183099,  0.43279337, -0.47367361,  0.78684529],
       [ 0.82737724, -0.25482473, -0.4745452 , -0.47367361, -0.48885426],
       ...,
       [ 0.82737724,         nan,  0.43279337,  2.00893337, -0.17626324],
       [-1.56610693, -0.25482473, -0.4745452 , -0.47367361, -0.04438104],
       [ 0.82737724,  0.15850313, -0.4745452 , -0.47367361, -0.49237783]])

In [77]:
categorical = train.select_dtypes(include='object')
categorical = pd.get_dummies(categorical[['Sex','Embarked']]).astype(int)
categorical = categorical.to_numpy()
categorical

array([[0, 1, 0, 0, 1],
       [1, 0, 1, 0, 0],
       [1, 0, 0, 0, 1],
       ...,
       [1, 0, 0, 0, 1],
       [0, 1, 1, 0, 0],
       [0, 1, 0, 1, 0]])

In [79]:
import numpy as np

In [80]:
x_transformed = np.hstack((categorical,continuous))

In [82]:
pd.DataFrame(x_transformed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,1.0,0.0,0.0,1.0,0.827377,-0.530377,0.432793,-0.473674,-0.502445
1,1.0,0.0,1.0,0.0,0.0,-1.566107,0.571831,0.432793,-0.473674,0.786845
2,1.0,0.0,0.0,0.0,1.0,0.827377,-0.254825,-0.474545,-0.473674,-0.488854
3,1.0,0.0,0.0,0.0,1.0,-1.566107,0.365167,0.432793,-0.473674,0.420730
4,0.0,1.0,0.0,0.0,1.0,0.827377,0.365167,-0.474545,-0.473674,-0.486337
...,...,...,...,...,...,...,...,...,...,...
886,0.0,1.0,0.0,0.0,1.0,-0.369365,-0.185937,-0.474545,-0.473674,-0.386671
887,1.0,0.0,0.0,0.0,1.0,-1.566107,-0.737041,-0.474545,-0.473674,-0.044381
888,1.0,0.0,0.0,0.0,1.0,0.827377,,0.432793,2.008933,-0.176263
889,0.0,1.0,1.0,0.0,0.0,-1.566107,-0.254825,-0.474545,-0.473674,-0.044381


In [85]:
y = train['Survived'].to_numpy()

In [90]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_transformed,y, test_size=0.2, random_state=42)

# 🔍 Lets do a gridsearch 

In [99]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Initialize the XGBoost classifier
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.3],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'min_child_weight': [1],
    'gamma': [0],
    'reg_alpha': [0],
    'reg_lambda': [1]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_clf, 
    param_distributions=param_grid, 
    n_iter=10,  # Reduced number of iterations
    cv=2,  # 2-fold cross-validation
    scoring='accuracy', 
    verbose=1,
    n_jobs=-1  # Use all available cores
)

# Perform random search
random_search.fit(x_train, y_train)

# Get the best parameters and the best model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

print("Best Parameters:", best_params)

# Predict and evaluate with the best model
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Best Model: {accuracy:.2f}")




Fitting 2 folds for each of 8 candidates, totalling 16 fits
Best Parameters: {'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 50, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
Accuracy of Best Model: 0.83


Parameters: { "use_label_encoder" } are not used.



# 🎰 XGBoost classifier

In [103]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

model = XGBClassifier(colsample_bytree=0.8, n_estimators=50, learning_rate=0.1, max_depth=3,
                     min_child_weight=1, subsample=0.8,reg_alpha=0,gamma=0,reg_lambda=1 ,random_state=42)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.82
