# Importing Tools

In [51]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

#Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#Models
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Data Preprocessing

In [31]:
train= pd.read_csv("/kaggle/input/titanic/train.csv")
test= pd.read_csv("/kaggle/input/titanic/test.csv")

# Quick dataset information 🤓

In [32]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [36]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [35]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [34]:
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


# 🧹 Data Processing
1. Deal with Null values
2. Change categorical values using one hot encoder
3. Feature Scaling


In [37]:
#Dropping Passenger Id, ticket, and name
train.drop(['PassengerId', 'Ticket', 'Name', 'Cabin'], axis=1, inplace=True)
dfps=test.PassengerId.copy()
test.drop(['PassengerId', 'Ticket', 'Name', 'Cabin'], axis=1, inplace=True)

In [38]:
# Taking care of missing data ('Age')
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(train.iloc[: , 3:7])
train.iloc[: , 3:7] = imputer.transform(train.iloc[: , 3:7])

# Taking care of missing data ('Embarked'), most frequent value
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(train.iloc[: , 6:])
train.iloc[: , 6:] = imputer.transform(train.iloc[: , 6:])

In [39]:
# Taking care of missing data ('Age') & ('Fare') Test
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(test.iloc[: , 2:6])
test.iloc[: , 2:6] = imputer.transform(test.iloc[: , 2:6])


In [43]:
categ = ['Embarked','Sex']

# Encode Categorical Columns
le = LabelEncoder()
test[categ] = test[categ].apply(le.fit_transform)
train[categ] = train[categ].apply(le.fit_transform)

In [44]:
train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.000000,1.0,0.0,7.2500,2
1,1,1,0,38.000000,1.0,0.0,71.2833,0
2,1,3,0,26.000000,0.0,0.0,7.9250,2
3,1,1,0,35.000000,1.0,0.0,53.1000,2
4,0,3,1,35.000000,0.0,0.0,8.0500,2
...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0.0,0.0,13.0000,2
887,1,1,0,19.000000,0.0,0.0,30.0000,2
888,0,3,0,29.699118,1.0,2.0,23.4500,2
889,1,1,1,26.000000,0.0,0.0,30.0000,0


# Training Model

In [52]:
models = {
    LogisticRegression(): "        Logistic Regression",
    XGBClassifier():      "        XGBClassifier",
    SVC():                "        Support Vector Machine",
    MLPClassifier():      "        Neural Network",
    
}

for model in models.keys():
    model.fit(X_train, y_train)

In [53]:

y = train['Survived'].copy()
X = train.drop('Survived', axis=1).copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=143)


In [54]:
# Accuracy Score
for model, name in models.items():
    print(name + ": {:.2f}%".format(model.score(X_test, y_test)*100))

        Logistic Regression: 77.99%
        XGBClassifier: 80.97%
        Support Vector Machine: 70.52%
        Neural Network: 79.48%


## Adjusting XGBoost Parameters

In [62]:
## Hyper Parameter Optimization

params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
    
}

In [67]:
## Hyperparameter optimization using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost

In [68]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [83]:
classifier=xgboost.XGBClassifier()


In [84]:
random_search=RandomizedSearchCV(classifier,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)


In [85]:
from datetime import datetime
# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X,y)
timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 5 candidates, totalling 25 fits

 Time taken: 0 hours 0 minutes and 5.21 seconds.


In [86]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.1, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=3,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [87]:
random_search.best_params_

{'min_child_weight': 3,
 'max_depth': 4,
 'learning_rate': 0.05,
 'gamma': 0.1,
 'colsample_bytree': 0.5}

In [89]:
classifier=xgboost.XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.1, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=3,
              monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1,)

In [90]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(classifier,X,y,cv=10)

In [91]:
score

array([0.76666667, 0.78651685, 0.76404494, 0.86516854, 0.86516854,
       0.82022472, 0.83146067, 0.76404494, 0.85393258, 0.82022472])

In [93]:
score.mean()

0.81374531835206

In [96]:
classifier.fit(X,y)
print("Optimized XGBoost: {:.2f}%".format(classifier.score(X_test, y_test)*100))
y_prd_xgb= classifier.predict(test)

Optimized XGBoost: 84.33%


In [97]:
output = pd.DataFrame({'PassengerId': dfps, 'Survived': y_prd_xgb})
output.to_csv('submission.csv', index=False)
print("Your submission was successful!")

Your submission was successful!


In [98]:
print(output)
classification_report()

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]


TypeError: classification_report() missing 2 required positional arguments: 'y_true' and 'y_pred'