<div style="text-align:center">
    <img src="../../files/monolearn-logo.png" height="150px">
    <h1>ML course</h1>
    <h3>Session 12: Titanic project</h3>
    <h4><a href="https://amzenterprise.ir/">Ali Momenzadeh</a></h4>
</div>

#### Import libraries

In [None]:
# Uncomment and run once if XGBoost is not installed in the kernel's environment:
# %pip install xgboost

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
#When using the 'inline' backend, your matplotlib graphs will be included in your notebook, next to the code.

import seaborn as sns

import warnings
# Silence every warning for the rest of the notebook to keep cell output short.
# NOTE(review): this also hides scikit-learn convergence and deprecation
# warnings raised by the model cells further down - consider narrowing the filter.
warnings.filterwarnings('ignore')

#### Load and prepare data

In [None]:
# Paths are relative to the notebook's working directory.
TRAIN_CSV = "train.csv"
TEST_CSV = "test.csv"

# Labelled passengers (used for fitting) and unlabelled passengers (used for the final prediction).
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

#### EDA

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Column dtypes and non-null counts of the training data.
train.info()

In [None]:
# Column dtypes and non-null counts of the test data.
test.info()

In [None]:
# Summary statistics of the training data.
train.describe()

In [None]:
# Summary statistics of the test data.
test.describe()

#### Data Preprocessing

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
# Number of missing values in each test column.
test.isnull().sum()

In [None]:
# Fill null values.
# The fill values are learned from the training set only and then applied to
# both frames, so the test set is preprocessed exactly like the data the models
# are fitted on (imputing test with its own statistics would make the two
# frames inconsistent).
fill_values = {
    "Age": train["Age"].median(),
    "Fare": train["Fare"].median(),
    "Cabin": train["Cabin"].mode()[0],
    "Embarked": train["Embarked"].mode()[0],
}

# fillna with a dict only touches the listed columns that exist in each frame;
# returning new frames (no inplace=True) keeps the cell safe to re-run.
train = train.fillna(fill_values)
test = test.fillna(fill_values)

In [None]:
# Re-check the training non-null counts after filling the missing values.
train.info()

In [None]:
# Re-check the test non-null counts after filling the missing values.
test.info()

In [None]:
# Remaining missing values per training column after the fill step.
train.isnull().sum()

In [None]:
# Remaining missing values per test column after the fill step.
test.isnull().sum()

#### Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder once on the training data and reuse it for the test data so
# both frames share the same label -> integer mapping. transform() raises a
# ValueError on a label that was not seen during fit, instead of silently
# assigning it a different code.
sex_encoder = LabelEncoder().fit(train["Sex"])
train["Sex"] = sex_encoder.transform(train["Sex"])
test["Sex"] = sex_encoder.transform(test["Sex"])

# One-hot encode the categorical columns.
train = pd.get_dummies(train, columns=["Pclass", "Embarked"])
test = pd.get_dummies(test, columns=["Pclass", "Embarked"])

# Give the test frame exactly the training columns (minus the target), in the
# same order: a dummy column missing from test is added as all zeros, and a
# category that only occurs in test is dropped because no model was trained on it.
feature_columns = train.columns.drop("Survived")
test = test.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Family = siblings/spouses aboard + parents/children aboard, added to both frames.
for frame in (train, test):
    frame["Family"] = frame["SibSp"] + frame["Parch"]

Remove unnecessary columns

In [None]:
# Columns that are not used as model features (SibSp and Parch are replaced by Family).
unused_columns = ["PassengerId", "Name", "Ticket", "Cabin", "SibSp", "Parch"]

# Drop them from both frames in place.
for frame in (train, test):
    frame.drop(unused_columns, axis=1, inplace=True)

In [None]:
# Columns and dtypes of the training data that remain for modelling.
train.info()

In [None]:
# Columns and dtypes of the test data that remain for modelling.
test.info()

#### Storytelling - Visualization

In [None]:
# Pairwise Pearson correlation between all remaining training columns.
corr = train.corr(method="pearson")
corr.shape

In [None]:
# Appearance of the correlation heatmap: square annotated cells, one decimal place.
heatmap_options = {
    "cbar": True,
    "square": True,
    "fmt": ".1f",
    "annot": True,
    "annot_kws": {"size": 15},
    "cmap": "coolwarm",
}

plt.figure(figsize=(12, 12))
sns.heatmap(corr, **heatmap_options)

#### Train and test (Classification)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [None]:
# Separate the target column from the feature matrix.
target_column = "Survived"
y = train[target_column]
X = train.drop(columns=[target_column])

In [None]:
# Hold out 20% of the labelled data for validation; the fixed seed keeps the split reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

##### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# The features are not scaled, so the solver can need more than the default
# 100 iterations to converge; a larger budget avoids fitting a model that
# stopped early (the resulting warning would not be shown because warnings
# are silenced in this notebook).
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_validation)

# Hold-out accuracy, 10-fold cross-validation accuracy on the training split,
# and per-class precision/recall/F1 on the validation split.
print("Logistic Regression Accuracy: ", model_lr.score(X_validation, y_validation))
print("Logistic Regression Cross validation score: ", np.mean(cross_val_score(model_lr, X_train, y_train, cv=10)))
print("Logistic Regression Classification report:\n", metrics.classification_report(y_validation, y_pred_lr))

# Confusion matrix: rows are true classes, columns are predicted classes.
sns.heatmap(confusion_matrix(y_validation, y_pred_lr), annot=True, fmt="d")
plt.title('Logistic Regression Confusion matrix', y=1.05, size=15)

##### KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 7 closest training samples.
model_knn = KNeighborsClassifier(n_neighbors=7)
model_knn.fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_validation)

# The model is a classifier, so the labels say "KNN Classifier" (they used to
# say "Regressor"), and the cross-validation line is worded like the other models.
print("KNN Classifier Accuracy: ", model_knn.score(X_validation, y_validation))
print("KNN Classifier Cross validation score: ", np.mean(cross_val_score(model_knn, X_train, y_train, cv=10)))
print("KNN Classifier Classification report:\n", metrics.classification_report(y_validation, y_pred_knn))

# Confusion matrix: rows are true classes, columns are predicted classes.
sns.heatmap(confusion_matrix(y_validation, y_pred_knn), annot=True, fmt="d")
plt.title("KNN Classifier Confusion matrix", y=1.05, size=15)

##### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited tree. random_state is fixed because the splitter permutes the
# features at each split, so ties between equally good splits could otherwise
# be broken differently from run to run.
model_dt = DecisionTreeClassifier(max_depth=5, random_state=0)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_validation)

# Hold-out accuracy, 10-fold cross-validation accuracy on the training split,
# and per-class precision/recall/F1 on the validation split.
print("Decision tree Accuracy: ", model_dt.score(X_validation, y_validation))
print("Decision tree Cross validation score: ", np.mean(cross_val_score(model_dt, X_train, y_train, cv=10)))
print("Decision tree Classification report:\n", metrics.classification_report(y_validation, y_pred_dt))

# Confusion matrix: rows are true classes, columns are predicted classes.
sns.heatmap(confusion_matrix(y_validation, y_pred_dt), annot=True, fmt="d")
plt.title('Decision tree Confusion matrix', y=1.05, size=15)

##### RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(
    criterion="entropy",
    n_estimators=700,
    min_samples_split=10,
    min_samples_leaf=1,
    # "auto" was removed from scikit-learn (it raises on current releases);
    # for classifiers it meant sqrt(n_features), so "sqrt" keeps the same model.
    max_features="sqrt",
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_validation)

# Hold-out accuracy, out-of-bag accuracy (requested via oob_score=True),
# 10-fold cross-validation accuracy on the training split, and the
# per-class precision/recall/F1 on the validation split.
print("RandomForest Accuracy: ", model_rf.score(X_validation, y_validation))
print("RandomForest OOB score: ", model_rf.oob_score_)
print("RandomForest Cross validation score: ", np.mean(cross_val_score(model_rf, X_train, y_train, cv=10)))
print("RandomForest Classification report:\n", metrics.classification_report(y_validation, y_pred_rf))

# Confusion matrix: rows are true classes, columns are predicted classes.
sns.heatmap(confusion_matrix(y_validation, y_pred_rf), annot=True, fmt="d")
plt.title('RandomForest Confusion matrix', y=1.05, size=15)

##### SVM (SVC)

In [None]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier. gamma is only used by the rbf,
# poly and sigmoid kernels, so the previous gamma=0.1 had no effect here and
# is dropped to avoid suggesting that it was tuned.
model_svm = SVC(kernel="linear", C=1)
model_svm.fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_validation)

# Hold-out accuracy, 10-fold cross-validation accuracy on the training split,
# and per-class precision/recall/F1 on the validation split.
print("SVM Accuracy: ", model_svm.score(X_validation, y_validation))
print("SVM Cross validation score: ", np.mean(cross_val_score(model_svm, X_train, y_train, cv=10)))
print("SVM Classification report:\n", metrics.classification_report(y_validation, y_pred_svm))

# Confusion matrix: rows are true classes, columns are predicted classes.
sns.heatmap(confusion_matrix(y_validation, y_pred_svm), annot=True, fmt="d")
plt.title('SVM Confusion matrix', y=1.05, size=15)

##### XGBoost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with the hyperparameters chosen for this session.
xgb_params = {"gamma": 0.1, "max_depth": 10, "n_estimators": 700}
model_xgb = XGBClassifier(**xgb_params)
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_validation)

# Hold-out accuracy on the validation split.
xgb_accuracy = model_xgb.score(X_validation, y_validation)
print("XGBoost Accuracy: ", xgb_accuracy)

# Mean accuracy over 10 cross-validation folds of the training split.
xgb_cv_scores = cross_val_score(model_xgb, X_train, y_train, cv=10)
print("XGBoost Cross validation score: ", np.mean(xgb_cv_scores))

# Per-class precision, recall and F1 on the validation split.
xgb_report = metrics.classification_report(y_validation, y_pred_xgb)
print("XGBoost Classification report:\n", xgb_report)

# Confusion matrix: rows are true classes, columns are predicted classes.
xgb_confusion = confusion_matrix(y_validation, y_pred_xgb)
sns.heatmap(xgb_confusion, annot=True, fmt="d")
plt.title("XGBoost Confusion matrix", y=1.05, size=15)

> As we have seen, RandomForest and XGBoost have the best results. We can use either one to predict results based on the test data.

#### Save output

In [None]:
# Predict survival for the unlabelled passengers with the XGBoost model.
y_pred = model_xgb.predict(test)

# PassengerId was dropped from `test` before modelling, so read it back from
# the source file to label each prediction. The notebook only fills, encodes
# and drops columns of `test` (rows are never reordered or filtered), so the
# ids line up with the predictions by position.
# NOTE(review): the Kaggle Titanic submission format expects both
# PassengerId and Survived - confirm against the competition rules.
passenger_ids = pd.read_csv("test.csv", usecols=["PassengerId"])["PassengerId"]
assert len(passenger_ids) == len(y_pred), "test.csv rows do not match the predictions"

output = pd.DataFrame({"PassengerId": passenger_ids, "Survived": y_pred})
output.to_csv('output.csv', index=False)

# Keep the preview as the last expression so the notebook actually displays it.
output.head()