In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import preprocessing
import seaborn as sns
%matplotlib inline

In [None]:
df = pd.read_csv(r"train.csv") #Loading the dataset

# **Exploratory Data Analysis**

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.isnull().sum()

# **Cleaning and Processing the dataset**

In [None]:
# Dropping the PassengerID, Name and Ticket columns as they do not contribute to the model
df = df.drop("PassengerId", axis = 1)
df = df.drop("Name", axis = 1)
df = df.drop("Ticket", axis = 1)

In [None]:
# Obtaining all the columns that have object datatype
categorical_cols = []
for column in df.columns:
    if df[column].dtype == 'object':
        categorical_cols.append(column)
        print(column.upper(),': ',df[column].nunique())
        print(df[column].value_counts().sort_values())
        print('\n')

In [None]:
df["Cabin"] = df["Cabin"].apply(lambda x: x[0] if type(x) == str else x) #Getting the Block in which the passenger stayed among the cabins

In [None]:
df.loc[:, categorical_cols]

In [None]:
# One hot encoding the categorical columns
onehot = preprocessing.OneHotEncoder(drop = "if_binary").fit(df.loc[:, categorical_cols])
onehotdf = np.array(onehot.transform(df.loc[:, categorical_cols]).toarray())
encoded_df = pd.DataFrame(onehotdf, columns=onehot.get_feature_names_out(categorical_cols))
df = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

In [None]:
# Inserting the mean value of ages in the NaN values of Age
from sklearn.impute import SimpleImputer
imputer_numerical = SimpleImputer(strategy='mean')
df[['Age']] = imputer_numerical.fit_transform(df[['Age']])

In [None]:
df

In [None]:
def plot_corr(df, size=11):
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)

In [None]:
plot_corr(df)

Very little correlation between features can be observed.

In [None]:
sns.pairplot(df,diag_kind='kde')

<seaborn.axisgrid.PairGrid at 0x7a14b26e0410>

In [None]:
survived_samples = len(df[df['Survived'] == 1])
died_samples = len(df[df['Survived'] == 0])
print(f"Survived samples: {survived_samples/(survived_samples+died_samples) * 100}")
print(f"Died samples: {died_samples/(survived_samples+died_samples) * 100}")

# **Training the model**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = df.drop("Survived", axis = 1)
Y = df["Survived"]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

In [None]:
x_train.head()

In [None]:
y_train.head()

In [None]:
print("Original Survived Values    : {0} ({1:0.2f}%)".format(len(df.loc[df['Survived'] == 1]), ((len(df.loc[df['Survived'] == 1])/len(df.index)) * 100)))
print("Original Died Values   : {0} ({1:0.2f}%)".format(len(df.loc[df['Survived'] == 0]), ((len(df.loc[df['Survived'] == 0])/len(df.index)) * 100)))
print("")
print("Training Survived Values    : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 1]), (len(y_train[y_train[:] == 1])/len(y_train)) * 100))
print("Training Died Values   : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 0]), (len(y_train[y_train[:] == 0])/len(y_train)) * 100))
print("")
print("Test Survived Values        : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 1]), (len(y_test[y_test[:] == 1])/len(y_test)) * 100))
print("Test Died Values       : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 0]), (len(y_test[y_test[:] == 0])/len(y_test)) * 100))
print("")

1. Logistic Regression

In [None]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver="liblinear")
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

In [None]:
model_score = model.score(x_test, y_test)
print(model_score)

In [None]:
f1_score_model = metrics.f1_score(y_test, y_predict)
print(f1_score_model)

The model has 77.2 % accuracy. And an F1 score of 0.73

In [None]:
cm=metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])

df_cm = pd.DataFrame(cm, index = [i for i in ["Real 1","Real 0"]],
                  columns = [i for i in ["Predicted 1","Predicted 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True)

2. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()
parameters = {"n_estimators":[100, 150, 200], "max_depth": [2, 3, 4, 5, 6, 7]}
clf = GridSearchCV(rf, parameters)
clf.fit(x_train, y_train)


In [None]:
print("Best score obtained :", clf.best_score_)
print("Best set of parameters obtained :",clf.best_params_)

In [None]:
cv_results = pd.DataFrame(clf.cv_results_)
cv_results.sort_values("mean_test_score", ascending=False)

We can see that for estimators with n_estimators as 200 and max depth as 6 have their performance only slightly better than estimators with n_estimators as 150 and max_depth as 6.

3. ADA Boost

In [321]:
from sklearn.ensemble import AdaBoostClassifier
ADA = AdaBoostClassifier()
parameters = {"n_estimators":[50, 100, 200], "learning_rate": np.logspace(-1, 1, 5)}
clf = GridSearchCV(ADA, parameters)
clf.fit(x_train, y_train)

In [322]:
print("Best score obtained :", clf.best_score_)
print("Best set of parameters obtained :",clf.best_params_)

Best score obtained : 0.8154451612903225
Best set of parameters obtained : {'learning_rate': np.float64(1.0), 'n_estimators': 200}


4. Linear Discriminant Analysis

In [323]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
pipe = Pipeline([('lda', LinearDiscriminantAnalysis()), ("Logistic Regression", LogisticRegression())])
parameters = {"lda__n_components":[1, 2, 3, 4]}
clf = GridSearchCV(pipe, parameters)
clf.fit(x_train, y_train)


In [324]:
print("Best score obtained :", clf.best_score_)
print("Best set of parameters obtained :",clf.best_params_)

Best score obtained : 0.8138193548387097
Best set of parameters obtained : {'lda__n_components': 1}


In [325]:
pipe2 = Pipeline([('lda', LinearDiscriminantAnalysis()), ("rf", RandomForestClassifier())])
parameters = {"lda__n_components":[1, 2, 3, 4], "rf__n_estimators":[100, 150, 200], "rf__max_depth": [2, 3, 4, 5, 6, 7]}
clf = GridSearchCV(pipe2, parameters)
clf.fit(x_train, y_train)


In [326]:
print("Best score obtained :", clf.best_score_)
print("Best set of parameters obtained :",clf.best_params_)

Best score obtained : 0.8186193548387097
Best set of parameters obtained : {'lda__n_components': 1, 'rf__max_depth': 2, 'rf__n_estimators': 100}


In [327]:
pipe3 = Pipeline([('lda', LinearDiscriminantAnalysis()), ("adb", AdaBoostClassifier())])
parameters = {"lda__n_components":[1, 2, 3, 4], "adb__n_estimators":[50, 100, 200], "adb__learning_rate": np.logspace(-1, 1, 5)}
clf = GridSearchCV(pipe3, parameters)
clf.fit(x_train, y_train)

In [328]:
print("Best score obtained :", clf.best_score_)
print("Best set of parameters obtained :",clf.best_params_)

Best score obtained : 0.8250322580645161
Best set of parameters obtained : {'adb__learning_rate': np.float64(3.1622776601683795), 'adb__n_estimators': 50, 'lda__n_components': 1}


The best model of the them can be seen as the Random Forest model with 150 estimators and max depth as 6

In [329]:
model = RandomForestClassifier(n_estimators=150, max_depth=6)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
model.score(x_test, y_test)
f1_score_model = metrics.f1_score(y_test, y_predict)
print(f1_score_model)

0.736318407960199


# **Prediction of the model on the given test data**

In [330]:
X_predict = pd.read_csv(r"test.csv")

In [331]:
X_predict.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [332]:
X_predict.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,86
SibSp,0
Parch,0
Ticket,0
Fare,1
Cabin,327


In [333]:
imputer_numerical = SimpleImputer(strategy='mean')
imputer_categorical = SimpleImputer(strategy='most_frequent')

X_predict[['Cabin']] = imputer_categorical.fit_transform(X_predict[['Cabin']])
X_predict[['Age']] = imputer_numerical.fit_transform(X_predict[['Age']])
X_predict[['Fare']] = imputer_numerical.fit_transform(X_predict[['Fare']])


In [334]:
X_predict["Cabin"] = X_predict["Cabin"].apply(lambda x: x[0] if type(x) == str else x)

In [335]:
onehotdf = np.array(onehot.transform(X_predict.loc[:, categorical_cols]).toarray())
encoded_df = pd.DataFrame(onehotdf, columns=onehot.get_feature_names_out(categorical_cols))
X_predict = pd.concat([X_predict.drop(columns=categorical_cols), encoded_df], axis=1)

In [336]:
X_predict = X_predict.drop("PassengerId", axis = 1)
X_predict = X_predict.drop("Name", axis = 1)
X_predict = X_predict.drop("Ticket", axis = 1)

In [337]:
X_predict

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_nan,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
0,3,34.50000,0,0,7.8292,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,3,47.00000,1,0,7.0000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2,62.00000,0,0,9.6875,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3,27.00000,0,0,8.6625,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3,22.00000,1,1,12.2875,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,30.27259,0,0,8.0500,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
414,1,39.00000,0,0,108.9000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
415,3,38.50000,0,0,7.2500,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
416,3,30.27259,0,0,8.0500,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [338]:
model_prediction = model.predict(X_predict)

In [339]:
model_prediction

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [340]:
pd.DataFrame(model_prediction, ).to_csv('model_predictions.csv', index=False)