<a href="https://www.kaggle.com/code/georgevelkov/logistic-regression-titanic?scriptVersionId=114014447" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt

Note: this notebook largely follows the AICVS tutorial for data cleaning and preparation, available at: https://aicvscummins.weebly.com/home/tutorial-to-approach-the-titanic-dataset

### Load Data

In [None]:
# Load the Kaggle Titanic CSV files (train first, then test) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [None]:
# keep both dataframes together in one list and display it as the cell output
# (`combine` is not referenced again in the cells visible here)
combine = [train_df, test_df]
combine

In [None]:
# display the training dataframe as the cell output
train_df

In [None]:
# list the column names of the training dataframe
print(train_df.columns)

In [None]:
# show the datatype of every column in the training dataframe
print(train_df.dtypes)

In [None]:
# print summary statistics of the training dataframe
# (describe() is called with its default column selection)
print(train_df.describe())

In [None]:
# print summary statistics restricted to object-dtype columns (include=['O'])
print(train_df.describe(include=['O']))

In [None]:
# Show the number of missing values for every column of the training set.
# The (label, column) pairs are listed explicitly so the report keeps a fixed
# order; each line is printed as "<label>: " followed by the NaN count.
missing_report = [
    ("AGE", "Age"),
    ("CABIN", "Cabin"),
    ("PASSENGER_ID", "PassengerId"),
    ("PARCH", "Parch"),
    ("SEX", "Sex"),
    ("FARE", "Fare"),
    ("SURVIVED", "Survived"),
    ("PCLASS", "Pclass"),
    ("SIBSP", "SibSp"),  # label previously misspelled as "SIBSB"
    ("Ticket", "Ticket"),
    ("EMBARKED", "Embarked"),
    ("NAME", "Name"),
]

print("Missing values")
for label, column in missing_report:
    print(f"{label}: ", train_df[column].isna().sum())

### **Data Cleaning**

In [None]:
# Impute the missing "Age" entries of the training set with the column mean,
# so that no rows have to be discarded because of a missing age.
mean_age = train_df["Age"].mean()
train_df["Age"] = train_df["Age"].fillna(mean_age)

In [None]:
# Fill the missing "Embarked" entries with the most frequent value of the column.
# The fill is applied through the DataFrame itself: calling
# train_df["Embarked"].fillna(..., inplace=True) is chained assignment, which
# emits a FutureWarning on pandas 2.x and leaves train_df unchanged once
# Copy-on-Write is active.
most_frequent_port = train_df["Embarked"].value_counts().idxmax()
train_df.fillna({"Embarked": most_frequent_port}, inplace=True)

In [None]:
# "Cabin" has too many missing values to be useful, so the whole column is
# removed from the training dataframe (in place).
train_df.drop(columns="Cabin", inplace=True)

In [None]:
# count the missing values that remain in each column after cleaning
train_df.isna().sum()

In [None]:
# One-hot encode the categorical features "Sex", "Pclass" and "Embarked".
# "Sex_female" is removed afterwards as a surplus feature: the "Sex_male"
# indicator produced by the same encoding stays in the frame.
training_df = pd.get_dummies(
    train_df,
    columns=["Sex", "Pclass", "Embarked"],
).drop(columns="Sex_female")
training_df

#### Repeat for test set

In [None]:
# display the test dataframe as the cell output
test_df

In [None]:
# count the missing values in each column of the test set
test_df.isna().sum()

In [None]:
# Data cleaning for test_df: impute missing "Age" entries with the mean age.
# NOTE(review): the mean is computed from the test set itself, not reused from
# the training set, so the two sets are imputed with different values --
# confirm this is intended.
mean_age = test_df["Age"].mean()
test_df["Age"] = test_df["Age"].fillna(mean_age)

In [None]:
# remove the "Cabin" column from the test set, mirroring the training set
test_df.drop(columns="Cabin", inplace=True)

In [None]:
# One-hot encode the same categorical features as for the training set and
# remove the surplus "Sex_female" indicator.
testing_df = pd.get_dummies(
    test_df,
    columns=["Sex", "Pclass", "Embarked"],
).drop(columns="Sex_female")
testing_df

### Feature Selection

In [None]:
# Drop features treated as irrelevant for predicting survival:
# "Name", "Ticket" and "PassengerId" are removed from both feature frames.
for frame in (training_df, testing_df):
    frame.drop(columns=["Name", "Ticket", "PassengerId"], inplace=True)

In [None]:
# Merge "SibSp" and "Parch" into a single flag: "Alone" is 0 when their sum is
# greater than zero and 1 otherwise.
relatives = training_df["SibSp"] + training_df["Parch"]
training_df['Alone'] = (~(relatives > 0)).astype(int)

In [None]:
# Same "Alone" flag for the test set: 0 when SibSp + Parch > 0, otherwise 1.
relatives = testing_df["SibSp"] + testing_df["Parch"]
testing_df['Alone'] = (~(relatives > 0)).astype(int)

In [None]:
# "SibSp" and "Parch" are now summarised by "Alone", so both columns are
# removed from the training and testing feature frames.
for frame in (training_df, testing_df):
    frame.drop(columns=["SibSp", "Parch"], inplace=True)

In [None]:
# final per-column count of missing values in the training features
training_df.isna().sum()

In [None]:
# final per-column count of missing values in the testing features
testing_df.isnull().sum()

In [None]:
# There is a single remaining NaN value in the testing set ("Fare");
# it is filled with the median fare of the testing set.
# The fill is applied through the DataFrame itself: calling
# testing_df["Fare"].fillna(..., inplace=True) is chained assignment, which
# emits a FutureWarning on pandas 2.x and leaves testing_df unchanged once
# Copy-on-Write is active (the NaN would then reach the model's predict call).
median_fare = testing_df["Fare"].median()
testing_df.fillna({"Fare": median_fare}, inplace=True)

### Model

The task is to predict whether a person survived the disaster or not
-> a binary classification problem. The model that I will use is Logistic Regression.

In [None]:
# Hold out 20% of the labelled training data for evaluation.
# random_state is fixed so the split is reproducible.
features = training_df.drop(columns='Survived')
target = training_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=101,
)

In [None]:
# create the Logistic Regression model
# (constructed with no arguments, i.e. the library defaults; nothing is fitted yet)
lr_model = LogisticRegression()

#### Train

In [None]:
# fit the model on the training portion of the split
lr_model.fit(X_train,y_train)

In [None]:
# predict labels for the held-out portion of the split
predict = lr_model.predict(X_test)

#### Visualise results

In [None]:
# print the classification report comparing held-out labels with the predictions
print(classification_report(y_test, predict))

In [None]:
# Confusion matrix of the held-out labels against the predictions:
# printed first, then drawn with mlxtend's plot_confusion_matrix.
print("Confusion matrix for Logistic Regression")
matrix = confusion_matrix(y_test, predict)
print(matrix, '\n', sep='\n')

fig, ax = plot_confusion_matrix(
    conf_mat=matrix,
    figsize=(10, 10),
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.show()

#### On Test Data

In [None]:
# predict labels for the cleaned test-set features
# NOTE(review): testing_df was encoded separately from training_df; its columns
# presumably match the training feature columns -- verify before relying on this
prediction = lr_model.predict(testing_df)

In [None]:
# display test_df, which still holds the "PassengerId" column used below
test_df

In [None]:
# Build the submission frame: "PassengerId" from the test set next to the
# predicted "Survived" labels, joined column-wise on the row index.
survived = pd.Series(prediction, name='Survived')
final_preds = pd.concat([test_df['PassengerId'], survived], axis=1)
final_preds

In [None]:
# count how many test passengers fall into each predicted class
final_preds["Survived"].value_counts()

Finally, convert the predictions dataframe to CSV for submission.

In [None]:
# write the predictions to a CSV file; index=False keeps the row index out of the file
final_preds.to_csv('titanic_predictions.csv', index=False)