In [None]:
#import necessary libraries
import pandas as pd
import numpy as np


- The pandas library is used to read and manipulate the data in the notebook
- The NumPy library is used to work with arrays and numerical data


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

The `drive` module from the google.colab library is used to mount Google Drive, which gives the notebook permission to access my files

In [None]:
# Load the Titanic CSV from the mounted Drive into a DataFrame
# NOTE(review): the file name contains two consecutive spaces — confirm it matches the file on Drive
titanic = pd.read_csv('/content/drive/My Drive/Titanic  Dataset.csv')

The dataset is read from my Google Drive into the notebook and stored in the variable `titanic`

In [None]:
# Preview the first 10 rows to sanity-check that the CSV was parsed as expected
titanic.head(10)

- Viewing the first 10 rows in my data

**DATA PREPROCESSING**

In [None]:
# Inspect the structure of the data: column names, dtypes and non-null counts
titanic.info()

- Checking a summary of the data (the number of rows, and each column's datatype and non-null count)

In [None]:
# Partition the column names by dtype: object/category columns are treated as
# categorical, every other dtype as non-categorical. Column order is preserved.
_is_categorical = [
    titanic[name].dtype in ('object', 'category') for name in titanic.columns
]
categorical_column = [
    name for name, flag in zip(titanic.columns, _is_categorical) if flag
]
non_categorical_column = [
    name for name, flag in zip(titanic.columns, _is_categorical) if not flag
]
# sep="\n" puts each list on its own line beneath its heading
print("Categorical column is:", categorical_column, sep="\n")
print("\nnon_categorical column is:", non_categorical_column, sep="\n")

- Grouping our data to check for categorical and non categorical dtypes

**Data Cleaning**

In [None]:
# Count the missing values in each column to decide which ones need cleaning
titanic.isnull().sum()

- Since there are nulls in Age, Cabin and Embarked, clean the data by filling in the nulls

In [None]:
# Replace the missing ages with the mean age.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the titanic['Age'] selection: that chained in-place
# form raises a FutureWarning in pandas 2.x and leaves `titanic` unchanged
# once Copy-on-Write is active.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())
# Confirm that no missing ages remain (expected: 0)
titanic['Age'].isnull().sum()

- Since Age is numeric, replace the nulls with the mean

In [None]:
# Replace the missing Cabin and Embarked values with each column's mode
# (mode() returns a Series, so [0] picks the first most-frequent value).
# Assigning the result back avoids the chained fillna(inplace=True) pattern,
# which warns in pandas 2.x and does not modify `titanic` under Copy-on-Write.
titanic['Cabin'] = titanic['Cabin'].fillna(titanic['Cabin'].mode()[0])
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])


In [None]:
# Verify that both imputed columns are free of nulls. A cell only displays its
# last expression, so the two columns are selected together to show both counts.
titanic[['Cabin', 'Embarked']].isnull().sum()

Cabin and Embarked are objects in categorical data and their nulls are replaced with mode

In [None]:
# Count the rows that are exact duplicates of an earlier row
titanic.duplicated().sum()

Check whether the data has duplicate rows; in this case there were none

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the data
titanic.describe()

- View a summary of the statistics of the data

**MODEL BUILDING**

In [None]:
# List the distinct values of the target column to confirm it is binary
titanic['Survived'].unique()

In binary classification 0 means false and 1 means true,
so 0 means the passenger did not survive and 1 means the passenger survived

Our target variable is Survived and it is binary so its best to use Logistic regression which deals with binary classification


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score


Import the necessary libraries for training and testing the model

**Feature Engineering**

Transform raw data from our data to features usable in machine learning.

Do this by encoding the relevant data to be used


In [None]:
#Encoding the relevant categorical columns
# Encode Sex as a binary feature: male -> 0, female -> 1.
# The dtype guard makes the cell safe to re-run: mapping the already-encoded
# 0/1 values through the dict a second time would turn every entry into NaN.
if not pd.api.types.is_numeric_dtype(titanic['Sex']):
    titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})
# Show the encoded values as a sanity check
print(titanic['Sex'].unique())


In [None]:
# Encode the port of embarkation as integers: S -> 0, C -> 1, Q -> 2.
# The dtype guard makes the cell safe to re-run: mapping the already-encoded
# integers through the dict a second time would turn every entry into NaN.
if not pd.api.types.is_numeric_dtype(titanic['Embarked']):
    titanic['Embarked'] = titanic['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
# Show the encoded values as a sanity check
print(titanic['Embarked'].unique())

**Training and Testing the model**

In [None]:
# Build the feature matrix (x) from the numeric/encoded predictors and take
# Survived as the target (y)
_base_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
x = titanic[_base_features]
y = titanic['Survived']

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state fixes the shuffle seed so the same split is produced on every run.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=50)


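- Train on 80% of the data and test on the remaining 20%
- random_state is the seed for the shuffle applied before splitting; fixing it makes the split reproducible
- Its value carries no meaning for model quality: different seeds simply give different splits, so accuracy varies by chance. 50 is the seed used here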

In [None]:
# Create the logistic-regression estimator; max_iter is raised to 1000 iterations
model = LogisticRegression(max_iter=1000)

Give the model a variable

In [None]:
# Train the model on the training split only
model.fit(x_train,y_train)

In [None]:
# Predict the class (0 or 1) for every row of the held-out test split
y_pred = model.predict(x_test)

Make a prediction of the outcome you need

In [None]:
# Accuracy = fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Get the accuracy of the prediction

The model was 80% accurate

**Retraining the data with more independent variables to increase performance**


- Encode the remaining categorical columns so they can be used as independent variables

In [None]:
# Integer-encode the remaining text columns so they can be passed to the model
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
# Columns to encode in place.
# NOTE(review): LabelEncoder is documented for encoding target labels and
# yields arbitrary integer codes; Name and Ticket are presumably near-unique
# per passenger, so these codes may carry little signal — confirm they are
# worth keeping as features.
encoded_columns = ['Name', 'Ticket', 'Cabin']
for column in encoded_columns:
    # The single encoder is re-fitted on every column, so after the loop `le`
    # only holds the classes of the last column
    titanic[column] = le.fit_transform(titanic[column])

In [None]:
# Rebuild the feature matrix with the three newly encoded columns appended,
# then repeat the split / fit / predict / score pipeline on it
_all_features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
    'Name', 'Ticket', 'Cabin',
]
x, y = titanic[_all_features], titanic['Survived']
# Same 80/20 split and seed as the first run
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=50
)
# fit() returns the estimator itself, so creation and training can be chained
model = LogisticRegression(max_iter=1000).fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

After including all the columns as the independent variable, the model accuracy increased to 82%

All columns were important in predicting the outcome of our model.

**HYPERPARAMETERS**

The model was 82% accurate. Hyperparameters are used to improve the model performance by tuning the model using parameters to a better percentage

In [None]:
# Candidate values for C, the inverse regularisation strength, to search over
param_grid = {'C': [0.001, 0.01, 0.1, 1]}
# Fresh, unfitted estimator that the grid search will clone for each candidate
model = LogisticRegression(max_iter=1000)

Make a dictionary of the parameter values that will be searched to find the best parameter

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,roc_auc_score


Import the GridSearchCV class from the sklearn model_selection module, which will help in finding the best parameters to use

In [None]:
# 5-fold cross-validated search over param_grid on the training split;
# fit() returns the search object, so construction and fitting are chained
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
).fit(x_train, y_train)
# Parameter combination with the best mean cross-validation score
best_params = grid_search.best_params_
print(best_params)

- cv stands for cross-validation: the training data is split into 5 folds, and each hyperparameter value is trained on 4 folds and tested on the remaining one, rotating through all 5 folds
- Fit the grid search to the data.
- After the grid search, using the function best_params_, find the best parameters in the parameter grid

Fit the new model to the data

In [None]:
# Fits the untuned `model` from the parameter-grid cell on the training split.
# NOTE(review): the following cells predict with `best_model`, so this fit looks redundant — confirm before removing.
model.fit(x_train,y_train)

In [None]:
# Build a new logistic regression with the best parameters found by the grid
# search and train it on the full training split
best_model = LogisticRegression(max_iter=1000,**best_params)
best_model.fit(x_train, y_train)

In [None]:
# Predict test-set classes with the tuned model
y_pred = best_model.predict(x_test)

In [None]:
# Score the tuned model on the held-out test split
from sklearn.metrics import accuracy_score,roc_auc_score
accuracy = accuracy_score(y_test, y_pred)
# ROC AUC needs the predicted probability of the positive class (column 1).
# The result is stored under its own name so the imported roc_auc_score
# function is not overwritten by a float.
roc_auc = roc_auc_score(y_test, best_model.predict_proba(x_test)[:,1])
print(f"Accuracy: {accuracy}")
print(f"ROC AUC Score: {roc_auc}")


**Hyperparameter tuning 2**

In [None]:
# Fresh logistic regression with default hyperparameters (apart from max_iter),
# trained on the training split; it is also the estimator passed to the grid search below
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

In [None]:
# Grid for the second search: the penalty type, with 'l2' as the only candidate
# NOTE(review): with a single candidate the search has nothing to compare — confirm whether more values were intended
param_grid = dict(penalty=['l2'])

In [None]:
# 5-fold cross-validated search over param_grid on the training split.
# The search is built and fitted exactly once; repeating the identical
# construct-and-fit pair only doubles the runtime for the same result.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)
# Parameter combination with the best mean cross-validation score
best_params = grid_search.best_params_
print(best_params)

In [None]:
# Build a new logistic regression with the best parameters from the second
# grid search and train it on the full training split
best_model = LogisticRegression(max_iter=1000,**best_params)
best_model.fit(x_train, y_train)

In [None]:
# Predict test-set classes with the model from the second tuning round
y_pred = best_model.predict(x_test)

In [None]:
# Score the tuned model on the held-out test split
from sklearn.metrics import roc_auc_score
accuracy1 = accuracy_score(y_test, y_pred)
# Take the probabilities from best_model, the same estimator that produced
# y_pred, so both metrics describe the tuned model rather than the baseline `model`
roc_auc = roc_auc_score(y_test, best_model.predict_proba(x_test)[:, 1])
print("Accuracy:", accuracy1)
print("ROC AUC Score:", roc_auc)

ALTERNATIVELY


Using a different model

**Random Forest**

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Feature matrix with all ten encoded predictors; Survived is the target
_rf_features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
    'Name', 'Ticket', 'Cabin',
]
x, y = titanic[_rf_features], titanic['Survived']
# 80/20 train/test split with a fixed shuffle seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=50
)
# Depth-limited forest with a fixed seed; fit() returns the estimator itself,
# so creation and training are chained
model = RandomForestClassifier(
    criterion='gini', max_depth=3, random_state=0
).fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Random Forest classifier had a 82% accuracy better than Logistic regression

**Hyperparameter tuning in Random Forest**

In [None]:
# Hyperparameter candidates for the random-forest search: every combination of
# the minimum samples needed to split a node and to form a leaf is tried
param_grid = dict(
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[1, 2, 3],
)


In [None]:
from sklearn.model_selection import GridSearchCV
# Estimator to tune: same criterion and depth as the baseline forest.
# random_state is fixed so the cross-validation scores, and therefore the
# selected parameters, are reproducible from run to run.
dt = RandomForestClassifier(criterion='gini', max_depth=3, random_state=0)
# 5-fold cross-validated search over param_grid on the training split
grid_search = GridSearchCV(dt, param_grid, cv=5)
grid_search.fit(x_train, y_train)
# Parameter combination with the best mean cross-validation score
best_params = grid_search.best_params_
print(best_params)

In [None]:
from sklearn.metrics import roc_auc_score
# Evaluate the tuned forest. GridSearchCV (refit=True by default) has already
# re-trained the best parameter combination on the full training split, and it
# keeps the criterion/max_depth settings of the searched estimator. Rebuilding
# a forest from best_params alone would silently drop those settings.
best_model = grid_search.best_estimator_
# Hard class predictions for accuracy, positive-class probabilities for ROC AUC
y_pred = best_model.predict(x_test)
y_pred_best = best_model.predict_proba(x_test)[:, 1]

# Both metrics now describe the same (tuned) model
accuracy = accuracy_score(y_test, y_pred)
# Stored under its own name so the imported roc_auc_score function is not
# overwritten by a float
roc_auc = roc_auc_score(y_test, y_pred_best)

print(f"Accuracy score: {accuracy}")
print(f"ROC AUC SCORE: {roc_auc}")

Hyperparameters in Random forest tuned the 82% model to 88% model

This makes Random Forest the most fit model in predicting the data