<a href="https://colab.research.google.com/github/Israr-11/Machine-Learning-model-for-predicting-the-Titanic-survivor/blob/main/Titanic_Survival_prediction_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step#01 Importing libraries

In [23]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import joblib
from google.colab import drive

# Step#02 Importing the datasets

In [None]:
# Google Drive has to be mounted by hand before this cell runs; `path` is the dataset folder inside the drive.
path='/content/drive/MyDrive/Datasets/Titanic/'
train_csv, test_csv = path+'train.csv', path+'test.csv'

# Read the labelled training split and the unlabelled testing split.
training_dataset = pd.read_csv(train_csv)
testing_dataset = pd.read_csv(test_csv)

# Preview the first rows of the training data.
training_dataset.head()

# Step#03 Exploring the dataset

In [None]:
# Plot the boolean null-mask of the training data so columns with missing values stand out.
missing_mask = training_dataset.isnull()
sns.heatmap(data=missing_mask, cmap='viridis', cbar=False)
plt.show()

In [None]:
# info() prints column dtypes and non-null counts itself.
training_dataset.info()

# describe() only *returns* a summary frame. A notebook cell renders just its last expression, so in the
# middle of a cell the result must be displayed explicitly or it is silently discarded.
# `display` is provided as a builtin by the IPython/Colab kernel.
display(training_dataset.describe())

# Class balance of the target column.
sns.countplot(x='Survived', data=training_dataset)
plt.show()

# Step#04 Data Preprocessing

1. Handling Missing Values

In [5]:
# Fill missing Age values with the median age of the respective dataset.
# The filled column is assigned back rather than calling fillna(..., inplace=True) on the selected column:
# that chained in-place form is deprecated in recent pandas and does not modify the DataFrame when
# Copy-on-Write is enabled.
training_dataset['Age'] = training_dataset['Age'].fillna(training_dataset['Age'].median())
testing_dataset['Age'] = testing_dataset['Age'].fillna(testing_dataset['Age'].median())

# Fill missing Embarked values with the most frequent value of the column.
# mode() returns a Series; passing that Series to fillna aligns on the index and leaves the missing rows
# untouched, so the scalar is extracted with [0].
training_dataset['Embarked'] = training_dataset['Embarked'].fillna(training_dataset['Embarked'].mode()[0])
testing_dataset['Embarked'] = testing_dataset['Embarked'].fillna(testing_dataset['Embarked'].mode()[0])

# Fill missing Fare values with the median fare. The testing dataset is the one this step is meant for
# (the original line filled the training frame instead); the training frame is filled too so both frames
# are treated the same way.
training_dataset['Fare'] = training_dataset['Fare'].fillna(training_dataset['Fare'].median())
testing_dataset['Fare'] = testing_dataset['Fare'].fillna(testing_dataset['Fare'].median())

# Remove the Cabin column from both datasets (dropped, per the original note, because it likely has too many
# missing values to be useful). errors='ignore' keeps the cell re-runnable once the column is already gone.
training_dataset = training_dataset.drop(columns=['Cabin'], errors='ignore')
testing_dataset = testing_dataset.drop(columns=['Cabin'], errors='ignore')

2. Encode Categorical Variables

In [6]:
# Turn the text categories Sex and Embarked into integer codes so the models can consume them.
# For each column the encoder is fitted on the training values (fit_transform) and the identical mapping is
# then reused on the testing values (transform), keeping the two frames consistent.
label_encoder = LabelEncoder()
categorical_columns = ('Sex', 'Embarked')

for column in categorical_columns:
    train_codes = label_encoder.fit_transform(training_dataset[column])
    training_dataset[column] = train_codes
    # Same encoder instance, so the testing column receives the mapping learned just above.
    test_codes = label_encoder.transform(testing_dataset[column])
    testing_dataset[column] = test_codes

3. Feature Engineering

In [7]:
def _extract_title(full_name):
    """Return the honorific (e.g. "Mr", "Mrs", "Miss") from a passenger name.

    The name is split at the comma, the second part is split at the first period,
    and surrounding whitespace is stripped.
    """
    return full_name.split(',')[1].split('.')[0].strip()


# Common titles map to themselves; uncommon ones are grouped under 'Rare'.
title_map = {
    "Mr": "Mr", "Mrs": "Mrs", "Miss": "Miss", "Master": "Master",
    "Dr": "Rare", "Rev": "Rare", "Col": "Rare", "Major": "Rare",
    "Mlle": "Rare", "Countess": "Rare", "Ms": "Rare", "Lady": "Rare",
    "Jonkheer": "Rare", "Don": "Rare", "Dona": "Rare", "Mme": "Rare",
    "Capt": "Rare", "Sir": "Rare"
}

# map() yields NaN for any title that is not a key of title_map (for instance a multi-word title such as
# "the Countess"), so those are folded into 'Rare' as well instead of leaking NaN into the encoder.
training_dataset['Title'] = training_dataset['Name'].apply(_extract_title).map(title_map).fillna('Rare')
testing_dataset['Title'] = testing_dataset['Name'].apply(_extract_title).map(title_map).fillna('Rare')

# Encode the titles as integers. The encoder is fitted on the training set only and then applied with
# transform() to the testing set; calling fit_transform() on the testing set would learn a second,
# independent mapping that is not guaranteed to match the training one.
title_encoder = LabelEncoder()
training_dataset['Title'] = title_encoder.fit_transform(training_dataset['Title'])
testing_dataset['Title'] = title_encoder.transform(testing_dataset['Title'])

# Name and Ticket are no longer needed once the title has been extracted.
training_dataset = training_dataset.drop(columns=['Name', 'Ticket'])
testing_dataset = testing_dataset.drop(columns=['Name', 'Ticket'])

4. Feature Scaling

In [30]:
# Standardize the numeric features: z = (x - mean) / std, giving each one zero mean and unit variance.
features=['Age', 'Fare', 'SibSp', 'Parch']

# fit() learns the per-feature mean and standard deviation from the training data only.
scaler = StandardScaler()
scaler.fit(training_dataset[features])

# transform() applies those training statistics. Using the same fitted scaler for both frames keeps the
# testing data on exactly the same scale as the training data.
train_scaled = scaler.transform(training_dataset[features])
training_dataset[features] = train_scaled

test_scaled = scaler.transform(testing_dataset[features])
testing_dataset[features] = test_scaled

training_dataset.head()



Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,1,-0.565736,0.432793,-0.473674,-0.502445,2,2
1,2,1,1,0,0.663861,0.432793,-0.473674,0.786845,0,3
2,3,1,3,0,-0.258337,-0.474545,-0.473674,-0.488854,2,1
3,4,1,1,0,0.433312,0.432793,-0.473674,0.42073,2,3
4,5,0,3,1,0.433312,-0.474545,-0.473674,-0.486337,2,2


# Step#05 Model training and evaluation

1. Split the Data



In [9]:
# Separate the target column from the predictors.
# NOTE(review): X keeps every remaining column, including PassengerId - confirm that an identifier column
# is really wanted as a model input (it is unscaled and affects distance-based models such as k-NN).
y = training_dataset['Survived']
X = training_dataset.drop(columns=['Survived'])

# Hold out 20% of the labelled rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

2. Train multiple models

In [None]:
# Dictionary of classifiers: the key is a display name, the value is the instantiated estimator.
# LogisticRegression gets a higher max_iter to ensure convergence. The tree-based models are seeded with
# random_state so that re-running the notebook reproduces the same scores.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=200),
    'k-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'Naive Bayes': GaussianNB()
}

# Iterate over (name, estimator) pairs: `name` is the key (e.g. 'SVM'), `clf` is the classifier object.
for name, clf in classifiers.items():
    # Train the classifier on the training split.
    clf.fit(X_train, y_train)

    # Predict labels for the held-out validation split.
    y_pred = clf.predict(X_val)

    # Header line with the classifier name, followed by a blank line for readability.
    print(f'{name}:\n')

    # Per-class precision, recall, f1-score and support (y_val = true labels, y_pred = predictions),
    # then the overall accuracy.
    print(classification_report(y_val, y_pred))
    print(f'Accuracy: {accuracy_score(y_val, y_pred)}\n')


# Step#06 Hyperparameter Tuning (Example for K nearest neighbor)

Hyperparameter tuning is used to increase the accuracy of a model. The accuracy of the k-nearest-neighbors model is quite low, about 56.9% (0.569), which is why it is being fine-tuned here.

1. Grid Search for Hyperparameters

In [None]:
# param_grid lists the hyperparameters and the candidate values to try:
# 'n_neighbors': number of neighbors to use.
# 'weights': how neighbors are weighted ('uniform' = all neighbors count equally,
#            'distance' = closer neighbors count more).
# 'metric': distance metric ('euclidean', 'manhattan', 'minkowski').
# NOTE: 'minkowski' with the default p=2 is the Euclidean distance, so those two candidates are equivalent.
param_grid={
    'n_neighbors':[7,9,11,13,15,17,19],
    # Must be spelled 'uniform'; a misspelled value is rejected by KNeighborsClassifier, so every
    # candidate using it would fail to fit.
    'weights':['uniform', 'distance'],
    'metric':['euclidean', 'manhattan', 'minkowski']
}

knn= KNeighborsClassifier()

# GridSearchCV performs an exhaustive search over the parameter grid.
# estimator=knn: the k-NN classifier to tune.
# param_grid=param_grid: the grid defined above.
# cv=3: 3-fold cross-validation - the data is split into three parts and the model is trained and validated
#       three times, each time validating on a different part.
# n_jobs=-1: use all available CPU cores.
# verbose=2: print progress messages during the search.
grid_search_knn= GridSearchCV(estimator=knn, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search_knn.fit(X_train, y_train)

# best_params_ is the best hyperparameter combination found;
# best_score_ is the mean cross-validation score achieved with it.
print(grid_search_knn.best_params_)
print(grid_search_knn.best_score_)

2. Train final k-NN Model

The accuracy of the k-nearest-neighbors model improved by about 7.3 percentage points, to 64.24% from the initial value of 56.9%.




In [None]:
# Take the winning hyperparameter combination from the grid search ...
best_params = grid_search_knn.best_params_

# ... build a fresh k-NN classifier from it and train that on the training split.
best_knn = KNeighborsClassifier(**best_params)
best_knn.fit(X_train, y_train)

# Evaluate the tuned model on the validation split.
y_pred_knn = best_knn.predict(X_val)
knn_accuracy = accuracy_score(y_val, y_pred_knn)

print(classification_report(y_val, y_pred_knn))
print(f'Accuracy: {knn_accuracy}')


# Step#07 Prediction on test data and Plotting of results

In [None]:
# Confusion matrix of the tuned k-NN model on the validation split (rows = actual, columns = predicted).
conf_matrix = confusion_matrix(y_val, y_pred_knn)

# sns.heatmap returns the Axes it drew on, so the labels are set on that object directly.
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix for k-NN Model')
plt.show()

# The confusion matrix for the k-NN model shows that the model accurately predicted 97 passengers who did not survive and 18 passengers who
# survived. However, it incorrectly classified 56 actual survivors as non-survivors and 8 non-survivors as survivors, indicating a significant
# number of false negatives and suggesting room for improvement in model performance.

# Step#08 Saving the Model

In [25]:
# Persist the tuned k-NN classifier to disk. Only the classifier is written here - the fitted scaler and
# label encoders are not part of this file. joblib.dump returns the list of file names it wrote.
model_filename = 'Titanic_survivor_prediction_2.pkl'
joblib.dump(best_knn, model_filename)

['Titanic_survivor_prediction_2.pkl']

# Step#09 Testing on real world data

In [42]:
# A single hand-written passenger. Categorical values use the integer codes produced by LabelEncoder during
# preprocessing; LabelEncoder numbers the classes in sorted order.
sample_data = {
    'PassengerId': [1],    # Example ID; still a model input because X keeps the PassengerId column
    'Pclass': [1],         # Example class: 1 for first class
    'Sex': [1],            # Encoding: female=0, male=1
    'Age': [29],           # Raw age in years (scaled below)
    'SibSp': [1],
    'Parch': [0],
    'Fare': [50],          # Raw fare (scaled below)
    'Embarked': [0],       # Encoding: C=0, Q=1, S=2
    'Title': [2]           # Encoding: Master=0, Miss=1, Mr=2, Mrs=3, Rare=4
}

# Convert to DataFrame
sample_df = pd.DataFrame(sample_data)

# The model was trained on standardized Age/Fare/SibSp/Parch, so the raw sample values must go through the
# same fitted scaler; otherwise the sample lies far outside the training feature range.
sample_df[features] = scaler.transform(sample_df[features])

# Predict
survival_prediction = best_knn.predict(sample_df)
print(f'Survival Prediction: {"Survived" if survival_prediction[0] == 1 else "Did not survive"}')



Survival Prediction: Survived
