<a href="https://colab.research.google.com/github/KingsleySepeng/DeepLearningNotebooks/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Kaggle Titanic training split from the mounted Google Drive folder.
TRAIN_CSV_PATH = "/content/drive/MyDrive/ML_Practice/Titanic/titanic.zip (Unzipped Files)/train.csv"
train_data = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first five rows to get a feel for the raw columns.
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Explore the data: per-column dtypes and non-null counts show which columns
# contain missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
# Count the missing values in every column.
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Handle the missing values for Age, Cabin and Embarked.
# Age: fill the gaps with the median age of the training data.
median_age = train_data['Age'].median()
# Assign the result back instead of calling `fillna(..., inplace=True)` on the
# column selection: that form is chained assignment, which pandas 2.x warns
# about and which no longer modifies `train_data` under Copy-on-Write.
train_data['Age'] = train_data['Age'].fillna(median_age)


In [None]:
# Cabin is missing for the large majority of rows, so the column is dropped
# rather than imputed.
train_data = train_data.drop(columns=['Cabin'])

In [None]:
# Embarked: fill the missing entries with the most frequent port of embarkation.
mode_embarked = train_data['Embarked'].mode()[0]
# Assign back rather than using `fillna(..., inplace=True)` on the column
# selection; the in-place form is chained assignment and stops updating
# `train_data` under pandas Copy-on-Write.
train_data['Embarked'] = train_data['Embarked'].fillna(mode_embarked)

In [None]:
# Re-count missing values to confirm the cleaning steps left none behind.
train_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [None]:
# Encode Sex as a numeric indicator (male -> 0, female -> 1).
# NOTE: `map` turns any value not present in the mapping into NaN, so this
# cell must only be run once per freshly loaded frame.
sex_codes = {'male':0,'female':1}
train_data['Sex'] = train_data['Sex'].map(sex_codes)

In [None]:
#one hot encoding creates dummy variables for each category
train_data = pd.get_dummies(train_data,columns=['Embarked'],drop_first=True)

In [None]:
# Separate the target (y) from the predictors (X), then hold out 20% of the
# rows as a validation set. PassengerId, Name and Ticket are dropped from the
# predictors along with the target itself.
from sklearn.model_selection import train_test_split

y = train_data['Survived']
X = train_data.drop(columns=['PassengerId','Name','Ticket','Survived'])

# A fixed random_state keeps the split identical between runs.
X_train,X_val,y_train,y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the validation split.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)


In [None]:
#Building the model
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import set_random_seed

# Seed Python, NumPy and the Keras backend before any layer is created so that
# weight initialisation and batch shuffling - and therefore the reported
# validation metrics - are the same on every Restart & Run All.
set_random_seed(42)

# Small fully connected network: two ReLU hidden layers followed by a single
# sigmoid unit that outputs the probability of survival.
model = Sequential()
model.add(Dense(64,activation='relu',input_shape =(X_train_scaled.shape[1],)))
model.add(Dense(32,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

# Binary cross-entropy matches the sigmoid output and the 0/1 target.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

# Keep the History object so the per-epoch curves can be inspected later,
# instead of letting its repr become the cell output.
history = model.fit(X_train_scaled,y_train,epochs=10,batch_size=32,validation_data=(X_val_scaled,y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e0f3138d090>

In [None]:
# Score the trained network on the held-out validation split. evaluate()
# returns the loss followed by the metrics passed to compile() (accuracy here).
loss, accuracy = model.evaluate(
    X_val_scaled,
    y_val,
)
print("Validation Loss: ", loss)
print("Validation Accuracy: ", accuracy)

Validation Loss:  0.4267108738422394
Validation Accuracy:  0.7988826632499695


In [None]:
# Load the Kaggle Titanic test split; it is preprocessed in the cells below.
TEST_CSV_PATH = "/content/drive/MyDrive/ML_Practice/Titanic/titanic.zip (Unzipped Files)/test.csv"
test_data = pd.read_csv(TEST_CSV_PATH)

In [None]:
# Per-column dtypes and non-null counts for the test set.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [None]:
# Count the missing values in every test-set column.
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [None]:
# Preview the first five raw test rows.
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
# Preprocess the test set with the SAME statistics that were used on the
# training data, so the model sees inputs prepared exactly as during training.

# Age: reuse the training median computed during training-data cleaning rather
# than a separate test-set median.
# (Assign back instead of `fillna(..., inplace=True)` on a column selection,
# which is chained assignment and a no-op under pandas Copy-on-Write.)
test_data['Age'] = test_data['Age'].fillna(median_age)

# Fare: the test set has a missing fare; fill it with the training median.
median_train_fare = train_data['Fare'].median()
test_data['Fare'] = test_data['Fare'].fillna(median_train_fare)

# Cabin was dropped from the training features, so drop it here as well.
test_data = test_data.drop(columns=['Cabin'])

# Embarked has no missing values in the test set, so no imputation is needed.

# Same numeric encoding of Sex as for the training data.
test_data['Sex'] = test_data['Sex'].map({'male':0,'female':1})

In [None]:
# One-hot encode Embarked. All categories are kept here (no drop_first): which
# category counts as "first" depends on the values present in this frame, so
# dropping it independently of the training data could remove a different
# baseline column than the one removed during training.
test_data = pd.get_dummies(test_data,columns=['Embarked'])

# Align the test features to the training feature matrix X: same columns in
# the same order. Columns the model never saw (identifiers, the training
# baseline dummy) are discarded; a dummy column absent from the test set is
# added as all zeros. The scaler and the network both rely on this layout.
test_features = test_data.reindex(columns=X.columns, fill_value=0)

# Apply the scaler fitted on the training split; never refit on test data.
test_features_scaled= scaler.transform(test_features)

In [None]:
# Preview the test frame after cleaning and encoding.
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked_Q,Embarked_S
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,0,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,1,0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,0,1


In [None]:
# Predict on the scaled test features. The network ends in a single sigmoid
# unit, so each prediction is a value between 0 and 1.
predictions = model.predict(
    test_features_scaled,
)



In [None]:
# Sanity check: any NaN in the predictions would point to NaNs that slipped
# through preprocessing into the model input.
nan_indices = np.isnan(predictions)
nan_values = predictions[nan_indices]  # boolean mask -> 1-D array of the NaN entries
print("Number of NaN values: ", len(nan_values))

Number of NaN values:  0


In [None]:
# Convert the predicted probabilities to 0/1 labels by rounding, then write the
# two-column (PassengerId, Survived) submission file.
binary_predictions = np.round(predictions).astype(int)
submission_df = pd.DataFrame(
    {
        'PassengerId': test_data['PassengerId'],
        'Survived': binary_predictions.flatten(),  # (n, 1) -> (n,)
    }
)
# NOTE(review): the output path has no ".csv" extension - confirm this is intended.
submission_df.to_csv(
    '/content/drive/MyDrive/ML_Practice/Practice_Data/titanicSubmission',
    index=False,
)

In [None]:
#Kaggle Score: 0.77033

**TODO:** Exploratory data analysis (EDA) and feature engineering still need to be done.