In [None]:
#
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# os.walk() does not expand "~": the original call looked for a literal "~"
# directory under the current working directory and silently listed nothing.
# Resolve the home directory explicitly before walking the project tree.
for dirname, _, filenames in os.walk(os.path.expanduser('~/Documents/Titanic_project')):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [None]:
#IMPORT LIBRARIES
import seaborn as sns #library visualization
import matplotlib.pyplot as plt


In [None]:
#LOAD THE DATA
# Read the train and test CSV files into two separate DataFrames.
# NOTE(review): the paths are hard-coded to one user's home-directory layout;
# adjust them if the CSVs live elsewhere.

df_train = pd.read_csv('~/Documents/Titanic_project/titanicproject/train.csv')
df_test = pd.read_csv('~/Documents/Titanic_project/titanicproject/test.csv')



In [None]:
#SHOW FIRST 10 ROWS (TRAIN DATA)
df_train.head(10)


In [None]:
#SHOW FIRST 5 ROWS (TEST DATA)
df_test.head(5)

In [None]:
# Check for null values (TRAIN DATA): list every column holding at least one NaN
print("Columns with missing values: ")
train_has_missing = df_train.isna().any()
print(train_has_missing[train_has_missing].index.tolist())

In [None]:
# Check for null values (TEST DATA): list every column holding at least one NaN
print("Columns with missing values: ")
test_has_missing = df_test.isna().any()
print(test_has_missing[test_has_missing].index.tolist())

In [None]:
print(df_train.isnull().sum())

In [None]:
print(df_test.isnull().sum())

In [None]:
# 3 ways to handle null values

#1.Remove NaN rows 
#2.Set NaN to hard coded value 
#3.Impute NaN values based on other rows


In [None]:
# DROPPING NaN ROWS
# Keep only the training rows that have an 'Embarked' value and report the
# row count on both sides of the filter.
print(f"Before dropping - {len(df_train)} rows")
df_train = df_train[df_train['Embarked'].notna()]
print(f"After dropping - {len(df_train)} rows")

In [None]:
# FILL NaN WITH MEAN VALUE
# Impute the missing 'Fare' entries of the test data with the column mean and
# report how many values were missing before and after the fill.
mean_fare = df_test['Fare'].mean()
print(f"Mean value of fare = {mean_fare!s}")
print(f"NA count before fill = {int(df_test['Fare'].isna().sum())}")
df_test['Fare'] = df_test['Fare'].fillna(mean_fare)
print(f"NA count after fill = {int(df_test['Fare'].isna().sum())}")

In [None]:
print(df_test.isnull().sum())

In [None]:
# FILL NaN WITH MEAN VALUE (TRAIN DATA)
# Impute the missing 'Age' entries of the training data with the column mean
# and report how many values were missing before and after the fill.
mean_age = df_train['Age'].mean()
print(f"Mean age of passengers = {mean_age!s}")
print(f"NA count before fill = {int(df_train['Age'].isna().sum())}")
df_train['Age'] = df_train['Age'].fillna(mean_age)
print(f"NA count after fill = {int(df_train['Age'].isna().sum())}")

In [None]:
# FILL NaN WITH MEAN VALUE (TEST DATA)
# Impute the missing 'Age' entries of the test data with the test-column mean
# and report how many values were missing before and after the fill.
mean_age = df_test['Age'].mean()
print(f"Mean age of passengers = {mean_age!s}")
print(f"NA count before fill = {int(df_test['Age'].isna().sum())}")
df_test['Age'] = df_test['Age'].fillna(mean_age)
print(f"NA count after fill = {int(df_test['Age'].isna().sum())}")

In [None]:
#Statistical distribution
df_train.describe()

In [None]:
hist = df_train.hist(figsize=(10,10),layout=(3,4))


In [None]:
sns.pairplot(df_train)
plt.show()

In [None]:
# Pairwise correlations between the numeric features.
# Restrict to numeric columns first: on pandas >= 2.0 DataFrame.corr() no
# longer silently skips text columns (Name, Sex, Ticket, ...) and raises
# instead. On older versions the result is unchanged.
corr = df_train.select_dtypes(include='number').corr()

corr.style.background_gradient(cmap='coolwarm')



Correlations observed in the Titanic data:

- Pclass and Fare
- Age and Parch
- Age and SibSp
- Age and Pclass
- Pclass and Survived

In [None]:
#FEATURE ENGINEERING - GET RELATIVE COUNT BY SUMMING 'SibSp' &  'Parch'
df_train['RelativeCount'] = df_train['SibSp'] + df_train['Parch']
df_test['RelativeCount'] = df_test['SibSp'] + df_test['Parch']
# Only the last expression of a cell is rendered, so the summary has to be
# printed explicitly; as a bare expression in the middle of the cell it was
# computed and then discarded.
print(df_train['RelativeCount'].describe())
df_test.head(5)


Was a passenger travelling alone?

`numpy.where(condition, x, y)` returns elements chosen from `x` or `y` depending on `condition`. See https://numpy.org/doc/stable/reference/generated/numpy.where.html

In [None]:
# FEATURE ENGINEERING - GET "Was a passenger travelling alone?"
# TravelAlone is 0 (no) when SibSp + Parch is greater than 0, otherwise 1 (yes).
for frame in (df_train, df_test):
    relatives_aboard = frame['SibSp'] + frame['Parch']
    frame['TravelAlone'] = np.where(relatives_aboard > 0, 0, 1)


In [None]:
# FEATURE 'Age' CONVERT TO CATEGORICAL VALUE - AGEBUCKETING
def bucket_age(age):
    """Return the age-range label for a single passenger age.

    The ranges are half-open: below 15 -> "<15", [15, 30) -> "15-30",
    [30, 45) -> "30-45", [45, 60) -> "45-60". Anything that is not below
    60 (including NaN, for which every comparison is False) -> ">60".
    """
    upper_bounds = ((15, "<15"), (30, "15-30"), (45, "30-45"), (60, "45-60"))
    for limit, label in upper_bounds:
        if age < limit:
            return label
    return ">60"

# Label every passenger in both frames with an age bucket.
for frame in (df_train, df_test):
    frame['AgeBucket'] = frame['Age'].apply(bucket_age)

# Visualising the training-set bucket counts with a pie chart
print('TRAIN:')
train_age_bucket_counts = df_train['AgeBucket'].value_counts()
pie = train_age_bucket_counts.plot.pie(title='AgeBucket Distribution', legend=True, autopct='%1.1f%%')


In [None]:
# Pie chart of the test-set age bucket counts
print('TEST:')
test_age_bucket_counts = df_test['AgeBucket'].value_counts()
pie_test = test_age_bucket_counts.plot.pie(title='AgeBucket Distribution', legend=True, autopct='%1.1f%%')

In [None]:
# EXTRACTING NAME LENGTH FROM FEATURE 'Name'
# Name_Length is the character count of 'Name' plus one, stored as int64.
df_train['Name_Length'] = df_train['Name'].str.len().astype(np.int64) + 1

# PLOTTING: bar plot of 'Survived' against name length
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(data=df_train, x='Name_Length', y='Survived', ax=ax)


In [None]:
# Same Name_Length feature (character count of 'Name' plus one) for the test data.
df_test['Name_Length'] = df_test['Name'].str.len().astype(np.int64) + 1




In [None]:
# EXTRACTING TITLE FROM A NAME (TRAIN DATA)
# Overwrite each 'Name' with the run of letters/spaces that follows the first
# comma (for "Last, Mr. First" that is " Mr", leading space included).
# The assignment is explicit instead of writing through `df_train.Name.values`:
# mutating that array only reached the DataFrame while it happened to be a
# writable view, and it is read-only under pandas Copy-on-Write. Names without
# a comma now become NaN instead of raising AttributeError.
import re  # not used here, but kept: the test-data cell below calls re.search
title = df_train['Name'].str.extract(r',([A-Za-z ]*)', expand=False)
df_train['Name'] = title


In [None]:
# EXTRACTING TITLE FROM NAME (TEST DATA)
# Overwrite each 'Name' with the run of letters/spaces that follows the first
# comma (for "Last, Mr. First" that is " Mr", leading space included).
# Done with an explicit column assignment rather than by mutating
# `df_test.Name.values` in place, which depends on that array being a writable
# view of the DataFrame (it is read-only under pandas Copy-on-Write). This
# form also needs no `re` import, and names without a comma become NaN instead
# of raising AttributeError.
title = df_test['Name'].str.extract(r',([A-Za-z ]*)', expand=False)
df_test['Name'] = title


In [None]:
# In order to encode the text values by putting a running sequence for each text values from whole dataset, we first concatenate df_train and df_test before label encoding
all_data = pd.concat([df_train, df_test])
all_data.shape

In [None]:
# SEX - LABEL ENCODING
# Fit one encoder on the combined train+test values (all_data) so both frames
# share the same value -> integer mapping, then encode each frame with it.
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
trained_le = le.fit(all_data.Sex)
for frame in (df_train, df_test):
    frame['Sex'] = trained_le.transform(frame['Sex'])


In [None]:
# EMBARKED - LABEL ENCODING
# Fit one encoder on the combined train+test values (all_data) so both frames
# share the same value -> integer mapping, then encode each frame with it.
le = preprocessing.LabelEncoder()
trained_le = le.fit(all_data.Embarked)
for frame in (df_train, df_test):
    frame['Embarked'] = trained_le.transform(frame['Embarked'])


In [None]:
# AGE_BUCKET - LABEL ENCODING
# Fit one encoder on the combined train+test buckets (all_data) so both frames
# share the same label -> integer mapping, then encode each frame with it.
le = preprocessing.LabelEncoder()
trained_le = le.fit(all_data.AgeBucket)
for frame in (df_train, df_test):
    frame['AgeBucket'] = trained_le.transform(frame['AgeBucket'])

In [None]:
# DUMMY ENCODING
# One-hot encode the label-encoded AgeBucket codes of the training frame and
# append the indicator columns (created with prefix 'AgeBucket') to df_train.
# NOTE(review): the test frame is encoded separately in the next cell, so the
# two frames only get identical columns if both contain every bucket.
agebucket_dummies=pd.get_dummies(df_train.AgeBucket, prefix='AgeBucket')
df_train=pd.concat([df_train,agebucket_dummies],axis=1)

In [None]:
# One-hot encode the label-encoded AgeBucket codes of the test frame and
# append the indicator columns (created with prefix 'AgeBucket') to df_test.
# NOTE(review): encoded independently of the training frame; the columns only
# match if the test data contains the same set of buckets.
agebucket_dummies=pd.get_dummies(df_test.AgeBucket, prefix='AgeBucket')
df_test=pd.concat([df_test,agebucket_dummies],axis=1)

In [None]:
all_data['Fare'].sort_values(ascending=True)

In [None]:
#FEATURE 'Fare' CONVERT TO CATEGORICAL VALUE 
def bucket_Fare(Fare):
    """Return the fare-range label for a single ticket fare.

    The ranges are half-open: below 100 -> "<100", [100, 200) -> "100-200",
    [200, 300) -> "200-300", [300, 400) -> "300-400". Anything that is not
    below 400 (including NaN, for which every comparison is False) -> ">400".
    """
    upper_bounds = ((100, "<100"), (200, "100-200"), (300, "200-300"), (400, "300-400"))
    for limit, label in upper_bounds:
        if Fare < limit:
            return label
    return ">400"

# Label every passenger in both frames with a fare bucket.
for frame in (df_train, df_test):
    frame['FareBucket'] = frame['Fare'].apply(bucket_Fare)

# Visualize the training-set bucket counts with a pie chart
print('TRAIN:')
train_fare_bucket_counts = df_train['FareBucket'].value_counts()
pie = train_fare_bucket_counts.plot.pie(title='FareBucket Distribution', legend=True, autopct='%1.1f%%')


In [None]:
# Pie chart of the test-set fare bucket counts
test_fare_bucket_counts = df_test['FareBucket'].value_counts()
pie = test_fare_bucket_counts.plot.pie(title='FareBucket Distribution', legend=True, autopct='%1.1f%%')


In [None]:
all_data = pd.concat([df_train, df_test])


In [None]:
# FareBucket - LABEL ENCODING
# Fit one encoder on the combined train+test buckets (all_data) so both frames
# share the same label -> integer mapping, then encode each frame with it.
le = preprocessing.LabelEncoder()
trained_le = le.fit(all_data.FareBucket)
for frame in (df_train, df_test):
    frame['FareBucket'] = trained_le.transform(frame['FareBucket'])

In [None]:
# DUMMY ENCODING
# One-hot encode the 'Sex' column of each frame and append the indicator
# columns (created with prefix 'Sex') to that frame.
# NOTE(review): train and test are encoded independently, so their indicator
# columns only match if both frames contain the same set of 'Sex' values.
sex_dummies=pd.get_dummies(df_train.Sex, prefix='Sex')
df_train=pd.concat([df_train,sex_dummies],axis=1)

sex_dummies=pd.get_dummies(df_test.Sex, prefix='Sex')
df_test=pd.concat([df_test,sex_dummies],axis=1)


In [None]:
# One-hot encode the 'Embarked' column of each frame and append the indicator
# columns (created with prefix 'Embarked') to that frame.
# NOTE(review): train and test are encoded independently, so their indicator
# columns only match if both frames contain the same set of 'Embarked' values.
embarked_dummies=pd.get_dummies(df_train.Embarked, prefix='Embarked')
df_train=pd.concat([df_train,embarked_dummies],axis=1)

embarked_dummies=pd.get_dummies(df_test.Embarked, prefix='Embarked')
df_test=pd.concat([df_test,embarked_dummies],axis=1)

In [None]:
# EXTRACTING FIRST CHARACTER FROM 'Cabin' (TRAIN DATA)
# Cast to str before slicing so every row, including missing cabins, yields a
# character. Presumably a missing cabin becomes the string 'nan' and therefore
# 'n' (the mapping dictionary further down has an 'n' key) - TODO confirm.
New_cabin = df_train['Cabin'].astype(str).str[0]
# Wrapped in a DataFrame named `df`; the next cell concatenates it back onto
# df_train.
df = pd.DataFrame(New_cabin)




In [None]:
# DROPPING Cabin (TRAIN DATA)
# Remove the raw 'Cabin' column and attach the first-character version held in
# `df` (built in the previous cell) in its place.
# NOTE(review): `df` is reused by the test-data cell below, so this cell must
# run before `df` is overwritten there.
df_train.drop(labels=['Cabin'],axis=1,inplace=True)
df_train=pd.concat([df_train,df],axis=1)
df_train.head(5)



In [None]:
#EXTRACTING FIRST CHARACTER FROM 'Cabin' (TEST DATA)
# Same transformation as for the training data: take the first character of
# the stringified 'Cabin' value, drop the raw column, and attach the
# one-character version in its place.
New_cabin = df_test['Cabin'].astype(str).str[0]
df = pd.DataFrame(New_cabin)
df_test.drop(labels=['Cabin'],axis=1,inplace=True)
df_test=pd.concat([df_test,df],axis=1)
df_test.head(5)

In [None]:
# Apply the same first-character 'Cabin' transformation to the combined frame,
# then list the distinct values so the mapping dictionary in the next cell can
# cover each of them.
New_cabin = all_data['Cabin'].astype(str).str[0]
df = pd.DataFrame(New_cabin)
all_data.drop(labels=['Cabin'],axis=1,inplace=True)
all_data=pd.concat([all_data,df],axis=1)

all_data.Cabin.unique()

In [None]:
#CREATE DICTIONARY WITH UNIQUE VALUES FROM 'Cabin' (ALL_DATA) & MAP VALUES FROM 'Cabin' (TRAIN DATA)
# Hand-assigned integer code for each first-character 'Cabin' value.
Temp_dict = {'n' : 1, 'C' : 7, 'E': 5, 'G': 4, 'D': 6, 'A': 9, 'B': 8, 'F': 3 , 'T': 2}
# Store the code for each training row in a new 'NewCabin' column.
df_train['NewCabin'] = df_train.Cabin.map(Temp_dict)


In [None]:
df_test['NewCabin'] = df_test.Cabin.map(Temp_dict)


In [None]:
# DROPPING COLUMNS: CABIN, TICKET, FARE, SEX, NAME, EMBARKED, AGEBUCKET
# The same columns are removed, in place, from both frames.
columns_to_drop = ['Cabin', 'Ticket', 'Fare', 'Sex', 'Name', 'Embarked', 'AgeBucket']
for frame in (df_train, df_test):
    frame.drop(labels=columns_to_drop, axis=1, inplace=True)

In [None]:
# DIVIDING THE DATA INTO Y_TRAIN AND X_TRAIN AND CONVERTING THEM INTO NP ARRAYS
# y_train: the 'Survived' column; x_train: every other training column;
# x_test: all columns of the test frame.
y_train = df_train['Survived'].to_numpy()
x_train = df_train.drop(columns=['Survived']).to_numpy()
x_test = df_test.to_numpy()

# Confusion Matrix
from sklearn.metrics import confusion_matrix
dict_K = {}
dic = {}

#Kfold Validation
def get_acc(Xtrain,Ytrain,model):
    """Return the per-fold accuracy of `model` under 4-fold cross-validation.

    Parameters
    ----------
    Xtrain : 2-D numpy array of features, indexed as Xtrain[rows, :].
    Ytrain : 1-D numpy array of labels, indexed as Ytrain[rows].
    model : estimator exposing fit(X, y) and predict(X). It is refit in place
        on the training part of every fold.

    Returns
    -------
    list of float
        One accuracy (fraction of correct predictions) per fold, in fold order.

    Accuracy is computed directly from the predictions. The previous
    confusion-matrix arithmetic indexed cm[1, 1], which fails when a fold
    contains a single class, and added 1e-5 to the denominator, which biased
    every score slightly downwards.
    """
    from sklearn.model_selection import KFold

    fold_accuracies = []
    splitter = KFold(n_splits=4)
    # Fold-local names are deliberately different from the notebook-level
    # x_train / y_train / x_test so the globals are not shadowed.
    for train_idx, test_idx in splitter.split(Xtrain, y=Ytrain):
        model.fit(Xtrain[train_idx, :], Ytrain[train_idx])
        fold_pred = model.predict(Xtrain[test_idx, :])
        fold_accuracies.append(float(np.mean(fold_pred == Ytrain[test_idx])))
    return fold_accuracies

from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the forest, and therefore the cross-validation
# scores and the submitted predictions, reproducible across kernel restarts.
classifier = RandomForestClassifier(n_estimators=100,criterion='entropy',random_state=42)
dict_K['Random_forest'] = get_acc(x_train,y_train,classifier)
# Refit on the full training set before predicting the test data.
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

In [None]:
df_F = pd.DataFrame(dict_K)

In [None]:
# MEAN CROSS-VALIDATION ACCURACY PER MODEL
# df_F holds the per-fold accuracies returned by get_acc (one column per
# model), so the column mean is the average accuracy over the folds.
df_F.mean()

In [None]:
# SUBMIT RESULTS
# Pair PassengerId and the predictions by position. Concatenating along
# axis=1 pairs them by index label instead, which would silently misalign the
# rows (and introduce NaNs) if df_test did not carry a default 0..n-1 index.
p = pd.DataFrame({
    'PassengerId': df_test['PassengerId'].to_numpy(),
    'Survived': y_pred.astype(np.int64),
})
p.to_csv('Surv_predF.csv',index=False)