In [1]:
#import libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

In [2]:
#import csv files
df_train = pd.read_csv(r"C:\Users\Galvin\Desktop\Kaggle\Titanic\train.csv")
df_val = pd.read_csv(r"C:\Users\Galvin\Desktop\Kaggle\Titanic\test.csv")

In [3]:
#check dimension of dataframes
df_train.shape, df_val.shape

((891, 12), (418, 11))

In [4]:
#check data types
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
#check completeness of dataframe
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
#check completeness of dataframe
df_val.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
#fill blank cells with mean values
df_train['Age'].fillna(df_train['Age'].median(), inplace=True)
df_val['Age'].fillna(df_val['Age'].median(), inplace=True)

df_val['Fare'].fillna(df_val['Fare'].mean(), inplace=True)

df_train['Embarked'].fillna('S', inplace=True)

In [8]:
#drop irrelevant columns
df_train.drop(columns=['Cabin'], inplace=True)
df_val.drop(columns=['Cabin'], inplace=True)
df_train.drop(columns=['PassengerId','Name','Ticket'], inplace=True)
df_val.drop(columns=['Name','Ticket'], inplace=True)

In [9]:
#map categorical features into integers
df_train['Sex'] = df_train['Sex'].map({'male':1, 'female':0}).astype(int)
df_train['Embarked'] = df_train['Embarked'].map({'S':2, 'Q':1, 'C':0}).astype(int)
df_val['Sex'] = df_val['Sex'].map({'male':1, 'female':0}).astype(int)
df_val['Embarked'] = df_val['Embarked'].map({'S':2, 'Q':1, 'C':0}).astype(int)

In [10]:
#feature engineering
df_train.insert(6, 'IsAlone', 0)
df_val.insert(6, 'IsAlone', 0)

for index in range(len(df_train)):
    if df_train['SibSp'][index] + df_train['Parch'][index] == 0:
        df_train['IsAlone'][index] = 1
    else:
        df_train['IsAlone'][index] = 0

for index in range(len(df_val)):
    if df_val['SibSp'][index] + df_val['Parch'][index] == 0:
        df_val['IsAlone'][index] = 1
    else:
        df_val['IsAlone'][index] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [11]:
#feature scaling
scaler = StandardScaler()

temp = scaler.fit_transform(df_train[['Age', 'Fare']])
temp1 = scaler.fit_transform(df_val[['Age', 'Fare']])

df_train[['Age', 'Fare']] = temp
df_val[['Age', 'Fare']] = temp1

In [12]:
#principle component analysis
pca = PCA(n_components=1)

pCom = pca.fit_transform(df_train[['Age', 'Fare']])
pCom1 = pca.fit_transform(df_val[['Age', 'Fare']])

df_train['pCom'] = pCom
df_val['pCom'] = pCom1

In [13]:
#split datasets
X = df_train.drop(columns=['Survived', 'Age', 'Fare'])
Y = df_train['Survived']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [14]:
#train linear discriminant analysis model
lda = LinearDiscriminantAnalysis()
model = lda.fit(x_train, y_train)

In [15]:
#evaluate model accuracy
y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred, normalize=True)
acc

0.7877094972067039

In [16]:
#make predictions for validation data
prediction = model.predict(df_val.drop(columns=['PassengerId', 'Age', 'Fare']))

In [17]:
#create prediction dataframe
submission = pd.DataFrame({'PassengerId': df_val['PassengerId'], 'Survived': prediction})
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [18]:
#export dataframe as csv file
submission.to_csv(r"C:\Users\Galvin\Desktop\Kaggle\Titanic\submission_new.csv", index=False)