In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.svm import SVC, LinearSVC

# Data Preprocessing
### I'm going to do preprocessing on the train and test data at the same time while only showing the train data, so it might look a bit confusing. Any scalers and the like are fit to train but also run on test.

In [2]:
titanic = pd.read_csv('titanic-train.csv', sep=',')
titanictest = pd.read_csv('titanic-test.csv', sep=',')
titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Dropping columns that don't matter.
titanic = titanic.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
titanictest = titanictest.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
titanic.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [4]:
nage = titanic.Age.isnull()
titanic[nage]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
5,0,3,male,,0,0,8.4583,Q
17,1,2,male,,0,0,13.0000,S
19,1,3,female,,0,0,7.2250,C
26,0,3,male,,0,0,7.2250,C
28,1,3,female,,0,0,7.8792,Q
...,...,...,...,...,...,...,...,...
859,0,3,male,,0,0,7.2292,C
863,0,3,female,,8,2,69.5500,S
868,0,3,male,,0,0,9.5000,S
878,0,3,male,,0,0,7.8958,S


In [5]:
print(titanictest.isnull().sum())
print(titanictest[titanictest['Fare'].isnull()])

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64
     Pclass   Sex   Age  SibSp  Parch  Fare Embarked
152       3  male  60.5      0      0   NaN        S


In [6]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [7]:
#Changing Pclass to categorical/object instead of int
#titanic['Pclass'] = titanic.Pclass.astype('object')
#titanictest['Pclass'] = titanictest.Pclass.astype('object')

In [8]:
"""I'm not sure whether I should be interpolating the test data on its own, or using the train data to help
define the age of the test data. I would think it's the latter, similar to scalers/transforming data."""

titanic['Age'] = titanic['Age'].interpolate()
titanictest['Age'] = titanic['Age'].interpolate()

titanictest['Fare'] = titanic['Fare'].interpolate()

In [9]:
col = list(titanic.columns)
catv = []
numv = []
numvcol = []

for i in col:
    if titanic.dtypes[i] == object:
        catv.append(i)
    elif str(titanic.dtypes[i]) in ['int64','float64']:
            numv.append(i)
            numvcol.append(i)
            
print(f'The categorical variables are:\t{catv}\n')
print(f'The numeric variables are:\t{numv}')

for i in catv:
    print(f'{i} has {titanic[i].nunique()} unique values.')

The categorical variables are:	['Sex', 'Embarked']

The numeric variables are:	['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Sex has 2 unique values.
Embarked has 3 unique values.


In [10]:
titanic = pd.get_dummies(titanic, columns=catv, prefix_sep='_', drop_first=True)
titanictest = pd.get_dummies(titanictest, columns=catv, prefix_sep='_', drop_first=True)

In [11]:
X = titanic.iloc[:,1:]
y = titanic.iloc[:,0]

X2 = titanictest

#X_train, X_hold, y_train, y_hold = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
#X_valid, X_test, y_valid, y_test = train_test_split(X_hold, y_hold, test_size=0.5, random_state=1, stratify=y_hold)

scaler = StandardScaler()
scaler.fit(X)
x_scaled = scaler.transform(X)
x2_scaled = scaler.transform(X2)

In [12]:
print(X.shape)
print(X2.shape)

(891, 8)
(418, 8)


# Beginning PCA

In [13]:
pca = decomposition.PCA(n_components=(len(X.columns)))
#pca = decomposition.PCA(n_components=(3))
pca.fit(x_scaled)
x_pca = pca.transform(x_scaled)

x2_pca = pca.transform(x2_scaled)

In [14]:
x_pca.shape

(891, 8)

In [15]:
print(X.shape)
print(X2.shape)

(891, 8)
(418, 8)


In [16]:
#labels=['PC' + str(i) for i in range(0,len(X.columns))]
labels=['PC' + str(i) for i in range(0,(x_pca.shape[1]))]

titanic_comp = pd.DataFrame(pca.components_,columns=X.columns,index=list(labels))
componentsT = titanic_comp.sort_values(by =labels, axis=1,ascending=False).round(decimals=6).T
components = componentsT.reindex(componentsT.PC0.abs().sort_values(ascending=False).index)
components

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7
Fare,0.594347,-0.195529,0.033515,0.246694,-0.264802,0.032212,0.386715,0.570818
Pclass,-0.482067,0.440163,0.159971,-0.023193,0.148401,0.088437,-0.003609,0.719641
Parch,0.398464,0.43714,0.090538,0.046246,0.309353,0.717104,-0.024035,-0.171128
Sex_male,-0.348518,-0.086026,-0.290527,0.835653,-0.113282,0.255522,0.028902,-0.097227
SibSp,0.299538,0.480136,0.098264,0.429819,0.239449,-0.626132,-0.174489,-0.074318
Embarked_Q,-0.198908,-0.096689,0.667746,0.128892,0.193874,-0.045525,0.621774,-0.249652
Age,0.051901,-0.506253,-0.123213,0.069963,0.814994,-0.015204,-0.116311,0.207277
Embarked_S,-0.046306,0.270856,-0.640282,-0.178598,0.200466,-0.131405,0.646875,-0.082058


In [17]:
sum(pca.explained_variance_ratio_)

1.0

In [18]:
cumVar = 0
for i in range(0,len(labels)):
    var = pca.explained_variance_ratio_[i]
    cumVar += var
    print(f'PC{i:02} explains {100*var:.2f}% for a cumulative total of {100*cumVar:.2f}%')

PC00 explains 22.97% for a cumulative total of 22.97%
PC01 explains 21.00% for a cumulative total of 43.97%
PC02 explains 19.21% for a cumulative total of 63.18%
PC03 explains 10.46% for a cumulative total of 73.64%
PC04 explains 9.71% for a cumulative total of 83.35%
PC05 explains 6.99% for a cumulative total of 90.35%
PC06 explains 5.16% for a cumulative total of 95.51%
PC07 explains 4.49% for a cumulative total of 100.00%


In [19]:
x_pca = pd.DataFrame(x_pca)
x2_pca = pd.DataFrame(x2_pca)
x_pca

Unnamed: 0,0,1,2,3,4,5,6,7
0,-1.010042,0.878683,-0.632552,0.448283,-0.259291,-0.424099,0.033953,0.196489
1,1.802817,-1.434921,0.924438,-0.447838,-0.228118,-0.853065,-1.098385,-0.164483
2,-0.529264,0.474713,-0.148575,-1.667461,-0.008399,-0.394800,0.103545,0.534872
3,1.470564,-0.648932,-0.491630,-0.952252,0.140707,-1.155136,0.230275,-0.601541
4,-1.223691,-0.033772,-0.836451,0.127724,0.281706,0.130308,0.089679,0.467047
...,...,...,...,...,...,...,...,...
886,-0.617427,-0.288538,-0.953614,0.139785,-0.391530,0.036436,0.199510,-0.456628
887,0.862579,-0.410669,-0.454493,-1.537547,-0.891882,-0.584497,0.342666,-1.038288
888,0.904462,2.062008,0.206867,-1.103168,0.688794,0.831271,0.035737,0.168817
889,0.262634,-1.450888,0.305682,0.645936,-1.166267,0.236280,-1.100560,-0.954067


In [20]:
# There has to be a better way at getting the PCA into a DF. Maybe a function? Gonna think over ways to standardize and make this better.
# This is good enough for now.
def PCADF(dataframe,y_value,PCsWanted):
    headers = ['PC' + str(x) for x in range(PCsWanted)]
    dataframe.columns = ['PC' + str(x) for x in range(len(dataframe.columns))]
    data = [y_value, dataframe[headers]]
    df_pca = pd.concat(data, axis=1)
    return df_pca

In [21]:
df_pc1 = PCADF(x_pca,y,1)
df_pc2 = PCADF(x_pca,y,2)
df_pc3 = PCADF(x_pca,y,3)
df_pc5 = PCADF(x_pca,y,5)

dftest_pc1 = x2_pca.iloc[:,:1]
dftest_pc2 = x2_pca.iloc[:,:2]
dftest_pc3 = x2_pca.iloc[:,:3]
dftest_pc5 = x2_pca.iloc[:,:5]

In [22]:
X_train = df_pc5.iloc[:,1:]
y_train = df_pc5.iloc[:,0]

X_test = dftest_pc5

print(X_train.head(5))
print(X_test.head(5))

        PC0       PC1       PC2       PC3       PC4
0 -1.010042  0.878683 -0.632552  0.448283 -0.259291
1  1.802817 -1.434921  0.924438 -0.447838 -0.228118
2 -0.529264  0.474713 -0.148575 -1.667461 -0.008399
3  1.470564 -0.648932 -0.491630 -0.952252  0.140707
4 -1.223691 -0.033772 -0.836451  0.127724  0.281706
          0         1         2         3         4
0 -1.886277 -0.506168  3.085154  0.916001 -0.234411
1  0.545550  0.223698 -0.123074 -0.902339  0.574923
2 -1.286349 -1.181328  2.858694  0.967251 -0.180986
3 -0.684578 -0.211130 -0.806051  0.351492  0.041513
4  0.272247  1.124572 -0.026756 -1.174126  1.120097


In [23]:
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print(f'Support Vector Classifier Training Accuracy: {svc.score(X_train, y_train):.2f}')

Support Vector Classifier Training Accuracy: 0.84


In [24]:
titanicresults = pd.read_csv('titanic-test.csv', sep=',', index_col='PassengerId')
titanicresults.insert(0,'Survived_Pred', y_pred)
titanicresults.to_csv('DicksonTitanicPredictions.csv')
titanicresults

Unnamed: 0_level_0,Survived_Pred,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,0,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S
