In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from sklearn.metrics import jaccard_similarity_score
import pickle

In [2]:
# Read the passenger test set from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer="test.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
# Cabin is mostly null, so exclude it and keep every other column in its
# original order. errors='ignore' keeps this a no-op when Cabin is absent.
df = df.drop(columns=['Cabin'], errors='ignore')
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


In [4]:
# Show (rows, columns) of the frame as a quick sanity check after the Cabin column was removed.
df.shape

(418, 10)

In [7]:
# PassengerId is only a row identifier, so it is removed from the feature frame.
# (Only PassengerId is dropped in this cell.)
# Rebinding `df` with errors='ignore' instead of dropping in place makes the
# cell idempotent: running it a second time is a no-op rather than a KeyError.
df = df.drop(columns=['PassengerId'], errors='ignore')
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


In [8]:
# The raw Name string is not used as a feature, so the column is removed.
# Rebinding `df` with errors='ignore' instead of dropping in place makes the
# cell idempotent: running it a second time is a no-op rather than a KeyError.
df = df.drop(columns=['Name'], errors='ignore')
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,3,male,34.5,0,0,330911,7.8292,Q
1,3,female,47.0,1,0,363272,7.0,S
2,2,male,62.0,0,0,240276,9.6875,Q
3,3,male,27.0,0,0,315154,8.6625,S
4,3,female,22.0,1,1,3101298,12.2875,S


In [9]:
# Count the missing values in each remaining column.
df.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Ticket       0
Fare         1
Embarked     0
dtype: int64

In [12]:
# Total of the recorded ages; missing entries are skipped (pandas default).
df["Age"].sum(skipna=True)

10050.5

In [13]:
# Pooled mean age over both data splits, used to impute the missing ages.
# Test split:  332 non-null ages summing to 10050.5.
# Train split: 714 non-null ages summing to 21205.17 (figures recorded by
# hand; the training data is not loaded in the visible cells).
TRAIN_AGE_SUM, TRAIN_AGE_COUNT = 21205.17, 714
TEST_AGE_SUM, TEST_AGE_COUNT = 10050.5, 332

combined_age_sum = TRAIN_AGE_SUM + TEST_AGE_SUM
combined_age_count = TRAIN_AGE_COUNT + TEST_AGE_COUNT
mean = combined_age_sum / combined_age_count
mean

29.881137667304014

In [14]:
# Fill every missing Age with the pooled mean computed above (about 29.8811).
df["Age"] = df["Age"].fillna(mean)

In [15]:
# Verify that no missing ages remain after imputation.
df["Age"].isna().sum()

0

In [16]:
# Total of the recorded fares; missing entries are skipped (pandas default).
df["Fare"].sum(skipna=True)

14856.5376

In [17]:
# Pooled mean fare, used to fill the single missing Fare value.
# 14856.5376 is the sum of the 417 non-null fares in this frame.
# 28693.9493 over 891 rows is presumably the training-split total -- TODO confirm.
OTHER_FARE_SUM, OTHER_FARE_COUNT = 28693.9493, 891
TEST_FARE_SUM, TEST_FARE_COUNT = 14856.5376, 417

combined_fare_sum = OTHER_FARE_SUM + TEST_FARE_SUM
combined_fare_count = OTHER_FARE_COUNT + TEST_FARE_COUNT
mean_fare = combined_fare_sum / combined_fare_count
mean_fare

33.29547928134557

In [18]:
# Fill the missing Fare entry with the pooled mean fare.
df["Fare"] = df["Fare"].fillna(mean_fare)

In [19]:
# Re-count missing values per column; every count should now be zero.
df.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64

In [20]:
# Ticket has a very large number of distinct values (681 were counted), so it
# is treated as uninformative and removed.
# Rebinding `df` with errors='ignore' instead of dropping in place makes the
# cell idempotent: running it a second time is a no-op rather than a KeyError.
df = df.drop(columns=['Ticket'], errors='ignore')
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [22]:
# Persist the cleaned frame to disk, leaving the row index out of the file.
df.to_csv(path_or_buf='mod_test.csv', index=False)

In [24]:
# Replace `df` with the contents of category_mod.csv and preview it.
# NOTE(review): no visible cell writes category_mod.csv (only mod_test.csv is
# written here); presumably an encoding step outside this notebook produces
# it -- confirm and document where it comes from.
df = pd.read_csv(filepath_or_buffer='category_mod.csv')
df.head(n=5)

Unnamed: 0,Age,Fare,1,2,0,1.1,2.1,3,4,5,female,0.1,1.2,2.2,3.1,4.1,5.1,6,C,Q
0,34.5,7.8292,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
1,47.0,7.0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0
2,62.0,9.6875,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
3,27.0,8.6625,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,22.0,12.2875,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0


In [27]:
# Load the object pickled in "feature_scaler.sav" into `sc`.
# The context manager closes the file handle as soon as unpickling finishes,
# rather than leaving it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code during deserialisation;
# only load this file if it comes from a trusted source.
with open("feature_scaler.sav", 'rb') as scaler_file:
    sc = pickle.load(scaler_file)

In [28]:
# Run the unpickled object's transform() over the feature frame and keep the result.
# NOTE(review): presumably `sc` is a fitted scaler whose training columns match
# the columns (and their order) of `df` -- confirm before trusting the output.
xtest = sc.transform(df)
# Echo the transformed values as the cell output.
xtest

array([[ 3.58655461e-01, -4.92395906e-01, -5.72351471e-01, ...,
        -3.91180506e-02, -5.09769806e-01,  3.10520151e+00],
       [ 1.32928263e+00, -5.08428664e-01, -5.72351471e-01, ...,
        -3.91180506e-02, -5.09769806e-01, -3.22040292e-01],
       [ 2.49403523e+00, -4.56465282e-01, -5.72351471e-01, ...,
        -3.91180506e-02, -5.09769806e-01,  3.10520151e+00],
       ...,
       [ 6.69256155e-01, -5.03594861e-01, -5.72351471e-01, ...,
        -3.91180506e-02, -5.09769806e-01, -3.22040292e-01],
       [-2.75868833e-16, -4.88126692e-01, -5.72351471e-01, ...,
        -3.91180506e-02, -5.09769806e-01, -3.22040292e-01],
       [-2.75868833e-16, -2.11472679e-01, -5.72351471e-01, ...,
        -3.91180506e-02,  1.96166973e+00, -3.22040292e-01]])