In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

### Load Data, Combine train & test set

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
train.shape

(891, 12)

In [6]:
test.shape

(418, 11)

In [3]:
all_data = pd.concat([train, test],axis=0)

In [7]:
all_data.shape

(1309, 12)

### Data Structure

In [15]:
train.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [16]:
test.columns.values

array(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

**'Survived' is the label**

In [18]:
all_data.to_csv('check1.csv',index=False)

In [19]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


**Try to fix some missing values in 'Age', 'Fare' and 'Embarked'. <br>Drop 'Cabin' due to high missing values.**

In [20]:
all_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


### Data Analysis

In [29]:
pd.crosstab(train.Sex, train.Survived, normalize='index')

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.257962,0.742038
male,0.811092,0.188908


**Most women survived, sadly most men died, only one third of the survivals are men.**

In [28]:
pd.crosstab(train.Pclass, train.Survived, normalize='index')

Survived,0,1
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.37037,0.62963
2,0.527174,0.472826
3,0.757637,0.242363


**Class 1 has better survival rate**

In [39]:
group_fare = train.groupby('Pclass')
group_fare['Fare'].sum() / group_fare['Fare'].count()

Pclass
1    84.154687
2    20.662183
3    13.675550
Name: Fare, dtype: float64

**Class 1 is much more expensive than Class 2 and Class 3**

In [114]:
train['Pclass_Sex'] =  train['Pclass'].astype(str) +'_' + train['Sex'].astype(str)

In [115]:
train['Pclass_Sex'].value_counts()

3_male      347
3_female    144
1_male      122
2_male      108
1_female     94
2_female     76
Name: Pclass_Sex, dtype: int64

In [116]:
pd.crosstab(train.Pclass_Sex, train.Survived, normalize='index')

Survived,0,1
Pclass_Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
1_female,0.031915,0.968085
1_male,0.631148,0.368852
2_female,0.078947,0.921053
2_male,0.842593,0.157407
3_female,0.5,0.5
3_male,0.864553,0.135447


**97% women in Class 1 survived, 92% women in Class 2 survived, this can be a strong indicator**

In [120]:
train['Family_Size'] = train['SibSp'] + train['Parch'] + 1
pd.crosstab(train.Family_Size, train.Survived, normalize='index')

Survived,0,1
Family_Size,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.696462,0.303538
2,0.447205,0.552795
3,0.421569,0.578431
4,0.275862,0.724138
5,0.8,0.2
6,0.863636,0.136364
7,0.666667,0.333333
8,1.0,0.0
11,1.0,0.0


**Family Size of 4 has the highest survival rate.**

### Feature Engineering

##### Extract 'Title' from 'Name' & encode

In [58]:
all_data['Title'] = train.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())
all_data['Title'].value_counts()

Mr              745
Miss            283
Mrs             183
Master           63
Dr               10
Rev               9
Col               2
Don               2
Mme               2
Major             2
Mlle              2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Jonkheer          1
Name: Title, dtype: int64

In [72]:
 other_titles = [title for title in train["Title"]
                if title not in ["Mr", "Miss", "Mrs", "Master"]]

In [77]:
all_data['Title'] = all_data['Title'].replace(other_titles, 'Other')
all_data['Title'].value_counts()

Mr        745
Miss      283
Mrs       183
Master     63
Other      35
Name: Title, dtype: int64

In [78]:
all_data['en_Title'] = all_data['Title'].map({"Mr":0, "Miss":1, "Mrs" : 2 , "Master":3, "Other":4})

In [89]:
all_data['en_Title'].value_counts()

0    745
1    283
2    183
3     63
4     35
Name: en_Title, dtype: int64

##### Encode 'Sex'

In [94]:
all_data['Sex'].value_counts()

male      843
female    466
Name: Sex, dtype: int64

In [91]:
all_data['en_Sex'] = all_data['Sex'].map({"female":0, "male":1})

In [93]:
all_data['en_Sex'].value_counts()

1    843
0    466
Name: en_Sex, dtype: int64

##### 'Pclass'+'Sex': create column & encode

In [169]:
all_data['Pclass_Sex'] =  all_data['Pclass'].astype(str) + all_data['en_Sex'].astype(str)
all_data['Pclass_Sex'] = all_data['Pclass_Sex'].astype(int)
all_data['Pclass_Sex'].value_counts()

31    493
30    216
11    179
21    171
10    144
20    106
Name: Pclass_Sex, dtype: int64

##### 'Age': Fill missing values, binning, encode

In [173]:
all_data['Age'].describe()

count    1046.000000
mean       29.881138
std        14.413493
min         0.170000
25%        21.000000
50%        28.000000
75%        39.000000
max        80.000000
Name: Age, dtype: float64

In [175]:
# fill missing age by ticket class median
all_data['Age'] = all_data['Age'].fillna(train['Age'].median())
all_data['Age'].describe()

# What to do with missing age?? more research

count    1309.000000
mean       29.503186
std        12.905241
min         0.170000
25%        22.000000
50%        28.000000
75%        35.000000
max        80.000000
Name: Age, dtype: float64

In [177]:
all_data['Age'].isna().sum()

0

In [182]:
# binning age into age group
all_data["en_Age"] =  pd.cut(all_data["Age"], bins=[0,5,12,21,65,100], labels=[0,1,2,3,4]).astype("int64")
all_data["en_Age"].value_counts()

3    1009
2     196
0      56
1      38
4      10
Name: en_Age, dtype: int64

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PassengerId        1309 non-null   int64  
 1   Survived           891 non-null    float64
 2   Pclass             1309 non-null   int64  
 3   Name               1309 non-null   object 
 4   Sex                1309 non-null   object 
 5   Age                1046 non-null   float64
 6   SibSp              1309 non-null   int64  
 7   Parch              1309 non-null   int64  
 8   Ticket             1309 non-null   object 
 9   Fare               1309 non-null   int32  
 10  Cabin              295 non-null    object 
 11  Embarked           1309 non-null   object 
 12  Title              1309 non-null   object 
 13  en_Title           1309 non-null   int64  
 14  en_Sex             1309 non-null   int64  
 15  Pclass_Sex         1309 non-null   int32  
 16  Family_Size        1309 n

##### 'Family Size' = 'SibSp' + 'Parch' + 1

In [118]:
all_data['Family_Size'] = all_data['SibSp'] + all_data['Parch'] + 1
all_data['Family_Size'].value_counts()

1     790
2     235
3     159
4      43
6      25
5      22
7      16
11     11
8       8
Name: Family_Size, dtype: int64

##### 'Fare': Fill missing values, rounding?

In [156]:
all_data['Fare'].isna().sum()

1

In [159]:
all_data['Fare'] = all_data['Fare'].fillna(all_data['Fare'].median())

In [160]:
all_data['Fare'].isna().sum()

0

In [163]:
all_data['Fare'] = round(all_data['Fare'].astype(int))

In [164]:
all_data['Fare'].head()

0     7
1    71
2     8
3    53
4     8
Name: Fare, dtype: int32

##### 'Ticket': categorize into numeric or non-numeric, encode

In [122]:
all_data['Ticket_is_numeric'] = all_data.Ticket.apply(lambda x: 1 if x.isnumeric() else 0)
all_data['Ticket_is_numeric'].value_counts()

1    957
0    352
Name: Ticket_is_numeric, dtype: int64

##### 'Embarked': Fill missing values, encode

In [132]:
all_data['Embarked'].value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [144]:
# the most frequent embarked station
all_data['Embarked'].mode()

0    S
dtype: object

In [145]:
all_data['Embarked'] = all_data['Embarked'].fillna('S')

In [147]:
all_data['Embarked'].value_counts()

S    916
C    270
Q    123
Name: Embarked, dtype: int64

In [150]:
all_data['en_Embarked'] = all_data['Embarked'].map({"S":1, "C":2, "Q":3})
all_data['en_Embarked'].value_counts()

1    916
2    270
3    123
Name: en_Embarked, dtype: int64

##### Feature Selections

In [183]:
all_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'en_Title',
       'en_Sex', 'Pclass_Sex', 'Family_Size', 'Ticket_is_numeric',
       'en_Embarked', 'age_group', 'en_age', 'en_Age'],
      dtype='object')

In [185]:
final_data = all_data[['Survived', 'Fare', 'en_Title', 'en_Sex', 'Pclass_Sex', 'Family_Size', 'Ticket_is_numeric', 'en_Embarked', 'age_group', 'en_age', 'en_Age']]
final_data.head()

Unnamed: 0,Survived,Fare,en_Title,en_Sex,Pclass_Sex,Family_Size,Ticket_is_numeric,en_Embarked,age_group,en_age,en_Age
0,0.0,7,0,1,31,2,0,1,3,3,3
1,1.0,71,2,0,10,2,0,2,3,3,3
2,1.0,8,1,0,30,1,0,1,3,3,3
3,1.0,53,2,0,10,2,1,1,3,3,3
4,0.0,8,0,1,31,1,1,1,3,3,3


### Preprocessing before modelling

In [187]:
df_train=final_data.iloc[:891,:]
df_test=final_data.iloc[891:,:]

In [193]:
df_train.shape

(891, 11)

In [188]:
df_train.head()

Unnamed: 0,Survived,Fare,en_Title,en_Sex,Pclass_Sex,Family_Size,Ticket_is_numeric,en_Embarked,age_group,en_age,en_Age
0,0.0,7,0,1,31,2,0,1,3,3,3
1,1.0,71,2,0,10,2,0,2,3,3,3
2,1.0,8,1,0,30,1,0,1,3,3,3
3,1.0,53,2,0,10,2,1,1,3,3,3
4,0.0,8,0,1,31,1,1,1,3,3,3


In [189]:
df_train.tail()

Unnamed: 0,Survived,Fare,en_Title,en_Sex,Pclass_Sex,Family_Size,Ticket_is_numeric,en_Embarked,age_group,en_age,en_Age
886,0.0,13,4,1,21,1,1,1,3,3,3
887,1.0,30,1,0,10,1,1,1,2,2,2
888,0.0,23,1,0,30,4,0,1,3,3,3
889,1.0,30,0,1,11,1,1,2,3,3,3
890,0.0,8,0,1,31,1,1,3,3,3,3


In [190]:
df_test.head()

Unnamed: 0,Survived,Fare,en_Title,en_Sex,Pclass_Sex,Family_Size,Ticket_is_numeric,en_Embarked,age_group,en_age,en_Age
0,,8,0,1,31,1,1,3,3,3,3
1,,7,2,0,30,2,1,1,3,3,3
2,,10,1,1,21,1,1,3,3,3,3
3,,9,2,1,31,1,1,1,3,3,3
4,,12,0,0,30,3,1,1,3,3,3


In [191]:
df_test.drop(['Survived'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [192]:
df_test.shape

(418, 10)

### Modelling & Evaluation

In [211]:
# KNN Neighbors
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, y_train) * 100, 2)
acc_knn

86.84

### Select the best performance Models & submission

In [212]:
y_pred = knn.predict(df_test)

In [213]:
## Update Sample Submission file with result in y_pred
pred = pd.DataFrame(y_pred)
sub_df = pd.read_csv('gender_submission.csv')
datasets = pd.concat([sub_df['PassengerId'],pred],axis=1)
datasets.columns = ['PassengerId','Survived']
datasets.to_csv('gender_submission.csv',index=False)