# **0. Acquiring data**

In [1]:
! pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
! mkdir ~/.kaggle

In [3]:
! cp kaggle.json ~/.kaggle/

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
! kaggle competitions download -c titanic

Downloading titanic.zip to /content
  0% 0.00/34.1k [00:00<?, ?B/s]
100% 34.1k/34.1k [00:00<00:00, 27.4MB/s]


In [6]:
!ls

kaggle.json  sample_data  titanic.zip


In [7]:
! unzip titanic.zip

Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [8]:
!ls

gender_submission.csv  sample_data  titanic.zip
kaggle.json	       test.csv     train.csv


# **1. Preprocessing**

### Preliminarities

In [9]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline 

In [10]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [11]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
train_df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [13]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [14]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [15]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [16]:
# also categorical features
train_df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


### Missing values

In [17]:
def missing_percentage(df):
    """This function takes a DataFrame(df) as input and returns two columns, total missing values and total missing values percentage"""
    total = df.isnull().sum().sort_values(ascending = False)
    percent = round(df.isnull().sum().sort_values(ascending = False)/len(df)*100, 2)
    return pd.concat([total, percent], axis=1, keys=['Total','Percent'])

In [18]:
missing_percentage(train_df)

Unnamed: 0,Total,Percent
Cabin,687,77.1
Age,177,19.87
Embarked,2,0.22
PassengerId,0,0.0
Survived,0,0.0
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
SibSp,0,0.0
Parch,0,0.0


In [19]:
missing_percentage(test_df)

Unnamed: 0,Total,Percent
Cabin,327,78.23
Age,86,20.57
Fare,1,0.24
PassengerId,0,0.0
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
SibSp,0,0.0
Parch,0,0.0
Ticket,0,0.0


### Missing Embarked values

In [20]:
train_df[train_df.Embarked.isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [21]:
train_df[(train_df.Sex == "female") & (train_df.Pclass == 1)].groupby(['Embarked'])['Embarked'].count()

Embarked
C    43
Q     1
S    48
Name: Embarked, dtype: int64

In [22]:
train_df[train_df.Ticket == 113572]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [23]:
train_df[(train_df.Sex == "female") & (train_df.Pclass == 1)].groupby(['Embarked'])['Fare'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C,43.0,115.640309,88.571645,27.7208,69.3,83.1583,134.5,512.3292
Q,1.0,90.0,,90.0,90.0,90.0,90.0,90.0
S,48.0,99.02691,61.094407,25.9292,53.1,79.65,139.612475,263.0


In [24]:
## Replacing the null values in the Embarked column with s. 
train_df.Embarked.fillna("S", inplace=True)

### Missing Cabin values



In [25]:
pd.DataFrame(round(train_df.loc[:,"Cabin"].value_counts(dropna=False, normalize=True)*100,2))

Unnamed: 0,Cabin
,77.10
C23 C25 C27,0.45
G6,0.45
B96 B98,0.45
C22 C26,0.34
...,...
E34,0.11
C7,0.11
C54,0.11
E36,0.11


In [26]:
pd.DataFrame(round(test_df.loc[:,"Cabin"].value_counts(dropna=False, normalize=True)*100,2))

Unnamed: 0,Cabin
,78.23
B57 B59 B63 B66,0.72
C89,0.48
C116,0.48
C80,0.48
...,...
E45,0.24
E52,0.24
B58 B60,0.24
C62 C64,0.24


**Simplification**

*   Cabin column is messy, most values missing
*   We are dropping this feature



In [27]:
train_df.drop(["Cabin"],axis=1, inplace=True)
test_df.drop(["Cabin"],axis=1, inplace=True)

### Missing Fare values

In [28]:
test_df[test_df.Fare.isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,S


In [29]:
missing_value = test_df[(test_df.Pclass == 3) & 
                     (test_df.Embarked == "S") & 
                     (test_df.Sex == "male")].Fare.mean()

test_df.Fare.fillna(missing_value, inplace=True)

In [30]:
test_df[test_df.Fare.isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked


### Almost done

In [31]:
survivers = train_df.Survived

train_df.drop(["Survived"],axis=1, inplace=True)

In [32]:
# saving PassengerId in advance since this feature will be dropped 
passengerid = test_df.PassengerId

In [33]:
train_df.drop(['PassengerId'], axis=1, inplace=True)
test_df.drop(['PassengerId'], axis=1, inplace=True)

In [34]:
train_df.drop(['Ticket'], axis=1, inplace=True)
test_df.drop(['Ticket'], axis=1, inplace=True)

In [35]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Name      891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 55.8+ KB


In [36]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      418 non-null    float64
 7   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 26.2+ KB


The age feature could be an important factor and should not be droppped. Since it a great amount of age feature is missing more advanced technique outght to be used.

### Feature engineering: Title extracted from Name feature

In [37]:
train_df["title"] = [i.split('.')[0] for i in train_df.Name]
train_df["title"] = [i.split(',')[1] for i in train_df.title]

In [38]:
print(train_df.title.unique())

[' Mr' ' Mrs' ' Miss' ' Master' ' Don' ' Rev' ' Dr' ' Mme' ' Ms' ' Major'
 ' Lady' ' Sir' ' Mlle' ' Col' ' Capt' ' the Countess' ' Jonkheer']


In [39]:
train_df.title = train_df.title.apply(lambda x: x.strip())

In [40]:
print(train_df.title.unique())

['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer']


In [41]:
test_df['title'] = [i.split('.')[0].split(',')[1].strip() for i in test_df.Name]

In [42]:
train_df.groupby(['title'])['title'].count()

title
Capt              1
Col               2
Don               1
Dr                7
Jonkheer          1
Lady              1
Major             2
Master           40
Miss            182
Mlle              2
Mme               1
Mr              517
Mrs             125
Ms                1
Rev               6
Sir               1
the Countess      1
Name: title, dtype: int64

In [43]:
test_df.groupby(['title'])['title'].count()

title
Col         2
Dona        1
Dr          1
Master     21
Miss       78
Mr        240
Mrs        72
Ms          1
Rev         2
Name: title, dtype: int64

In [44]:
train_df["title"] = [i.replace('Ms', 'Miss') for i in train_df.title]
train_df["title"] = [i.replace('Mlle', 'Miss') for i in train_df.title]
train_df["title"] = [i.replace('Mme', 'Mrs') for i in train_df.title]
train_df["title"] = [i.replace('Dr', 'rare') for i in train_df.title]
train_df["title"] = [i.replace('Col', 'rare') for i in train_df.title]
train_df["title"] = [i.replace('Major', 'rare') for i in train_df.title]
train_df["title"] = [i.replace('Don', 'rare') for i in train_df.title]
train_df["title"] = [i.replace('Jonkheer', 'rare') for i in train_df.title]
train_df["title"] = [i.replace('Sir', 'rare') for i in train_df.title]
train_df["title"] = [i.replace('Lady', 'rare') for i in train_df.title]
train_df["title"] = [i.replace('Capt', 'rare') for i in train_df.title]
train_df["title"] = [i.replace('the Countess', 'rare') for i in train_df.title]
train_df["title"] = [i.replace('Rev', 'rare') for i in train_df.title]

In [45]:
train_df.groupby(['title'])['title'].count()

title
Master     40
Miss      185
Mr        517
Mrs       126
rare       23
Name: title, dtype: int64

In [46]:
test_df["title"] = [i.replace('Ms', 'Miss') for i in test_df.title]
test_df["title"] = [i.replace('Mlle', 'Miss') for i in test_df.title]
test_df["title"] = [i.replace('Mme', 'Mrs') for i in test_df.title]
test_df["title"] = [i.replace('Dr', 'rare') for i in test_df.title]
test_df["title"] = [i.replace('Col', 'rare') for i in test_df.title]
test_df["title"] = [i.replace('Major', 'rare') for i in test_df.title]
test_df["title"] = [i.replace('Dona', 'rare') for i in test_df.title]
test_df["title"] = [i.replace('Don', 'rare') for i in test_df.title]
test_df["title"] = [i.replace('Jonkheer', 'rare') for i in test_df.title]
test_df["title"] = [i.replace('Sir', 'rare') for i in test_df.title]
test_df["title"] = [i.replace('Lady', 'rare') for i in test_df.title]
test_df["title"] = [i.replace('Capt', 'rare') for i in test_df.title]
test_df["title"] = [i.replace('the Countess', 'rare') for i in test_df.title]
test_df["title"] = [i.replace('Rev', 'rare') for i in test_df.title]

In [47]:
test_df.groupby(['title'])['title'].count()

title
Master     21
Miss       79
Mr        240
Mrs        72
rare        6
Name: title, dtype: int64

In [48]:
train_df.drop(['Name'], axis=1, inplace=True)
test_df.drop(['Name'], axis=1, inplace=True)

In [49]:
train_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,title
0,3,male,22.0,1,0,7.25,S,Mr
1,1,female,38.0,1,0,71.2833,C,Mrs
2,3,female,26.0,0,0,7.925,S,Miss
3,1,female,35.0,1,0,53.1,S,Mrs
4,3,male,35.0,0,0,8.05,S,Mr


In [50]:
test_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,title
0,3,male,34.5,0,0,7.8292,Q,Mr
1,3,female,47.0,1,0,7.0,S,Mrs
2,2,male,62.0,0,0,9.6875,Q,Mr
3,3,male,27.0,0,0,8.6625,S,Mr
4,3,female,22.0,1,1,12.2875,S,Mrs


### One-hot encoding

In [51]:
train_df = pd.get_dummies(train_df, columns=["Pclass", "Sex", "Embarked", "title"], drop_first=False)
test_df = pd.get_dummies(test_df, columns=["Pclass", "Sex", "Embarked", "title"], drop_first=False)

In [52]:
train_df.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,title_Master,title_Miss,title_Mr,title_Mrs,title_rare
0,22.0,1,0,7.25,0,0,1,0,1,0,0,1,0,0,1,0,0
1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0,0,0,0,1,0
2,26.0,0,0,7.925,0,0,1,1,0,0,0,1,0,1,0,0,0
3,35.0,1,0,53.1,1,0,0,1,0,0,0,1,0,0,0,1,0
4,35.0,0,0,8.05,0,0,1,0,1,0,0,1,0,0,1,0,0


In [53]:
test_df.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,title_Master,title_Miss,title_Mr,title_Mrs,title_rare
0,34.5,0,0,7.8292,0,0,1,0,1,0,1,0,0,0,1,0,0
1,47.0,1,0,7.0,0,0,1,1,0,0,0,1,0,0,0,1,0
2,62.0,0,0,9.6875,0,1,0,0,1,0,1,0,0,0,1,0,0
3,27.0,0,0,8.6625,0,0,1,0,1,0,0,1,0,0,1,0,0
4,22.0,1,1,12.2875,0,0,1,1,0,0,0,1,0,0,0,1,0


### Missing Age values

In [54]:
train_df.head().loc[:, "SibSp":]

Unnamed: 0,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,title_Master,title_Miss,title_Mr,title_Mrs,title_rare
0,1,0,7.25,0,0,1,0,1,0,0,1,0,0,1,0,0
1,1,0,71.2833,1,0,0,1,0,1,0,0,0,0,0,1,0
2,0,0,7.925,0,0,1,1,0,0,0,1,0,1,0,0,0
3,1,0,53.1,1,0,0,1,0,0,0,1,0,0,0,1,0
4,0,0,8.05,0,0,1,0,1,0,0,1,0,0,1,0,0


In [55]:
from sklearn.ensemble import RandomForestRegressor

def completing_age(df):
    temp_train = train_df.loc[train_df.Age.notnull()] # train_df with age values, not df with age values
    temp_test = df.loc[df.Age.isnull()]               # df with missing age values
    
    y = temp_train.Age#.values
    x = temp_train.loc[:, "SibSp":]#.values
    
    rfr = RandomForestRegressor(n_estimators=1500, n_jobs=-1)
    rfr.fit(x, y)
    
    predicted_age = rfr.predict(temp_test.loc[:, "SibSp":])
    
    df.loc[df.Age.isnull(), "Age"] = predicted_age
    
    return df

In [56]:
completing_age(train_df)
completing_age(test_df);
all_data = pd.concat([train_df, test_df], ignore_index=False)

In [57]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           1309 non-null   float64
 1   SibSp         1309 non-null   int64  
 2   Parch         1309 non-null   int64  
 3   Fare          1309 non-null   float64
 4   Pclass_1      1309 non-null   uint8  
 5   Pclass_2      1309 non-null   uint8  
 6   Pclass_3      1309 non-null   uint8  
 7   Sex_female    1309 non-null   uint8  
 8   Sex_male      1309 non-null   uint8  
 9   Embarked_C    1309 non-null   uint8  
 10  Embarked_Q    1309 non-null   uint8  
 11  Embarked_S    1309 non-null   uint8  
 12  title_Master  1309 non-null   uint8  
 13  title_Miss    1309 non-null   uint8  
 14  title_Mr      1309 non-null   uint8  
 15  title_Mrs     1309 non-null   uint8  
 16  title_rare    1309 non-null   uint8  
dtypes: float64(2), int64(2), uint8(13)
memory usage: 67.8 KB


In [58]:
train_df.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,title_Master,title_Miss,title_Mr,title_Mrs,title_rare
0,22.0,1,0,7.25,0,0,1,0,1,0,0,1,0,0,1,0,0
1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0,0,0,0,1,0
2,26.0,0,0,7.925,0,0,1,1,0,0,0,1,0,1,0,0,0
3,35.0,1,0,53.1,1,0,0,1,0,0,0,1,0,0,0,1,0
4,35.0,0,0,8.05,0,0,1,0,1,0,0,1,0,0,1,0,0


### Feature Scaling

In [59]:
headers = all_data.columns 

all_data.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,title_Master,title_Miss,title_Mr,title_Mrs,title_rare
0,22.0,1,0,7.25,0,0,1,0,1,0,0,1,0,0,1,0,0
1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0,0,0,0,1,0
2,26.0,0,0,7.925,0,0,1,1,0,0,0,1,0,1,0,0,0
3,35.0,1,0,53.1,1,0,0,1,0,0,0,1,0,0,0,1,0
4,35.0,0,0,8.05,0,0,1,0,1,0,0,1,0,0,1,0,0


In [60]:
from sklearn.preprocessing import StandardScaler
st_scale = StandardScaler()

scaled_train = st_scale.fit_transform(train_df)
scaled_test = st_scale.fit_transform(test_df)

In [61]:
pd.DataFrame(scaled_train, columns=headers).head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,title_Master,title_Miss,title_Mr,title_Mrs,title_rare
0,-0.555117,0.432793,-0.473674,-0.502445,-0.565685,-0.510152,0.902587,-0.737695,0.737695,-0.482043,-0.307562,0.615838,-0.216803,-0.511898,0.850532,-0.40584,-0.162781
1,0.603365,0.432793,-0.473674,0.786845,1.767767,-0.510152,-1.107926,1.355574,-1.355574,2.074505,-0.307562,-1.623803,-0.216803,-0.511898,-1.175735,2.464027,-0.162781
2,-0.265497,-0.474545,-0.473674,-0.488854,-0.565685,-0.510152,0.902587,1.355574,-1.355574,-0.482043,-0.307562,0.615838,-0.216803,1.953514,-1.175735,-0.40584,-0.162781
3,0.386149,0.432793,-0.473674,0.42073,1.767767,-0.510152,-1.107926,1.355574,-1.355574,-0.482043,-0.307562,0.615838,-0.216803,-0.511898,-1.175735,2.464027,-0.162781
4,0.386149,-0.474545,-0.473674,-0.486337,-0.565685,-0.510152,0.902587,-0.737695,0.737695,-0.482043,-0.307562,0.615838,-0.216803,-0.511898,0.850532,-0.40584,-0.162781


In [62]:
pd.DataFrame(scaled_test, columns=headers).head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,title_Master,title_Miss,title_Mr,title_Mrs,title_rare
0,0.295477,-0.49947,-0.400248,-0.497324,-0.586559,-0.534933,0.957826,-0.755929,0.755929,-0.568142,2.843757,-1.350676,-0.229993,-0.48274,0.861201,-0.456172,-0.120678
1,1.215987,0.616992,-0.400248,-0.512189,-0.586559,-0.534933,0.957826,1.322876,-1.322876,-0.568142,-0.351647,0.74037,-0.229993,-0.48274,-1.161169,2.192158,-0.120678
2,2.3206,-0.49947,-0.400248,-0.464012,-0.586559,1.869391,-1.044031,-0.755929,0.755929,-0.568142,2.843757,-1.350676,-0.229993,-0.48274,0.861201,-0.456172,-0.120678
3,-0.25683,-0.49947,-0.400248,-0.482387,-0.586559,-0.534933,0.957826,-0.755929,0.755929,-0.568142,-0.351647,0.74037,-0.229993,-0.48274,0.861201,-0.456172,-0.120678
4,-0.625034,0.616992,0.619896,-0.417405,-0.586559,-0.534933,0.957826,1.322876,-1.322876,-0.568142,-0.351647,0.74037,-0.229993,-0.48274,-1.161169,2.192158,-0.120678


### Splitting data

In [63]:
X = scaled_train
X

array([[-0.55511692,  0.43279337, -0.47367361, ...,  0.85053175,
        -0.40583972, -0.16278113],
       [ 0.60336472,  0.43279337, -0.47367361, ..., -1.17573506,
         2.4640269 , -0.16278113],
       [-0.26549651, -0.4745452 , -0.47367361, ..., -1.17573506,
        -0.40583972, -0.16278113],
       ...,
       [-1.67805091,  0.43279337,  2.00893337, ..., -1.17573506,
        -0.40583972, -0.16278113],
       [-0.26549651, -0.4745452 , -0.47367361, ...,  0.85053175,
        -0.40583972, -0.16278113],
       [ 0.16893411, -0.4745452 , -0.47367361, ...,  0.85053175,
        -0.40583972, -0.16278113]])

In [64]:
Y = survivers
Y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [65]:
# The data from train.csv based on it the predictions will be submitted
X_submission  = scaled_test
X_submission

array([[ 0.29547672, -0.49947002, -0.4002477 , ...,  0.86120071,
        -0.45617155, -0.1206777 ],
       [ 1.21598727,  0.61699237, -0.4002477 , ..., -1.1611695 ,
         2.19215774, -0.1206777 ],
       [ 2.32059993, -0.49947002, -0.4002477 , ...,  0.86120071,
        -0.45617155, -0.1206777 ],
       ...,
       [ 0.59004009, -0.49947002, -0.4002477 , ...,  0.86120071,
        -0.45617155, -0.1206777 ],
       [ 0.02049614, -0.49947002, -0.4002477 , ...,  0.86120071,
        -0.45617155, -0.1206777 ],
       [-1.98620709,  0.61699237,  0.61989583, ..., -1.1611695 ,
        -0.45617155, -0.1206777 ]])

In [66]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, random_state=0)

In [67]:
X_train.shape, Y_train.shape, X_test.shape

((712, 17), (712,), (179, 17))

In [68]:
X_train

array([[-7.26336095e-04, -4.74545196e-01,  2.00893337e+00, ...,
        -1.17573506e+00,  2.46402690e+00, -1.62781129e-01],
       [ 9.65290028e-02, -4.74545196e-01, -4.73673609e-01, ...,
         8.50531749e-01, -4.05839725e-01, -1.62781129e-01],
       [ 9.65290028e-02,  4.32793366e-01,  7.67629879e-01, ...,
         8.50531749e-01, -4.05839725e-01, -1.62781129e-01],
       ...,
       [-3.24748977e-01, -4.74545196e-01, -4.73673609e-01, ...,
         8.50531749e-01, -4.05839725e-01, -1.62781129e-01],
       [ 4.58554518e-01,  4.32793366e-01, -4.73673609e-01, ...,
        -1.17573506e+00,  2.46402690e+00, -1.62781129e-01],
       [ 2.19627699e+00,  4.32793366e-01,  7.67629879e-01, ...,
         8.50531749e-01, -4.05839725e-01, -1.62781129e-01]])


# **2. Building models**

In [69]:
from sklearn.metrics import mean_absolute_error, accuracy_score, recall_score, precision_score, classification_report, balanced_accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

### Logistic Regression

In [70]:
sc = st_scale

In [71]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
cv = StratifiedShuffleSplit(n_splits = 10, test_size = .25, random_state = 0 ) # run model 10x with 60/30 split intentionally leaving out 10%

X = sc.fit_transform(X)
accuracies = cross_val_score(LogisticRegression(solver='liblinear'), X, Y, cv  = cv)
print ("Cross-Validation accuracy scores:{}".format(accuracies))
print ("Mean Cross-Validation accuracy score: {}".format(round(accuracies.mean(),5)))

Cross-Validation accuracy scores:[0.80269058 0.83408072 0.81165919 0.85650224 0.83408072 0.79372197
 0.8161435  0.88340807 0.81165919 0.83408072]
Mean Cross-Validation accuracy score: 0.8278


In [72]:
## C_vals is the alpla value of lasso and ridge regression(as alpha increases the model complexity decreases,)
## remember effective alpha scores are 0<alpha<infinity 
C_vals = [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16,16.5,17,17.5,18]
## Choosing penalties(Lasso(l1) or Ridge(l2))
penalties = ['none','l2']
## Choose a cross validation strategy. 
cv = StratifiedShuffleSplit(n_splits = 10, test_size = .25)

## setting param for param_grid in GridSearchCV. 
param = {'penalty': penalties, 'C': C_vals}

logreg = LogisticRegression(solver='liblinear')
## Calling on GridSearchCV object. 
grid = GridSearchCV(estimator=LogisticRegression(), 
                           param_grid = param,
                           scoring = 'accuracy',
                           n_jobs =-1,
                           cv = cv
                          )
## Fitting the model
grid.fit(X, Y)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=None, test_size=0.25,
            train_size=None),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3,
                               4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 16.5,
                               17, 17.5, 18],
                         'penalty': ['none', 'l2']},
             scoring='accuracy')

In [73]:
print (grid.best_score_)
print (grid.best_params_)
print(grid.best_estimator_)

0.8367713004484305
{'C': 0.7, 'penalty': 'l2'}
LogisticRegression(C=0.7)


In [74]:
logreg_grid = grid.best_estimator_
acc_log = round(logreg_grid.score(X, Y)*100, 2)
acc_log

83.39

### kNN

In [75]:
# k-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
k_range = range(1, 32)
 
weights_options=['uniform', 'distance']

param = {'n_neighbors':k_range, 'weights':weights_options}

cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=1)

grid = GridSearchCV(KNeighborsClassifier(), param, cv=cv,verbose = False, n_jobs=-1)

grid.fit(X, Y)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=1, test_size=0.3,
            train_size=None),
             estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': range(1, 32),
                         'weights': ['uniform', 'distance']},
             verbose=False)

In [76]:
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

0.8276119402985074
{'n_neighbors': 8, 'weights': 'uniform'}
KNeighborsClassifier(n_neighbors=8)


In [77]:
knn_grid = grid.best_estimator_
acc_knn = round(knn_grid.score(X, Y)*100, 2)
acc_knn

84.85

### Gaussian Naive Bayes

In [78]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB()
gaussian.fit(X, Y)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(accuracy_score(Y_pred, Y_test)*100, 2)
acc_gaussian

79.89

### SVM

In [79]:
# Support Vector Machines
from sklearn.svm import SVC

Cs = [0.001, 0.01, 0.1, 1,1.5,2,2.5,3,4,5, 10] # penalty parameter
gammas = [0.0001,0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=1)
grid_search = GridSearchCV(SVC(kernel = 'rbf', probability=True), param_grid, cv=cv) 
grid_search.fit(X, Y)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=1, test_size=0.3,
            train_size=None),
             estimator=SVC(probability=True),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 1.5, 2, 2.5, 3, 4, 5, 10],
                         'gamma': [0.0001, 0.001, 0.01, 0.1, 1]})

In [80]:
print(grid_search.best_score_)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

0.8369402985074628
{'C': 2.5, 'gamma': 0.01}
SVC(C=2.5, gamma=0.01, probability=True)


In [81]:
svm_grid = grid_search.best_estimator_
acc_svm = round(svm_grid.score(X, Y)*100, 2)
acc_svm

83.95

### Perceptron

In [82]:
# Perceptron
from sklearn.linear_model import Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

77.39

### SGD

In [83]:
# Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

75.0

### Decision Tree

In [84]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

max_depth = range(1, 16)
max_feature = [2,4,8,12,16,'auto']
criterion=["entropy", "gini"]

param = {'max_depth':max_depth, 
         'max_features':max_feature, 
         'criterion': criterion}
grid = GridSearchCV(DecisionTreeClassifier(), 
                                param_grid = param, 
                                verbose=False, 
                                cv=StratifiedKFold(n_splits=20, random_state=15, shuffle=True),
                                n_jobs = -1)
grid.fit(X, Y) 

GridSearchCV(cv=StratifiedKFold(n_splits=20, random_state=15, shuffle=True),
             estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': range(1, 16),
                         'max_features': [2, 4, 8, 12, 16, 'auto']},
             verbose=False)

In [85]:
print( grid.best_params_)
print (grid.best_score_)
print (grid.best_estimator_)

{'criterion': 'entropy', 'max_depth': 8, 'max_features': 'auto'}
0.8307323232323233
DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features='auto')


In [86]:
dectree_grid = grid.best_estimator_
acc_decision_tree = round(dectree_grid.score(X, Y) * 100, 2)
acc_decision_tree

86.64

In [87]:
feature_importances = pd.DataFrame(dectree_grid.feature_importances_,
                                   index = headers,
                                   columns=['importance'])
feature_importances.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
title_Miss,0.241758
title_Mrs,0.161239
Age,0.160898
Fare,0.156998
Pclass_3,0.061984
Pclass_2,0.061294
Parch,0.031914
title_Master,0.028387
Pclass_1,0.026815
Embarked_S,0.021562


### Random Forest

In [88]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

n_estimators = [2, 8, 16, 18, 20, 24, 28, 32, 64];
max_depth = range(1, 12);
criterions = ['gini', 'entropy'];
cv = StratifiedShuffleSplit(n_splits=20, test_size=.33, random_state=1)


parameters = {'n_estimators':n_estimators,
              'max_depth':max_depth,
              'criterion': criterions
              
        }
grid = GridSearchCV(estimator=RandomForestClassifier(max_features='auto'),
                                 param_grid=parameters,
                                 cv=cv,
                                 n_jobs = -1)
grid.fit(X, Y) 

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=20, random_state=1, test_size=0.33,
            train_size=None),
             estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 12),
                         'n_estimators': [2, 8, 16, 18, 20, 24, 28, 32, 64]})

In [89]:
print (grid.best_score_)
print (grid.best_params_)
print (grid.best_estimator_)

0.8367796610169492
{'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 64}
RandomForestClassifier(criterion='entropy', max_depth=5, n_estimators=64)


In [90]:
rf_grid = grid.best_estimator_
acc_random_forest = round(rf_grid.score(X, Y)* 100, 2)
acc_random_forest

85.19

In [91]:
feature_importances = pd.DataFrame(rf_grid.feature_importances_,
                                   index = headers,
                                    columns=['importance'])
feature_importances.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
Sex_female,0.183737
title_Mr,0.158631
Sex_male,0.147865
Fare,0.122406
Pclass_3,0.0783
Age,0.077474
Pclass_1,0.050734
SibSp,0.038353
Pclass_2,0.030723
Parch,0.027956


### Bagging

In [92]:
from sklearn.ensemble import BaggingClassifier
n_estimators = [8,16,32,64,128];
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=1)

parameters = {'n_estimators':n_estimators,
              }
grid = GridSearchCV(BaggingClassifier(base_estimator= None, ## If None, then the base estimator is a decision tree.
                                      bootstrap_features=False),
                                      param_grid=parameters,
                                      cv=cv,
                                      n_jobs = -1)
grid.fit(X, Y) 

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=1, test_size=0.3,
            train_size=None),
             estimator=BaggingClassifier(), n_jobs=-1,
             param_grid={'n_estimators': [8, 16, 32, 64, 128]})

In [93]:
print (grid.best_score_)
print (grid.best_params_)
print (grid.best_estimator_)

0.8149253731343282
{'n_estimators': 128}
BaggingClassifier(n_estimators=128)


In [94]:
bagging_grid = grid.best_estimator_
acc_bagging = round(bagging_grid.score(X, Y)*100, 2)
acc_bagging

98.43

### AdaBoost

In [95]:
from sklearn.ensemble import AdaBoostClassifier
n_estimators = [100,140,145,150,160,170,175,180,185];
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=1)
learning_r = [0.1,1,0.01,0.5]

parameters = {'n_estimators':n_estimators,
              'learning_rate':learning_r
              
        }
grid = GridSearchCV(AdaBoostClassifier(base_estimator= None, # If None, then the base estimator is a decision tree.
                                     ),
                                 param_grid=parameters,
                                 cv=cv,
                                 n_jobs = -1)
grid.fit(X, Y) 

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=1, test_size=0.3,
            train_size=None),
             estimator=AdaBoostClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 1, 0.01, 0.5],
                         'n_estimators': [100, 140, 145, 150, 160, 170, 175,
                                          180, 185]})

In [96]:
print (grid.best_score_)
print (grid.best_params_)
print (grid.best_estimator_)

0.821268656716418
{'learning_rate': 0.1, 'n_estimators': 100}
AdaBoostClassifier(learning_rate=0.1, n_estimators=100)


In [97]:
adaBoost_grid = grid.best_estimator_
acc_ada = adaBoost_grid.score(X, Y)
acc_ada

0.8271604938271605

### GradientBoosting

In [98]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

gradient_boost = GradientBoostingClassifier()
n_estimators = [100,140,145,150,160,170,175,180,185];
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=1)
learning_r = [0.1,1,0.01,0.5]

parameters = {'n_estimators':n_estimators,
              'learning_rate':learning_r
              
        }
grid = GridSearchCV(gradient_boost,
                    param_grid=parameters,
                    cv=cv,
                    n_jobs = -1)
grid.fit(X, Y) 

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=1, test_size=0.3,
            train_size=None),
             estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 1, 0.01, 0.5],
                         'n_estimators': [100, 140, 145, 150, 160, 170, 175,
                                          180, 185]})

In [99]:
print (grid.best_score_)
print (grid.best_params_)
print (grid.best_estimator_)

0.8335820895522387
{'learning_rate': 0.1, 'n_estimators': 140}
GradientBoostingClassifier(n_estimators=140)


In [100]:
gradBoost_grid = grid.best_estimator_
acc_gradboost = round(gradBoost_grid.score(X, Y)*100, 2)
acc_gradboost

92.14

### Voting

In [101]:
from sklearn.ensemble import VotingClassifier

voting_classifier = VotingClassifier(estimators=[
    ('lr_grid', logreg_grid),
    ('svc', svm_grid),
    ('random_forest', rf_grid),
    ('gradient_boosting', gradient_boost),
    ('decision_tree_grid',dectree_grid),
    ('knn_classifier', knn_grid),
    ('bagging_classifier', bagging_grid),
    ('adaBoost_classifier', adaBoost_grid)
],voting='hard')

#voting_classifier = voting_classifier.fit(train_x,train_y)
voting_classifier = voting_classifier.fit(X, Y)

In [102]:
Y_pred = voting_classifier.predict(X_test)
acc_voting = round(accuracy_score(Y_pred, Y_test)*100, 2)
acc_voting

87.15

# **3. Model evaluation**

### Models comparison

In [108]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Decision Tree',
              'Gradient Boosting Classifier', 'Voting Classifier','Bagging Classifier', 
              'SGD', 'Perceptron'],
    'Score': [acc_svm, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_decision_tree, 
              acc_gradboost, acc_voting, acc_bagging,
              acc_sgd, acc_perceptron]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
8,Bagging Classifier,98.43
6,Gradient Boosting Classifier,92.14
7,Voting Classifier,87.15
5,Decision Tree,86.64
3,Random Forest,85.19
1,KNN,84.85
0,Support Vector Machines,83.95
2,Logistic Regression,83.39
4,Naive Bayes,79.89
10,Perceptron,77.39


### Submission

In [104]:
Y_pred = rf_grid.predict(X_submission)

In [105]:
submission = pd.DataFrame({
        "PassengerId": passengerid,
        "Survived": Y_pred
    })

In [106]:
submission.to_csv('submission.csv', index=False)

In [107]:
from google.colab import files

files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Final conclusions**

*   Some models are clearly overfitting, however, RandomForest seems to work best in this competition
*   Delinitely reaching >80% accuracy on Kaggle requires considerably more EDA
*   Current best score on Kaggle: ~77%

