In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


In [2]:
# Load data sets for training and testing the model
df_train = pd.read_csv("../data/train.csv",sep = ";")
df_test = pd.read_csv("../data/val.csv",sep = ";")

In [3]:
# Take a look at data sets for better understading of data we are dealing with (1)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Celotti, Mr. Francesco",male,24.0,0,0,343275,8.05,,S
1,2,0,3,"Christmann, Mr. Emil",male,29.0,0,0,343276,8.05,,S
2,3,0,3,"Andreasson, Mr. Paul Edvin",male,20.0,0,0,347466,7.8542,,S
3,4,0,1,"Chaffee, Mr. Herbert Fuller",male,46.0,1,0,W.E.P. 5734,61.175,E31,S
4,5,0,3,"Dean, Mr. Bertram Frank",male,26.0,1,2,C.A. 2315,20.575,,S


In [4]:
# Take a look at data sets for better understading of data we are dealing with (2)
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,802.0,802.0,802.0,646.0,802.0,802.0,802.0
mean,401.5,0.377805,2.296758,30.075604,0.493766,0.374065,32.431675
std,231.661751,0.485141,0.838651,14.470148,1.084719,0.78196,50.414965
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,201.25,0.0,2.0,21.0,0.0,0.0,7.8958
50%,401.5,0.0,3.0,29.0,0.0,0.0,14.4542
75%,601.75,1.0,3.0,39.0,1.0,0.0,30.92395
max,802.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# Take a look at data sets for better understading of data we are dealing with (3) 
# We can see here that there is some data missing in "Age" and "Cabin" 
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 802 entries, 0 to 801
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  802 non-null    int64  
 1   Survived     802 non-null    int64  
 2   Pclass       802 non-null    int64  
 3   Name         802 non-null    object 
 4   Sex          802 non-null    object 
 5   Age          646 non-null    float64
 6   SibSp        802 non-null    int64  
 7   Parch        802 non-null    int64  
 8   Ticket       802 non-null    object 
 9   Fare         802 non-null    float64
 10  Cabin        187 non-null    object 
 11  Embarked     801 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 75.3+ KB


In [6]:
# Take a look at data sets for better understading of data we are dealing with (4)
df_test.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Take a look at data sets for better understading of data we are dealing with (5)
df_test.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,89.0,89.0,89.0,68.0,89.0,89.0,89.0
mean,45.0,0.438202,2.41573,26.1225,0.786517,0.449438,30.154447
std,25.836021,0.498978,0.809166,14.681575,1.229116,1.000128,42.840442
min,1.0,0.0,1.0,0.83,0.0,0.0,7.225
25%,23.0,0.0,2.0,17.0,0.0,0.0,8.05
50%,45.0,0.0,3.0,26.0,0.0,0.0,15.5
75%,67.0,1.0,3.0,34.25,1.0,0.0,31.3875
max,89.0,1.0,3.0,66.0,5.0,5.0,263.0


In [8]:
# Take a look at data sets for better understading of data we are dealing with (6)
# Same case as in df_train
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  89 non-null     int64  
 1   Survived     89 non-null     int64  
 2   Pclass       89 non-null     int64  
 3   Name         89 non-null     object 
 4   Sex          89 non-null     object 
 5   Age          68 non-null     float64
 6   SibSp        89 non-null     int64  
 7   Parch        89 non-null     int64  
 8   Ticket       89 non-null     object 
 9   Fare         89 non-null     float64
 10  Cabin        17 non-null     object 
 11  Embarked     88 non-null     object 
dtypes: float64(2), int64(5), object(5)
memory usage: 8.5+ KB


In [9]:
# Combine train and test data sets, we will need numeric representation of some features
# Better to work on both at the same time
total_data = [df_train,df_test]

In [10]:
# Let's begin with easy features, change sex to numeric representation in both data sets
sex_map = {"female": 1, "male": 0}
for data in total_data:
    data['Sex'] = data['Sex'].map(sex_map)

In [11]:
# Need to fill 2 missing values for "Embarked", for simplicity let's use most common value
df_train["Embarked"].value_counts() # "S" should do it
for data in total_data:
    data['Embarked'] = data['Embarked'].fillna('S')

In [12]:
# Create numeric representation of "Embarked" feature
embarked_map = {"S": 2, "C": 1, "Q": 0}
for data in total_data:
    data['Embarked'] = data['Embarked'].map(embarked_map)

In [13]:
# Create additional column with passanger's title using regural expressions
for data in total_data:
    data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


In [14]:
# Check titles to create map directiory
df_train["Title"].value_counts()

Mr          474
Miss        160
Mrs         109
Master       33
Dr            7
Rev           6
Major         2
Mlle          2
Col           2
Sir           1
Countess      1
Mme           1
Jonkheer      1
Lady          1
Ms            1
Capt          1
Name: Title, dtype: int64

In [15]:
# Create numeric represenation of "Title"

title_map = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 3, "Rev": 3, "Mile": 3, "Major": 3, "Col": 3,"Ms": 3, "Jonkheer": 3, "Sir": 3, "Lady" : 3, "Countess": 3,"Mme": 3,"Capt": 3 }
for data in total_data:
    data['Title'] = data['Title'].map(title_map)

# Fill NaN with most common titles
for data in total_data:
    data['Title'] = data['Title'].fillna(0)

In [16]:
# Fill missing values from "Age", let's use average age based on a title
df_train["Age"].fillna(df_train.groupby("Title")["Age"].transform("mean"), inplace=True)
df_test["Age"].fillna(df_test.groupby("Title")["Age"].transform("mean"), inplace=True)


In [17]:
# Gruop age values to create numeric representation
for data in total_data:
    data.loc[ data["Age"] <= 12, "Age"] = 0
    data.loc[(data["Age"] > 12) & (data["Age"] <= 25), "Age"] = 1
    data.loc[(data["Age"] > 25) & (data["Age"] <= 40), "Age"] = 2
    data.loc[(data["Age"] > 40) & (data["Age"] <= 60), "Age"] = 3
    data.loc[ data["Age"] > 60, "Age"] = 4

In [18]:
# Let's figure out the family size of each passenger and add column to both data sets
df_train["FamilySize"] = df_train["SibSp"] + df_train["Parch"] + 1
df_test["FamilySize"] = df_test["SibSp"] + df_test["Parch"] + 1

In [19]:
# Create groups for "Fare" feature, groups created based on "describe"
for data in total_data:
    data.loc[ data["Fare"] <= 18, "Fare"] = 0,
    data.loc[(data["Fare"] > 18) & (data["Fare"] <= 30), "Fare"] = 1,
    data.loc[(data["Fare"] > 30) & (data["Fare"] <= 100), "Fare"] = 2,
    data.loc[ data["Fare"] > 100, "Fare"] = 3

In [20]:
# Drop unncecessary features from our data sets and prepare data for modeling
features_to_drop = ["Name","Ticket", "SibSp", "Parch","Cabin"]
df_train = df_train.drop(features_to_drop, axis=1)
df_test = df_test.drop(features_to_drop, axis=1)
df_train = df_train.drop(["PassengerId"], axis=1)
train_d = df_train.drop("Survived", axis=1)
target = df_train["Survived"]



In [21]:
# Let's take a look at our data set now
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize
0,0,3,0,1.0,0.0,2,0.0,1
1,0,3,0,2.0,0.0,2,0.0,1
2,0,3,0,1.0,0.0,2,0.0,1
3,0,1,0,3.0,2.0,2,0.0,2
4,0,3,0,2.0,1.0,2,0.0,4


In [22]:
# Alright, create our prediction

In [23]:
# Use Cross Validation to prevent overfitting
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

In [24]:
# Test on 10 random data sets and print accuracy (average of results)
clf = RandomForestClassifier(n_estimators=10)
scoring = "accuracy"
score = cross_val_score(clf, train_d, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(np.mean(score)*100)

80.92438271604937


In [25]:
# Fit data into our model and make prediction on test data
clf.fit(train_d, target)
test_data = df_test.drop(["PassengerId","Survived"], axis=1).copy()
prediction = clf.predict(test_data)




In [42]:
# Check actual results by comparing target and prediction
np.where(df_test["Survived"] == prediction,True,False)

array([ True,  True, False,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False,  True,  True, False,  True, False,  True, False,  True,
       False,  True,  True, False,  True,  True,  True, False,  True,
       False,  True, False, False, False, False,  True,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
       False,  True,  True,  True, False,  True,  True,  True])