# Titanic Project

### 1. Data cleaning and preparation

In [1]:
#import pandas
import pandas as pd


In [2]:
#import datasets
base = pd.read_csv('../data/raw/titanic_train.csv')
base_test = pd.read_csv('../data/raw/titanic_test.csv')
base.head(3)



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
#drop colunms with high cardinality
base = base.drop(['PassengerId', 'Name'],axis=1)
base_test = base_test.drop(['PassengerId', 'Name'],axis=1)
base.head(2)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C


In [5]:
#drop colunms: Ticket and Cabin 
base = base.drop(['Cabin', 'Ticket'],axis=1)
base_test = base_test.drop(['Cabin', 'Ticket'],axis=1)
base.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C


In [6]:
#colunm Age
base['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [7]:
#Filter the missing value on the age colunm 
base.loc[base.Age.isnull(),'Age']



5     NaN
17    NaN
19    NaN
26    NaN
28    NaN
       ..
859   NaN
863   NaN
868   NaN
878   NaN
888   NaN
Name: Age, Length: 177, dtype: float64

In [8]:
#Filter the missing value on the age colunm (test_datset)
base_test.loc[base_test.Age.isnull(),'Age']

10    NaN
22    NaN
29    NaN
33    NaN
36    NaN
       ..
408   NaN
410   NaN
413   NaN
416   NaN
417   NaN
Name: Age, Length: 86, dtype: float64

In [9]:
# Calculate the mean age
mean_age = base['Age'].mean()
mean_age_test = base_test['Age'].mean()
mean_age


29.69911764705882

In [10]:
#replace missing value to mean on the age colunm
base['Age'] = base['Age'].fillna(mean_age)
base_test['Age'] = base_test['Age'].fillna(mean_age_test)
base['Age']

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

In [11]:
#Filter missing value column: Sex
base.loc[base.Sex.isnull(),'Sex']

Series([], Name: Sex, dtype: object)

In [12]:
#Filter missing value column: Sex (datset test)
base_test.loc[base_test.Sex.isnull(),'Sex']

Series([], Name: Sex, dtype: object)

In [13]:
# convert a object columns to a int columns (colunm: sex)
map_sex = {'male': 0, 'female': 1}
base['Sex'] = base['Sex'].map(map_sex)
base_test['Sex'] = base_test['Sex'].map(map_sex)
base['Sex']

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    1
889    0
890    0
Name: Sex, Length: 891, dtype: int64

In [14]:
#Filter missing value column: Fare (datset test)
base_test.loc[base_test.Fare.isnull(),'Fare']

152   NaN
Name: Fare, dtype: float64

In [15]:
#Calculate median
median_Fare = base_test['Fare'].median()

In [16]:
#Replace N/A values to median - Fare column (dataset test)
base_test['Fare'] = base_test['Fare'].fillna(median_Fare)
base_test.loc[base_test.Fare.isnull(),'Fare']

Series([], Name: Fare, dtype: float64)

In [17]:
#drop N/A values - column Embarked 
base.dropna(subset=['Embarked'], inplace=True)

In [18]:
#drop N/A values - column Fare 
base.dropna(subset=['Fare'], inplace=True)

In [19]:
#check the missing value - Embarked and Fare column
base.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    int64  
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(5), object(1)
memory usage: 62.5+ KB


In [20]:
# one hot enconded - column Embarked (train)
Embk_onehot_train = pd.get_dummies(base['Embarked'])
Embk_onehot_train.head(3)


Unnamed: 0,C,Q,S
0,False,False,True
1,True,False,False
2,False,False,True


In [21]:
# one hot enconded - column Embarked (test)
Embk_onehot_test = pd.get_dummies(base_test['Embarked'])
Embk_onehot_test.head(3)

Unnamed: 0,C,Q,S
0,False,True,False
1,False,False,True
2,False,True,False


In [22]:
#concatenate the news columns on the dataset (train)
base = pd.concat([base,Embk_onehot_train], axis=1)
base.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,C,Q,S
0,0,3,0,22.0,1,0,7.25,S,False,False,True
1,1,1,1,38.0,1,0,71.2833,C,True,False,False
2,1,3,1,26.0,0,0,7.925,S,False,False,True


In [23]:
#concatenate the news columns on the dataset (test)
base_test= pd.concat([base_test,Embk_onehot_test], axis=1)
base_test.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,C,Q,S
0,0,3,0,34.5,0,0,7.8292,Q,False,True,False
1,1,3,1,47.0,1,0,7.0,S,False,False,True
2,0,2,0,62.0,0,0,9.6875,Q,False,True,False


In [24]:
base = base.drop('Embarked', axis=1)
base_test = base_test.drop('Embarked', axis=1)

In [25]:
# dataset  train info
base.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    int64  
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   C         889 non-null    bool   
 8   Q         889 non-null    bool   
 9   S         889 non-null    bool   
dtypes: bool(3), float64(2), int64(5)
memory usage: 58.2 KB


In [26]:
# datset test info
base_test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    int64  
 3   Age       418 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      418 non-null    float64
 7   C         418 non-null    bool   
 8   Q         418 non-null    bool   
 9   S         418 non-null    bool   
dtypes: bool(3), float64(2), int64(5)
memory usage: 24.2 KB


In [27]:
base_test.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,0,3,0,34.5,0,0,7.8292,False,True,False
1,1,3,1,47.0,1,0,7.0,False,False,True
2,0,2,0,62.0,0,0,9.6875,False,True,False


In [28]:
#Export processed data
base.to_csv('../data/processed/train_data.csv')
base_test.to_csv('../data/processed/test_data.csv')

In [29]:
# Define X and y for train
X_train = base.drop('Survived',axis=1)
y_train = base['Survived']


In [30]:
X_train.shape, y_train.shape

((889, 9), (889,))

### 2. Train Models

In [31]:
# import and train the neighborhood model
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)

In [32]:
#neighborhood model score
neigh.score(X_train,y_train)

0.8335208098987626

In [33]:
# import and train the decision tree model
from sklearn import tree
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(X_train,y_train)

In [34]:
# decision tree score
clf.score(X_train,y_train)

0.9820022497187851

In [35]:
# import and train the logistic regression model
from sklearn.linear_model import LogisticRegression
clf_log = LogisticRegression(random_state=0, max_iter=1000).fit(X_train,y_train)

In [36]:
#score the logistic regression model
clf_log.score(X_train,y_train)

0.8020247469066367

### 3. Evaluate models

In [37]:
#define X and y for test
X_test = base_test.drop('Survived',axis=1)
y_test = base_test['Survived']

X_test.shape, y_test.shape

((418, 9), (418,))

In [38]:
# KNN predictions
pred_KNN = neigh.predict(X_test)
# Decision tree
pred_tree = clf.predict(X_test)
# Logistic regression predictions
pred_log = clf_log.predict(X_test)
 

In [39]:
# import confusion matrix
from sklearn.metrics import confusion_matrix
#import acurracy metric
from sklearn.metrics import accuracy_score
#import precision metric
from sklearn.metrics import precision_score
#import recall metric
from sklearn.metrics import recall_score


In [40]:
#Verify metrcis for KNN
matrix_KNN = confusion_matrix(y_test, pred_KNN)
accuracy_KNN = accuracy_score(y_test, pred_KNN)
precision_KNN = precision_score(y_test, pred_KNN)
recall_KNN = recall_score(y_test, pred_KNN)
print('KNN METRICS\n')
print(f'Matrix confusion:\n {matrix_KNN}\nAccuracy: {accuracy_score(y_test, pred_KNN)} ')
print(f'Precison: {precision_KNN}\nRecall: {recall_KNN}')

KNN METRICS

Matrix confusion:
 [[189  77]
 [ 76  76]]
Accuracy: 0.6339712918660287 
Precison: 0.49673202614379086
Recall: 0.5


In [41]:
#Verify metrcis for decision tree
matrix_tree = confusion_matrix(y_test, pred_tree)
accuracy_tree = accuracy_score(y_test, pred_tree)
precision_tree = precision_score(y_test, pred_tree)
recall_tree = recall_score(y_test, pred_tree)

print('DECISION TREE METRICS\n')
print(f'Matrix confusion:\n {matrix_tree}\nAccuracy: {accuracy_score(y_test, pred_tree)} ')
print(f'Precison: {precision_tree}\nRecall: {recall_tree}')

DECISION TREE METRICS

Matrix confusion:
 [[219  47]
 [ 50 102]]
Accuracy: 0.7679425837320574 
Precison: 0.6845637583892618
Recall: 0.6710526315789473


In [42]:
#Verify metrics for logistic regression
matrix_log = confusion_matrix(y_test, pred_log)
accuracy_log = accuracy_score(y_test, pred_log)
precision_log = precision_score(y_test, pred_log)
recall_log = recall_score(y_test, pred_log)

print('LOG. REGRESSION METRICS\n') 
print(f'Matrix confusion:\n {matrix_log}\nAccuracy: {accuracy_log} ')
print(f'Precison: {precision_log}\nRecall: {recall_log}')

LOG. REGRESSION METRICS

Matrix confusion:
 [[253  13]
 [ 11 141]]
Accuracy: 0.9425837320574163 
Precison: 0.9155844155844156
Recall: 0.9276315789473685


### 4. Deploy 
Best model: Logistic Regression

In [43]:
#import dump
from joblib import dump
# save the model on the file.joblib
dump(clf_log, '../model/Titanic_model.joblib') 

['../model/Titanic_model.joblib']

In [44]:
#import load model
from joblib import load
#import the model
model = load('../model/Titanic_model.joblib')