In [1]:
import pandas as pd
import numpy as np

## Exoloratory data analysis

In [2]:
train = pd.read_csv("train.csv")

In [3]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
train.shape

(891, 12)

In [5]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [7]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

##### Age, Cabin and Embarked have missing values. Cabin has more than %70 missing values so it's better to ignore this variable. It also is the cabin number which do not seem to be contributing to the model.

In [8]:
train.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Mionoff, Mr. Stoytcho",male,,,,1601.0,,G6,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


###### All the values seem to be reasonable and meaningful when look at minmum and maxmimum values for each variable.

#### Filling NAs with the most frequent answer for Embarked

In [9]:
train['Embarked']= train.Embarked.fillna(value='S')

In [10]:
train.Embarked.isna().sum()

0

In [11]:
train['Sex']= train.Sex.map({'female':0, 'male':1})

In [12]:
train.Sex.dtype

dtype('int64')

In [13]:
train.Sex.head()

0    1
1    0
2    0
3    0
4    1
Name: Sex, dtype: int64

In [14]:
train['Embarked']= train.Embarked.map({'S':0, 'C':1, 'Q':2})

In [15]:
train.Embarked.dtype

dtype('int64')

In [16]:
train.Embarked.head()

0    0
1    1
2    0
3    0
4    0
Name: Embarked, dtype: int64

In [17]:
missing_age = train.loc[train.Age.isna(),:].head()

In [18]:
missing_age.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",1,,0,0,330877,8.4583,,2
17,18,1,2,"Williams, Mr. Charles Eugene",1,,0,0,244373,13.0,,0
19,20,1,3,"Masselmani, Mrs. Fatima",0,,0,0,2649,7.225,,1
26,27,0,3,"Emir, Mr. Farred Chehab",1,,0,0,2631,7.225,,1
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",0,,0,0,330959,7.8792,,2


In [19]:
train.Age.mean() # excluding missing values

29.69911764705882

In [20]:
train['Age']= train.Age.fillna (train.Age.mean())

In [21]:
train.Age.mean() # after filling missing values

29.699117647058763

#### Modelling

In [22]:
features1 = train.loc[:,['Pclass','Sex','SibSp','Parch','Fare','Embarked','Age']]

In [23]:
features1.shape

(891, 7)

In [24]:
features1.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Age
0,3,1,1,0,7.25,0,22.0
1,1,0,1,0,71.2833,1,38.0
2,3,0,0,0,7.925,0,26.0
3,1,0,1,0,53.1,0,35.0
4,3,1,0,0,8.05,0,35.0


In [25]:
lable1 = train['Survived']

In [26]:
lable1.shape

(891,)

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(features1, lable1, test_size= 0.25, random_state=22)

In [29]:
X_train.shape

(668, 7)

In [30]:
X_test.shape

(223, 7)

### Logistic regression modelling

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
logreg = LogisticRegression(solver='lbfgs', max_iter=4000, random_state=1000)

In [33]:
logreg.fit (X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=4000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1000, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [34]:
print(logreg.intercept_)

[4.68109474]


In [35]:
print(logreg.coef_)

[[-1.04886831e+00 -2.71130043e+00 -2.43533764e-01 -1.41463999e-01
   1.10728303e-03  2.10541318e-01 -3.67157330e-02]]


In [36]:
features1.columns

Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Age'], dtype='object')

In [37]:
y_predict_1 = logreg.predict (X_test)

In [38]:
from sklearn import metrics

In [39]:
print (metrics.accuracy_score(y_test, y_predict_1))

0.7847533632286996


In [40]:
print (metrics.confusion_matrix(y_test, y_predict_1))

[[114  19]
 [ 29  61]]


In [41]:
print (metrics.classification_report(y_test, y_predict_1))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       133
           1       0.76      0.68      0.72        90

   micro avg       0.78      0.78      0.78       223
   macro avg       0.78      0.77      0.77       223
weighted avg       0.78      0.78      0.78       223



##### The model evaluation does not seem to be good. Letès try logistic regression modelling with cross validation

### Logistic regression with cross validation

In [42]:
from sklearn.model_selection import cross_val_score

In [43]:
logreg2 = LogisticRegression(solver='lbfgs', max_iter = 4000, random_state=2000)

In [44]:
scores= cross_val_score (logreg2, features1, lable1, cv=10, scoring='accuracy')

In [45]:
scores

array([0.78888889, 0.78888889, 0.75280899, 0.83146067, 0.80898876,
       0.7752809 , 0.78651685, 0.78651685, 0.79775281, 0.82954545])

In [46]:
scores.mean()

0.7946649075019862

#### The accuracy did not considerably imporve so we will try a random forest model for this dataset.

In [47]:
from sklearn.ensemble import RandomForestClassifier

In [48]:
rf = RandomForestClassifier(n_estimators=100)

In [49]:
rf.fit (X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [50]:
y_predict_rf = rf.predict(X_test)

In [51]:
print (metrics.accuracy_score(y_test, y_predict_rf))

0.7937219730941704


#### Feature engineering to improve the model ccuracy

In [52]:
features1.columns

Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Age'], dtype='object')

In [53]:
features_imp= pd.Series (rf.feature_importances_, features1.columns)

In [54]:
features_imp.sort_values(ascending=False)

Sex         0.276526
Age         0.264449
Fare        0.259435
Pclass      0.073114
SibSp       0.049973
Parch       0.042093
Embarked    0.034411
dtype: float64

### Model tunning

Only the first three fetures will be picked for re-modelling. Grid Search will be also used to tune the model paramerts:

In [70]:
features2 = train.loc [:, ['Sex', 'Age', 'Fare']]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [63]:
params_dict = dict(n_estimators= np.arange(50,300, 10), min_samples_split= np.linspace(0.1, 1, 10), max_depth= np.arange (2, 30, 1), min_samples_leaf= np.linspace(0.1, 0.5, 5))

In [64]:
params_dict

{'n_estimators': array([ 50,  60,  70,  80,  90, 100, 110, 120, 130, 140, 150, 160, 170,
        180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290]),
 'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
 'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
 'min_samples_leaf': array([0.1, 0.2, 0.3, 0.4, 0.5])}

In [65]:
rf2 = RandomForestClassifier()

In [67]:
rand = RandomizedSearchCV(rf2, params_dict, cv=10, scoring='accuracy', n_iter=10, random_state=5, return_train_score=False)

n_iter controls the nubmer of seraches

In [71]:
rand.fit(features2, lable1)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'n_estimators': array([ 50,  60,  70,  80,  90, 100, 110, 120, 130, 140, 150, 160, 170,
       180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290]), 'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]), 'min_samples_leaf': 

In [75]:
pd.DataFrame(rand.cv_results_)[['mean_test_score', 'std_test_score', 'params']]

Unnamed: 0,mean_test_score,std_test_score,params
0,0.616162,0.002844,"{'n_estimators': 180, 'min_samples_split': 0.6..."
1,0.616162,0.002844,"{'n_estimators': 180, 'min_samples_split': 0.9..."
2,0.616162,0.002844,"{'n_estimators': 250, 'min_samples_split': 0.1..."
3,0.782267,0.031508,"{'n_estimators': 50, 'min_samples_split': 0.30..."
4,0.616162,0.002844,"{'n_estimators': 130, 'min_samples_split': 0.1..."
5,0.616162,0.002844,"{'n_estimators': 80, 'min_samples_split': 0.5,..."
6,0.616162,0.002844,"{'n_estimators': 50, 'min_samples_split': 0.1,..."
7,0.616162,0.002844,"{'n_estimators': 230, 'min_samples_split': 0.9..."
8,0.616162,0.002844,"{'n_estimators': 60, 'min_samples_split': 0.8,..."
9,0.616162,0.002844,"{'n_estimators': 170, 'min_samples_split': 0.8..."


In [73]:
print(rand.best_score_)

0.7822671156004489


In [77]:
print (rand.best_params_)

{'n_estimators': 50, 'min_samples_split': 0.30000000000000004, 'min_samples_leaf': 0.1, 'max_depth': 29}


### The accuracy is below %80 in all 3 models so we will work on feature enineering part a bit more to increase the accuracy (part 2).