In [1]:
# Main file funcitons
from configs import *

# Main manipulation functions
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import  Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import  cross_val_score, cross_val_predict
from sklearn.metrics import  confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn import preprocessing
%matplotlib inline

In [2]:
titanic_train_data = pd.read_csv("dataset/train_analysis.csv")
titanic_test_data = pd.read_csv("dataset/test_analysis.csv")
titanic_train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,1.906724,S
1,1,1,female,38.0,1,0,3.49764,C
2,1,3,female,26.0,0,0,1.970459,S
3,1,1,female,35.0,1,0,3.304258,S
4,0,3,male,35.0,0,0,1.98168,S


## Analysis
Let's get the correlation of features related to survived people which of our interest.

### Pclass
We can see below that, Class which represents categorical ordinal features, has 3 different values from first class to third class, and those in first class are more survived than in other classes.

**More than 60% of people in the first class are survived**

### Sex
Next is the gender, which represent categorical nominal features, and also females are more survived than males.

**More than 74% of females are survived**

### SibSp & Parch

Both represent discrete quantitative variable, first one is about the number of siblings and spouses that the passenger has abroad the ship, second one is that number of parents and children that also the passenger has abroad the ship.

These two features can be combined in just one features that represent the overall family.

**We can see its has no correlation with survived people, as we can see it goes up and down in random way.**

In [3]:
titanic_train_data[['Pclass', 'Survived']].groupby(['Pclass'], 
                                         as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [4]:
titanic_train_data[['Sex', 'Survived']].groupby(['Sex'], 
                                         as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [5]:
titanic_train_data[['SibSp', 'Survived']].groupby(['SibSp'], 
                                         as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,SibSp,Survived
1,1,0.535885
2,2,0.464286
0,0,0.345395
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [6]:
titanic_train_data[['Parch', 'Survived']].groupby(['Parch'], 
                                         as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.343658
5,5,0.2
4,4,0.0
6,6,0.0


## Family

Let's add the SibSp and Parch to each other to represent the family overall and drop these two features as replaced with new feature which family.

In [7]:
titanic_train_data['Overall_family_number'] = titanic_train_data['SibSp'] + titanic_train_data['Parch']
titanic_train_data.drop(['SibSp', 'Parch'], axis=1, inplace=True)

titanic_test_data['Overall_family_number'] = titanic_test_data['SibSp'] + titanic_test_data['Parch']
titanic_test_data.drop(['Parch', 'SibSp'], axis=1, inplace=True)

titanic_train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Overall_family_number
0,0,3,male,22.0,1.906724,S,1
1,1,1,female,38.0,3.49764,C,1
2,1,3,female,26.0,1.970459,S,0
3,1,1,female,35.0,3.304258,S,1
4,0,3,male,35.0,1.98168,S,0


In [8]:
titanic_test_data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Overall_family_number
0,3,male,34.5,7.8292,Q,0
1,3,female,47.0,7.0,S,1
2,2,male,62.0,9.6875,Q,0
3,3,male,27.0,8.6625,S,0
4,3,female,22.0,12.2875,S,2


## Split Trainging Data

We split data to train and validation based on Survived features to ensure we have representative number of classes in each part of that split.

In [1]:
def split_data(features_based, data):

    split = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)
    for train_indeces, test_indeces in split.split(data, data[features_based]):
        train_data = data.iloc[train_indeces]
        test_data = data.iloc[test_indeces]

    return train_data, test_data

In [2]:
train_data, validation_data = split_data("Survived", titanic_train_data)
train_data.head()

NameError: name 'titanic_train_data' is not defined

In [10]:
validation_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Overall_family_number
565,0,3,male,24.0,2.765868,S,2
160,0,3,male,44.0,2.479052,S,1
553,1,3,male,22.0,1.904255,C,0
860,0,3,male,41.0,2.384668,S,2
241,1,3,female,28.0,2.451946,Q,1


In [11]:
check_split_error(titanic_train_data, train_data, validation_data, 'Survived')

Unnamed: 0,overall,train,validation,train_error,test_error
0,0.383838,0.383427,0.385475,0.000411,0.001636
1,0.616162,0.616573,0.614525,0.000411,0.001636


In [12]:
def initializing_for_train(numerical_attr, categorical_attr):
    train_data_copy = train_data.copy()
    validation_data_copy = validation_data.copy()

    y_train   = np.array(train_data_copy['Survived'])
    train_data_copy.drop('Survived', axis=1, inplace=True)
    
    y_val   = np.array(validation_data_copy['Survived'])
    validation_data_copy.drop('Survived', axis=1, inplace=True)
    
    
    full_pipeline = ColumnTransformer([
    ('num_pipeline', MinMaxScaler(), numerical_attr),
    ('cat_pipeline', OneHotEncoder(), categorical_attr),
    ])
    

    X_train = full_pipeline.fit_transform(train_data_copy)
    X_val   = full_pipeline.fit_transform(validation_data_copy)

    print(X_train.shape)
    print(X_val.shape)
    
    return X_train, X_val, y_train, y_val


# Logistic Regression

Linear regression was about the continuos values you have to predicit any real numbers (.2, 1, 100, 1000.8), but Logistic Regression is the same as Linear Regression but we map the input to logistic function that map this output to some range between [0-1] for Sigmoid function, or between [-1-1] like tanch and other function, but what we see here is the Sigmoid function.

This maping of [0-1] is the estimated probability of the output for some class based on **threshold** which map this probabilty to belong to some class like if the probability estimation is >= .5 then its positiv class which 1, or < .5 then its negative class 0 for binary classifation problem.

In [13]:
def predict_result(model, X,y):
    predicted = model.predict(X)
    print("Accuracy", round(f1_score(y, predicted, average='micro')*100, 2))
    print("="*50)
    print(confusion_matrix(y, predicted))
    return True

In [14]:
numerical_attr    = [ 'Pclass', 'Overall_family_number', 'Fare', 'Age']
categorical_attri = ['Sex', 'Embarked']

X_train, X_val, y_train, y_val = initializing_for_train(numerical_attr, categorical_attri)


clf_model_1 = LogisticRegression(random_state=0).fit(X_train, y_train)
print("================ Train Accuracy ======================")
predict_result(clf_model_1, X_train, y_train)
print("================ Validation Accuracy ======================")
predict_result(clf_model_1, X_val, y_val)

(712, 9)
(179, 9)
Accuracy 79.78
[[374  65]
 [ 79 194]]
Accuracy 79.33
[[96 14]
 [23 46]]


True

## Features effect 
Let's run the same model while we drop some of these features.

Start by droping one of the main features which is **Pclass**.

In [15]:
numerical_attr    = [ 'Overall_family_number', 'Fare', 'Age']
categorical_attri = ['Sex', 'Embarked']

X_train, X_val, y_train, y_val = initializing_for_train(numerical_attr, categorical_attri)

clf_model_2 = LogisticRegression(random_state=0).fit(X_train, y_train)
print("================ Train Accuracy ======================")
predict_result(clf_model_2, X_train, y_train)
print("================ Validation Accuracy ======================")
predict_result(clf_model_2, X_val, y_val)

(712, 8)
(179, 8)
Accuracy 78.37
[[372  67]
 [ 87 186]]
Accuracy 78.77
[[95 15]
 [23 46]]


True

In [16]:
numerical_attr    = [ 'Overall_family_number', 'Fare']
categorical_attri = ['Sex']

X_train, X_val, y_train, y_val = initializing_for_train(numerical_attr, categorical_attri)


clf_model_3 = LogisticRegression(random_state=0).fit(X_train, y_train)
print("================ Train Accuracy ======================")
predict_result(clf_model_3, X_train, y_train)
print("================ Validation Accuracy ======================")
predict_result(clf_model_3, X_val, y_val)

(712, 4)
(179, 4)
Accuracy 78.93
[[374  65]
 [ 85 188]]
Accuracy 78.77
[[95 15]
 [23 46]]


True

In [17]:
numerical_attr    = [ 'Pclass', 'Overall_family_number', 'Fare', 'Age']
categorical_attri = [ 'Embarked']

X_train, X_val, y_train, y_val = initializing_for_train(numerical_attr, categorical_attri)


clf_model_4 = LogisticRegression(random_state=0).fit(X_train, y_train)
print("================ Train Accuracy ======================")
predict_result(clf_model_4, X_train, y_train)
print("================ Validation Accuracy ======================")
predict_result(clf_model_4, X_val, y_val)

(712, 7)
(179, 7)
Accuracy 70.37
[[377  62]
 [149 124]]
Accuracy 67.6
[[94 16]
 [42 27]]


True

## Note 

So as we can see the most important and effect features to predict survived people are about the gender, we starting by train the model without and with some of the features, and we found that most efficient feature is the sex, and without that features the model beside of get high bias, it also high variance and we can see the difference between the training and validation in accuracy.

## Train on all data

Now after we have done some of the experiments with model we can select first model as it give best result and as trade-off between bias and variance, then we will predict the test set.

In [18]:
numerical_attr    = [ 'Pclass', 'Overall_family_number', 'Fare', 'Age']
categorical_attr = ['Sex', 'Embarked']

y_train = np.array(titanic_train_data['Survived'])

titanic_train_data.drop(['Survived'], axis=1, inplace=True)


full_pipeline = ColumnTransformer([
('num_pipeline', MinMaxScaler(), numerical_attr),
('cat_pipeline', OneHotEncoder(), categorical_attr),
])


X_train = full_pipeline.fit_transform(titanic_train_data)
X_test   = full_pipeline.fit_transform(titanic_test_data)


clf_model_1 = LogisticRegression(random_state=0).fit(X_train, y_train)
print("================ Train Accuracy ======================")
predict_result(clf_model_1, X_train, y_train)

Accuracy 80.13
[[472  77]
 [100 242]]


True

In [19]:
predicted = clf_model_1.predict(X_test)
predicted[:5]

array([0, 0, 0, 0, 0])

In [20]:
titanic_test_data['Survived'] = predicted
titanic_test_data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Overall_family_number,Survived
0,3,male,34.5,7.8292,Q,0,0
1,3,female,47.0,7.0,S,1,0
2,2,male,62.0,9.6875,Q,0,0
3,3,male,27.0,8.6625,S,0,0
4,3,female,22.0,12.2875,S,2,0


In [21]:
titanic_test_data.to_csv('dataset/test_prediced_data.csv', index=False)

In [22]:
test_kaggle_data = pd.read_csv('dataset/kaggle_test.csv')
test_kaggle_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [23]:
PassengerId = test_kaggle_data['PassengerId']
test_kaggle_data['Overall_family_number'] = test_kaggle_data['Parch'] + test_kaggle_data['SibSp']
test_kaggle_data.drop(['Name', 'Ticket', 'Cabin', 'PassengerId', 'Parch', 'SibSp'], axis=1, inplace=True)
test_kaggle_data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Overall_family_number
0,3,male,34.5,7.8292,Q,0
1,3,female,47.0,7.0,S,1
2,2,male,62.0,9.6875,Q,0
3,3,male,27.0,8.6625,S,0
4,3,female,22.0,12.2875,S,2


In [24]:
test_kaggle_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Pclass                 418 non-null    int64  
 1   Sex                    418 non-null    object 
 2   Age                    332 non-null    float64
 3   Fare                   417 non-null    float64
 4   Embarked               418 non-null    object 
 5   Overall_family_number  418 non-null    int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 19.7+ KB


In [25]:
test_kaggle_data['Age'] = test_kaggle_data['Age'].fillna(test_kaggle_data['Age'].median())
test_kaggle_data['Fare'] = test_kaggle_data['Fare'].fillna(test_kaggle_data['Fare'].median())
test_kaggle_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Pclass                 418 non-null    int64  
 1   Sex                    418 non-null    object 
 2   Age                    418 non-null    float64
 3   Fare                   418 non-null    float64
 4   Embarked               418 non-null    object 
 5   Overall_family_number  418 non-null    int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 19.7+ KB


In [26]:
numerical_attr    = [ 'Pclass', 'Overall_family_number', 'Fare', 'Age']
categorical_attr = ['Sex', 'Embarked']




full_pipeline = ColumnTransformer([
('num_pipeline', MinMaxScaler(), numerical_attr),
('cat_pipeline', OneHotEncoder(), categorical_attr),
])


X_test   = full_pipeline.fit_transform(test_kaggle_data)
X_test.shape

(418, 9)

In [27]:
predicted = clf_model_1.predict(X_test)
test_kaggle_data['Survived'] = predicted
test_kaggle_data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Overall_family_number,Survived
0,3,male,34.5,7.8292,Q,0,0
1,3,female,47.0,7.0,S,1,0
2,2,male,62.0,9.6875,Q,0,0
3,3,male,27.0,8.6625,S,0,0
4,3,female,22.0,12.2875,S,2,0


In [28]:
test_kaggle_data.drop(['Sex', 'Age', 'Pclass', 'Fare', 'Embarked', 'Overall_family_number'], axis=1, inplace=True)
test_kaggle_data['PassengerId'] = PassengerId
test_kaggle_data.head()

Unnamed: 0,Survived,PassengerId
0,0,892
1,0,893
2,0,894
3,0,895
4,0,896


In [29]:
test_kaggle_data.to_csv('dataset/test_kaggle_data_prediced.csv', index=False)