# Titanic Dataset

## Lib Imports

In [124]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [125]:
RANDOM_STATE = 123

## Dataset Loading

In [126]:
train_df = pd.read_csv('train.csv')

In [127]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [128]:
train_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [129]:
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Missing Dataset Analysis

In [132]:
empty_age_mask = (train_df['Age'].isna())

In [133]:
empty_age_df = train_df[empty_age_mask]
empty_age_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


### Ticket Investigation

In [88]:
train_df['Ticket'].value_counts()

Ticket
347082              7
1601                7
CA. 2343            7
3101295             6
CA 2144             6
                   ..
PC 17590            1
17463               1
330877              1
373450              1
STON/O2. 3101282    1
Name: count, Length: 681, dtype: int64

In [89]:
ticket_test_mask = (train_df['Ticket'] == 'CA 2144')

In [90]:
train_df[ticket_test_mask]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
59,60,0,3,"Goodwin, Master. William Frederick",male,11.0,5,2,CA 2144,46.9,,S
71,72,0,3,"Goodwin, Miss. Lillian Amy",female,16.0,5,2,CA 2144,46.9,,S
386,387,0,3,"Goodwin, Master. Sidney Leonard",male,1.0,5,2,CA 2144,46.9,,S
480,481,0,3,"Goodwin, Master. Harold Victor",male,9.0,5,2,CA 2144,46.9,,S
678,679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43.0,1,6,CA 2144,46.9,,S
683,684,0,3,"Goodwin, Mr. Charles Edward",male,14.0,5,2,CA 2144,46.9,,S


In [91]:
train_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

### Name Investigation

In [92]:
passenger_names = train_df['Name'].str.split(',')
passenger_names

0                             [Braund,  Mr. Owen Harris]
1      [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                              [Heikkinen,  Miss. Laina]
3        [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                            [Allen,  Mr. William Henry]
                             ...                        
886                             [Montvila,  Rev. Juozas]
887                      [Graham,  Miss. Margaret Edith]
888          [Johnston,  Miss. Catherine Helen "Carrie"]
889                             [Behr,  Mr. Karl Howell]
890                               [Dooley,  Mr. Patrick]
Name: Name, Length: 891, dtype: object

In [93]:
titles = [passenger_name[1].lstrip().split(' ')[0][:-1] for passenger_name in passenger_names]

In [94]:
set(titles)

{'Capt',
 'Col',
 'Don',
 'Dr',
 'Jonkheer',
 'Lady',
 'Major',
 'Master',
 'Miss',
 'Mlle',
 'Mme',
 'Mr',
 'Mrs',
 'Ms',
 'Rev',
 'Sir',
 'th'}

## Feature Engineering

In [95]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [96]:
def preprocess_dataset(df):
    """_summary_

    Args:
        df (_type_): _description_
    """
    # Combine Sib + Parch
    df['Family'] = df['SibSp'] + df['Parch']
    
    # Fill Missing Data
    mean_age = df['Age'].mean()
    df['Age'] = df['Age'].fillna(mean_age)
    mean_fare = df['Fare'].mean()
    df['Fare'] = df['Fare'].fillna(mean_fare)
    
    # Numerical Scaling
    min_max_scaler = MinMaxScaler()
    NUMERICAL_COLUMNS = ['Age', 'Fare', 'Family']
    df[NUMERICAL_COLUMNS] = min_max_scaler.fit_transform(df[NUMERICAL_COLUMNS])
    
    # Label Encoding
    label_encoder = LabelEncoder()
    df['Sex'] = label_encoder.fit_transform(df['Sex'])
    df['Embarked'] = label_encoder.fit_transform(df['Embarked'])
    
    #Drop Columns
    COLUMNS_TO_DROP = ['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch', 'Cabin']
    df = df.drop(COLUMNS_TO_DROP, axis=1)
    return df

In [97]:
train_df = preprocess_dataset(train_df)

In [98]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Family
0,0,3,1,22.0,0.014151,2,1
1,1,1,0,38.0,0.139136,0,1
2,1,3,0,26.0,0.015469,2,0
3,1,1,0,35.0,0.103644,2,1
4,0,3,1,35.0,0.015713,2,0


In [99]:
train_df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
Embarked    0
Family      0
dtype: int64

In [100]:
len(train_df)

891

## X and y split

In [101]:
Y_COLUMN = 'Survived'

In [102]:
y = train_df[Y_COLUMN]
X = train_df.drop(Y_COLUMN, axis=1)

In [103]:
X.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family
0,3,1,22.0,0.014151,2,1
1,1,0,38.0,0.139136,0,1
2,3,0,26.0,0.015469,2,0
3,1,0,35.0,0.103644,2,1
4,3,1,35.0,0.015713,2,0


In [104]:
y.value_counts(normalize=True)

Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64

## Train Test Split

In [105]:
from sklearn.model_selection import train_test_split

In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

## Model Selection

In [107]:
from sklearn.linear_model import LogisticRegression

In [108]:
lr_model = LogisticRegression()

In [109]:
lr_model.fit(X_train, y_train)

## Model Evaluation

In [110]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support

In [111]:
def get_classification_metrics(y_true, y_pred, name):
    precision, recall, fscore, _ = precision_recall_fscore_support(y_true, y_pred)
    return {
        'name': name,
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred)
    }

In [112]:
y_pred = lr_model.predict(X_test)

In [113]:
get_classification_metrics(y_test, y_pred, 'Logistic Regression')

{'name': 'Logistic Regression',
 'accuracy': 0.7910447761194029,
 'precision': np.float64(0.7142857142857143),
 'recall': np.float64(0.7142857142857143),
 'f1_score': np.float64(0.7142857142857143)}

## Validation Set

In [114]:
val_df = pd.read_csv('test.csv')

In [115]:
val_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [116]:
val_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [117]:
X_val = preprocess_dataset(val_df)

In [118]:
X_val

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family
0,3,1,34.50000,0.015282,1,0
1,3,0,47.00000,0.013663,2,1
2,2,1,62.00000,0.018909,1,0
3,3,1,27.00000,0.016908,2,0
4,3,0,22.00000,0.023984,2,2
...,...,...,...,...,...,...
413,3,1,30.27259,0.015713,2,0
414,1,0,39.00000,0.212559,0,0
415,3,1,38.50000,0.014151,2,0
416,3,1,30.27259,0.015713,2,0


In [119]:
y_val = lr_model.predict(X_val)

In [120]:
submission = pd.DataFrame({'PassengerId': val_df['PassengerId'], 'Survived': y_val})
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [121]:
submission.to_csv('submission.csv', index=False)