# Titanic - Machine Learning from Disaster

Predict survival on the Titanic

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the seaborn "darkgrid" theme to all subsequent plots.
# (Assigning to `sns.style` would only create an unused module attribute.)
sns.set_style('darkgrid')

In [2]:
# Read, in order: the fold-annotated training set, the test set,
# and the sample submission file.
train_data, test_data, sample = map(
    pd.read_csv,
    ('TITANIC_Folds.csv', 'test.csv', 'gender_submission.csv'),
)

In [3]:
train_data.shape

(891, 13)

In [4]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,fold
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,4
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,4


In [5]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  fold         891 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [7]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [8]:
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
fold             0
dtype: int64

In [9]:
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [10]:
# Cabin is mostly missing in both frames (687/891 and 327/418 nulls), so discard it.
train_data = train_data.drop(columns='Cabin')
test_data = test_data.drop(columns='Cabin')

In [11]:
# Fill values are learned from the training set only and reused for the test
# set, so both frames are preprocessed identically and no test-set statistic
# influences the features the model is scored on.
age_fill = int(train_data['Age'].mean())        # truncated mean age, computed before filling
embarked_fill = train_data['Embarked'].mode()[0]  # most frequent port
fare_fill = train_data['Fare'].median()

train_data['Age'] = train_data['Age'].fillna(age_fill)
train_data['Embarked'] = train_data['Embarked'].fillna(embarked_fill)
test_data['Age'] = test_data['Age'].fillna(age_fill)
test_data['Fare'] = test_data['Fare'].fillna(fare_fill)

In [12]:
# Verify that imputation left no missing values in the training frame.
# NOTE(review): only a cell's last expression is echoed, so the null counts
# computed on the next line are discarded; info() below reports non-null counts.
train_data.isna().sum()
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
 11  fold         891 non-null    int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 83.7+ KB


In [13]:
# Verify that imputation left no missing values in the test frame.
# NOTE(review): only a cell's last expression is echoed, so the null counts
# computed on the next line are discarded; info() below reports non-null counts.
test_data.isna().sum()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 32.8+ KB


In [14]:
# Tag each row with its origin so the frames can be separated again
# after they are encoded together.
train_data = train_data.assign(isTrain=1)
test_data = test_data.assign(isTrain=0)

In [15]:
# Stack train on top of test so categorical encoding is applied to both at once.
# Test rows have no Survived/fold values, so those columns become float with NaN.
tt = pd.concat((train_data, test_data), axis=0)
tt.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,fold,isTrain
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0.0,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1.0,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,4.0,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,2.0,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,4.0,1
5,6,0.0,3,"Moran, Mr. James",male,29.0,0,0,330877,8.4583,Q,3.0,1
6,7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,S,2.0,1
7,8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,S,2.0,1
8,9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,S,1.0,1
9,10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,C,1.0,1


In [16]:
# Free-text columns are not used as model features.
tt = tt.drop(columns=['Name', 'Ticket'])

In [17]:
tt.shape

(1309, 11)

In [18]:
# One-hot encode the categorical columns; drop_first removes the first level
# of each so the remaining indicators are not redundant.
tt = pd.get_dummies(
    tt,
    columns=['Sex', 'Embarked'],
    drop_first=True,
)

In [19]:
tt.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,fold,isTrain,Sex_male,Embarked_Q,Embarked_S
0,1,0.0,3,22.0,1,0,7.25,0.0,1,1,0,1
1,2,1.0,1,38.0,1,0,71.2833,1.0,1,0,0,0


In [20]:
tt.shape

(1309, 12)

In [21]:
# Separate the encoded frame back into its train and test parts via the isTrain flag.
train_data = tt.loc[tt['isTrain'].eq(1)]
test_data = tt.loc[tt['isTrain'].eq(0)]

In [22]:
train_data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,fold,isTrain,Sex_male,Embarked_Q,Embarked_S
0,1,0.0,3,22.0,1,0,7.25,0.0,1,1,0,1
1,2,1.0,1,38.0,1,0,71.2833,1.0,1,0,0,0


In [23]:
test_data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,fold,isTrain,Sex_male,Embarked_Q,Embarked_S
0,892,,3,34.5,0,0,7.8292,,0,1,1,0
1,893,,3,47.0,1,0,7.0,,0,0,0,1


In [24]:
# The concat with the test rows turned these two columns into floats; cast them back.
train_data = train_data.astype(dict.fromkeys(("fold", "Survived"), "int"))

In [25]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Fare         891 non-null    float64
 7   fold         891 non-null    int64  
 8   isTrain      891 non-null    int64  
 9   Sex_male     891 non-null    uint8  
 10  Embarked_Q   891 non-null    uint8  
 11  Embarked_S   891 non-null    uint8  
dtypes: float64(2), int64(7), uint8(3)
memory usage: 72.2 KB


In [26]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     0 non-null      float64
 2   Pclass       418 non-null    int64  
 3   Age          418 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         418 non-null    float64
 7   fold         0 non-null      float64
 8   isTrain      418 non-null    int64  
 9   Sex_male     418 non-null    uint8  
 10  Embarked_Q   418 non-null    uint8  
 11  Embarked_S   418 non-null    uint8  
dtypes: float64(4), int64(5), uint8(3)
memory usage: 33.9 KB


In [27]:
# Modelling frame: the training rows without the identifier and the origin flag.
# train_data itself keeps every column, as the info() listing below confirms.
my_folds = train_data.drop(columns=['PassengerId', 'isTrain'])
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Fare         891 non-null    float64
 7   fold         891 non-null    int64  
 8   isTrain      891 non-null    int64  
 9   Sex_male     891 non-null    uint8  
 10  Embarked_Q   891 non-null    uint8  
 11  Embarked_S   891 non-null    uint8  
dtypes: float64(2), int64(7), uint8(3)
memory usage: 72.2 KB


In [28]:
# Test-side frame: remove the all-NaN target/fold columns and the origin flag.
# test_data itself is left unchanged.
test = test_data.drop(columns=['Survived', 'isTrain', 'fold'])
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Age          418 non-null    float64
 3   SibSp        418 non-null    int64  
 4   Parch        418 non-null    int64  
 5   Fare         418 non-null    float64
 6   Sex_male     418 non-null    uint8  
 7   Embarked_Q   418 non-null    uint8  
 8   Embarked_S   418 non-null    uint8  
dtypes: float64(2), int64(4), uint8(3)
memory usage: 24.1 KB


In [29]:
# PassengerId must stay on train_data and test_data because it is needed later
# for the submission file; it is excluded only from the model inputs.
useful_features = list(test.columns.drop('PassengerId'))
test = test_data.loc[:, useful_features]

In [30]:
my_folds.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,fold,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,1,0,0,0
2,1,3,26.0,0,0,7.925,4,0,0,1
3,1,1,35.0,1,0,53.1,2,0,0,1
4,0,3,35.0,0,0,8.05,4,1,0,1


In [31]:
test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,34.5,0,0,7.8292,1,1,0
1,3,47.0,1,0,7.0,0,0,1
2,2,62.0,0,0,9.6875,1,1,0
3,3,27.0,0,0,8.6625,1,0,1
4,3,22.0,1,1,12.2875,0,0,1


In [32]:
print(test.shape, my_folds.shape, useful_features)

(418, 8) (891, 10) ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']


In [33]:
print(my_folds.columns.tolist())

['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'fold', 'Sex_male', 'Embarked_Q', 'Embarked_S']


In [34]:
print(test.columns.tolist())

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']


In [35]:
from sklearn.model_selection import LeaveOneGroupOut

In [36]:
# Leave-one-group-out splitter; the number of splits equals the number of
# distinct values in the precomputed `fold` column.
logo = LeaveOneGroupOut()
logo.get_n_splits(groups=my_folds.loc[:, 'fold'])

5

In [37]:
# Feature/target frames for model fitting; the fold ids are kept only in my_folds.
my_folds1 = my_folds.drop(columns='fold')
my_folds1_train = my_folds1.drop(columns='Survived')   # feature matrix
my_folds1_valid = my_folds1['Survived']                # target labels, despite the name
test_X = test.copy()
sample.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [38]:
import numpy as np
my_folds1_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      891 non-null    int64  
 1   Age         891 non-null    float64
 2   SibSp       891 non-null    int64  
 3   Parch       891 non-null    int64  
 4   Fare        891 non-null    float64
 5   Sex_male    891 non-null    uint8  
 6   Embarked_Q  891 non-null    uint8  
 7   Embarked_S  891 non-null    uint8  
dtypes: float64(2), int64(3), uint8(3)
memory usage: 44.4 KB


In [39]:
# Per-row fold id, usable as the group label for leave-one-group-out splitting.
# NOTE(review): `groups` is not referenced by any later visible cell; the grid
# search passes my_folds['fold'] directly.
groups = my_folds['fold']
# Disabled debugging aid: prints the train/test indices and fold ids of each split.
#for i, (train_index, test_index) in enumerate(logo.split(my_folds.drop(columns=['Survived']), my_folds['Survived'], groups)):
#     print(f"Fold {i}:")
#     print(f"  Train: index={train_index}, group={groups[train_index]}")
#     print(f"  Test:  index={test_index}, group={groups[test_index]}")

In [45]:
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [48]:
# Impute -> scale -> SVM, with preprocessing refit inside every CV split.
# The inputs contain no NaN at this point, so the imputer only acts as a safeguard.
pipe = Pipeline([("imputer", SimpleImputer(strategy='median')), ("scaler", StandardScaler()), ("svm", SVC())])
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Cross-validate on the precomputed folds: each fold id is held out once.
grid = GridSearchCV(pipe, param_grid, cv=LeaveOneGroupOut())
grid.fit(my_folds1_train, my_folds1_valid, groups=my_folds['fold'])
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

# Predict with the best pipeline and write the submission file.
test_preds = grid.predict(test)
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': test_preds})
output.to_csv('submission.csv', index=False)  # was 'submission.cv' (extension typo)

Best params:
{'svm__C': 1}

Best cross-validation score: 0.83


## TODO

- Correlation analysis
- Remove outliers
- Feature selection
- Feature engineering: family size, social status from name titles (Mr., Mrs., ...)

In [None]:
# Outliers
# Common method: IQR (interquartile range)
from collections import Counter
def outlier_detection(df, features, min_features=3):
    """Find rows that are IQR outliers in several columns at once.

    A row is an outlier for a column when its value lies above
    ``Q3 + 1.5 * IQR`` or below ``Q1 - 1.5 * IQR``, where Q1 and Q3 are the
    25th and 75th percentiles (midpoint interpolation) and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to inspect.
    features : iterable
        Labels of the numeric columns to check.
    min_features : int, default 3
        Minimum number of columns in which a row must be an outlier
        for it to be reported.

    Returns
    -------
    dict
        Maps each reported index label to the number of columns in which
        that row was an outlier.
    """
    outlier_index = []

    for col in features:
        values = df[col]
        # Series.quantile ignores NaN, so a column with missing values still
        # gets usable bounds instead of NaN bounds that match no row.
        Q1 = values.quantile(0.25, interpolation='midpoint')
        Q3 = values.quantile(0.75, interpolation='midpoint')
        IQR = Q3 - Q1

        # Upper bound & lower bound
        upper = Q3 + 1.5 * IQR
        lower = Q1 - 1.5 * IQR
        outlier_index.extend(df[(values > upper) | (values < lower)].index.to_list())

    # Count how many columns flagged each index label and keep the labels
    # flagged in at least `min_features` columns.
    counts = Counter(outlier_index)
    return {key: value for key, value in counts.items() if value >= min_features}

# Run on the training frame; `data_train` is not defined anywhere in this notebook.
outlier_detection(train_data, ["Age","Fare", "SibSp","Parch"])