# Import

In [1]:
import numpy    as np
import pandas   as pd
import seaborn  as sns
import matplotlib.pyplot as plt
import sklearn  as skl

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import linear_model    # LogisticRegression
from sklearn import set_config

# Render estimators and pipelines as diagrams when they are displayed.
set_config(display='diagram')

# Print the library versions this notebook was executed with.
for label, module in (("Pandas  ", pd), ("Sklearn ", skl)):
    print(label, module.__version__)

Pandas   1.3.0
Sklearn  0.24.2


# Load Data

In [2]:
# Load both Kaggle splits the same way, keyed by passenger id.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in ('titanic/train.csv', 'titanic/test.csv')
)

In [3]:
df_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df_test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Check Missing

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [7]:
# Fraction of missing values per column (mean of the boolean null mask).
df_train.isna().mean()

Survived    0.000000
Pclass      0.000000
Name        0.000000
Sex         0.000000
Age         0.198653
SibSp       0.000000
Parch       0.000000
Ticket      0.000000
Fare        0.000000
Cabin       0.771044
Embarked    0.002245
dtype: float64

In [8]:
# Fraction of missing values per column (mean of the boolean null mask).
df_test.isna().mean()

Pclass      0.000000
Name        0.000000
Sex         0.000000
Age         0.205742
SibSp       0.000000
Parch       0.000000
Ticket      0.000000
Fare        0.002392
Cabin       0.782297
Embarked    0.000000
dtype: float64

# Feature Extraction

In [9]:
df_train

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Name

In [10]:
def get_title(x):
    """Extract the title (e.g. ``'Mr'``) from a ``'Surname, Title. Given names'`` string.

    Takes the text after the last comma and before the first following period.
    Surrounding whitespace is stripped so the result is ``'Mr'`` rather than
    ``' Mr'`` (the space that follows the comma in the raw name).

    Parameters
    ----------
    x : str
        Full passenger name.

    Returns
    -------
    str
        The title without leading or trailing whitespace.
    """
    return x.split(',')[-1].split('.')[0].strip()

In [11]:
# Replace the full name by its extracted title in both splits.
for frame in (df_train, df_test):
    frame['Name'] = frame['Name'].map(get_title)

In [12]:
df_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,Mr,male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,Mrs,female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,Miss,female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,Mrs,female,35.0,1,0,113803,53.1,C123,S
5,0,3,Mr,male,35.0,0,0,373450,8.05,,S


In [13]:
df_test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,Mr,male,34.5,0,0,330911,7.8292,,Q
893,3,Mrs,female,47.0,1,0,363272,7.0,,S
894,2,Mr,male,62.0,0,0,240276,9.6875,,Q
895,3,Mr,male,27.0,0,0,315154,8.6625,,S
896,3,Mrs,female,22.0,1,1,3101298,12.2875,,S


##  Sex

In [14]:
### Looping over a pandas column with a Python `for` loop is slow, and this draft produces None values (it never returns anything).

# def sex_to_binary(col):
#     for x in col:
#         if x == 'male':
#             x = 1
#         else:
#             x = 0


In [15]:
# Encode sex from the lower-cased first letter of the value: male -> 1, female -> 0.
sex_to_binary = {'m' : 1, 'f' : 0}

for frame in (df_train, df_test):
    first_letter = frame['Sex'].str[0].str.lower()
    frame['Sex'] = first_letter.map(sex_to_binary)

In [16]:
df_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,Mr,1,22.0,1,0,A/5 21171,7.25,,S
2,1,1,Mrs,0,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,Miss,0,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,Mrs,0,35.0,1,0,113803,53.1,C123,S
5,0,3,Mr,1,35.0,0,0,373450,8.05,,S


In [17]:
df_test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,Mr,1,34.5,0,0,330911,7.8292,,Q
893,3,Mrs,0,47.0,1,0,363272,7.0,,S
894,2,Mr,1,62.0,0,0,240276,9.6875,,Q
895,3,Mr,1,27.0,0,0,315154,8.6625,,S
896,3,Mrs,0,22.0,1,1,3101298,12.2875,,S


## Age

In [18]:
# to_int = lambda x : x.astype(int)

# df_train['Age'] = df_train['Age'].map(to_int)

In [19]:
# age_imp = impute.SimpleImputer(strategy='mean')

In [20]:
# df_train['Age'] = age_imp.fit_transform(df_train['Age'])

In [21]:
df_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,Mr,1,22.0,1,0,A/5 21171,7.25,,S
2,1,1,Mrs,0,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,Miss,0,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,Mrs,0,35.0,1,0,113803,53.1,C123,S
5,0,3,Mr,1,35.0,0,0,373450,8.05,,S


In [22]:
# df_train['Age'] = df_train['Age'].astype(int)

## Embarked

In [23]:
df_train.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

# Drop Columns

In [24]:
df_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,Mr,1,22.0,1,0,A/5 21171,7.25,,S
2,1,1,Mrs,0,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,Miss,0,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,Mrs,0,35.0,1,0,113803,53.1,C123,S
5,0,3,Mr,1,35.0,0,0,373450,8.05,,S


In [25]:
df_test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,Mr,1,34.5,0,0,330911,7.8292,,Q
893,3,Mrs,0,47.0,1,0,363272,7.0,,S
894,2,Mr,1,62.0,0,0,240276,9.6875,,Q
895,3,Mr,1,27.0,0,0,315154,8.6625,,S
896,3,Mrs,0,22.0,1,1,3101298,12.2875,,S


In [26]:
# Columns excluded from the model inputs in both splits.
unused_columns = ['Ticket', 'Fare', 'Cabin']

# Target and features for the labelled data.
y = df_train['Survived']
x = df_train.drop(columns=['Survived'] + unused_columns)

# The test split has no 'Survived' column, so only the unused columns are dropped.
x_test = df_test.drop(columns=unused_columns)

In [27]:
y.head()

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

# Validation

In [28]:
# Hold out 20% of the labelled rows for validation, stratified on the target,
# with a fixed seed so the split is reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=1)
x_train, x_val, y_train, y_val = model_selection.train_test_split(x, y, **split_options)

# Preprocessing pipeline

In [29]:
# Columns handled by the string branch of the preprocessing (imputed, then one-hot encoded).
str_vars = [
    'Name',
    'Embarked',
]

# Columns handled by the numeric branch of the preprocessing (imputed, then scaled).
num_vars = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
]

In [30]:
# Numeric branch: fill missing values with the column mean, then standardise.
num_steps = [
    ('impute', impute.SimpleImputer(strategy='mean', add_indicator=False)),
    ('scaler', preprocessing.StandardScaler()),
]
num_prepoccessing = pipeline.Pipeline(steps=num_steps)

# String branch: fill missing values with a constant placeholder, then one-hot
# encode; categories unseen during fit are ignored at transform time.
str_steps = [
    ('impute', impute.SimpleImputer(strategy='constant', add_indicator=False)),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore')),
]
str_prepoccessing = pipeline.Pipeline(steps=str_steps)

# Route each column group to its branch.
prepro = compose.ColumnTransformer(
    transformers=[
        ('num', num_prepoccessing, num_vars),
        ('str', str_prepoccessing, str_vars),
    ]
)

prepro

In [31]:
# Fit the preprocessing on the training split only ...
x_train_prepro = prepro.fit_transform(x_train)

# ... then apply the already-fitted transformer to the validation and test splits.
x_val_prepro, x_test_prepro = (
    prepro.transform(split) for split in (x_val, x_test)
)

In [32]:
pd.DataFrame(x_train_prepro)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.836300,-1.367833,0.000000,-0.464873,-0.482466,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.836300,-1.367833,-1.435646,3.113390,1.980012,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.836300,-1.367833,-0.897476,0.429693,-0.482466,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.836300,0.731083,0.000000,-0.464873,-0.482466,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-1.535993,-1.367833,-2.127579,0.429693,1.980012,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,0.836300,0.731083,0.000000,-0.464873,-0.482466,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
708,0.836300,0.731083,0.101983,-0.464873,-0.482466,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
709,-0.349846,0.731083,0.486390,-0.464873,-0.482466,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
710,-1.535993,-1.367833,0.870797,-0.464873,-0.482466,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [33]:
# Logistic regression with a raised iteration cap and a fixed seed.
logreg_params = {'max_iter': 2000, 'random_state': 1}
model = linear_model.LogisticRegression(**logreg_params)
model.fit(x_train_prepro, y_train)

# Model pipeline

In [34]:
# Chain the preprocessing and the classifier into a single estimator.
full_model_steps = [
    ('Preproccessing', prepro),
    ('Model', model),
]
full_model = pipeline.Pipeline(full_model_steps)
full_model

In [38]:
# Predictions for the unlabelled test split; used later to build the submission table.
y_pred = model.predict(x_test_prepro)

# Validation metrics need predictions made on the validation split.
# `y_pred` has one row per test passenger, so comparing it with `y_val`
# (as the previously commented-out metric calls did) fails with a length mismatch.
y_val_pred = model.predict(x_val_prepro)

print("Accuracy:\t", metrics.accuracy_score(y_val, y_val_pred)*100)
print("Balanced accuracy:\t", metrics.balanced_accuracy_score(y_val, y_val_pred)*100)

Accuracy:	 83.24022346368714


# Save Submission

In [39]:
# Wrap the test-set predictions in a single-column frame.
sub_df = pd.DataFrame(data=y_pred)

In [40]:
sub_df

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,1
...,...
413,0
414,1
415,0
416,0


In [41]:
# Name the prediction column and key the rows by the test split's index.
sub_df = (
    sub_df
    .rename(columns={0 : 'Survived'})
    .set_index(x_test.index)
)

In [42]:
sub_df

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0


In [43]:
#sub_df.to_csv('titanic_sub.csv')