In [133]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [134]:
# Read only the target (Survived) and the four columns used as baseline features.
df = pd.read_csv(
    'train.csv',
    usecols=['Age', 'Pclass', 'SibSp', 'Parch', 'Survived'],
)
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [135]:
# Discard every row that has a missing value in any of the loaded columns.
df = df.dropna()

In [136]:
# Row/column count that remains after dropping rows with missing values.
df.shape

(714, 5)

In [137]:
# Select features and target by column name rather than by position, so the
# selection does not depend on the order in which the columns appear in the CSV.
X = df[['Pclass', 'Age', 'SibSp', 'Parch']]
y = df['Survived']

In [138]:
# Preview the baseline feature matrix.
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch
0,3,22.0,1,0
1,1,38.0,1,0
2,3,26.0,0,0
3,1,35.0,1,0
4,3,35.0,0,0


In [139]:
# Preview the target values.
y.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [140]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((571, 4), (143, 4), (571,), (143,))

In [141]:
# Baseline model: logistic regression on the original columns,
# scored by accuracy on the held-out test split.
clf = LogisticRegression().fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(y_test, pred)

0.6923076923076923

In [142]:
# Mean accuracy over 30 cross-validation folds of the training split.
cross_val_score(
    LogisticRegression(), X_train, y_train, cv=30, scoring='accuracy'
).mean()

0.6936842105263159

# Applying Feature Construction

In [143]:
# Build the engineered frame on a new object so `df` itself stays untouched.
# Family_size = SibSp + Parch + 1 (the +1 presumably counts the passenger themselves).
df_new = df.assign(Family_size=df['SibSp'] + df['Parch'] + 1)
df_new.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Family_size
0,0,3,22.0,1,0,2
1,1,1,38.0,1,0,2
2,1,3,26.0,0,0,1
3,1,1,35.0,1,0,2
4,0,3,35.0,0,0,1


In [144]:
def myfunc(num, small_max=1, medium_max=4):
    """Bucket a family size into an ordinal family-type code.

    Parameters
    ----------
    num : int or float
        Family size to classify.
    small_max : int or float, optional
        Largest size that still maps to code 0. Defaults to 1.
    medium_max : int or float, optional
        Largest size that still maps to code 1. Defaults to 4.

    Returns
    -------
    int
        0 when ``num <= small_max``, 1 when ``small_max < num <= medium_max``,
        and 2 otherwise.

    Notes
    -----
    A NaN input fails both comparisons and therefore falls through to 2.
    """
    if num <= small_max:
        return 0
    # No need to re-check the lower bound: every value that is <= small_max
    # has already returned above.
    if num <= medium_max:
        return 1
    return 2

In [145]:
# Translate each family size into its family-type code, one element at a time.
df_new['Family_type'] = df_new['Family_size'].map(myfunc)
df_new.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Family_size,Family_type
0,0,3,22.0,1,0,2,1
1,1,1,38.0,1,0,2,1
2,1,3,26.0,0,0,1,0
3,1,1,35.0,1,0,2,1
4,0,3,35.0,0,0,1,0


In [146]:
# Family_size and Family_type are derived from SibSp/Parch, so drop the originals.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: executing it a second time no longer raises KeyError.
df_new = df_new.drop(columns=['SibSp', 'Parch'], errors='ignore')
df_new.head()

Unnamed: 0,Survived,Pclass,Age,Family_size,Family_type
0,0,3,22.0,2,1
1,1,1,38.0,2,1
2,1,3,26.0,1,0
3,1,1,35.0,2,1
4,0,3,35.0,1,0


In [147]:
# Pick features and target by name. The positional slice iloc[:, 1:5] depends on
# column order and would silently select the original Pclass/Age/SibSp/Parch
# features if the SibSp/Parch columns had not been dropped first.
engineered_features = ['Pclass', 'Age', 'Family_size', 'Family_type']
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    df_new[engineered_features],
    df_new['Survived'],
    test_size=0.2,     # same hold-out fraction as the baseline split
    random_state=42,   # same seed as the baseline split
)
X_train_new.shape, X_test_new.shape, y_train_new.shape, y_test_new.shape

((571, 4), (143, 4), (571,), (143,))

In [148]:
# Inspect the engineered training features together with their labels.
X_train_new, y_train_new

(     Pclass   Age  Family_size  Family_type
 328       3  31.0            3            1
 73        3  26.0            2            1
 253       3  30.0            2            1
 719       3  33.0            1            0
 666       2  25.0            1            0
 ..      ...   ...          ...          ...
 92        1  46.0            2            1
 134       2  25.0            1            0
 337       1  41.0            1            0
 548       3  33.0            3            1
 130       3  33.0            1            0
 
 [571 rows x 4 columns],
 328    1
 73     0
 253    0
 719    0
 666    0
       ..
 92     0
 134    0
 337    1
 548    0
 130    0
 Name: Survived, Length: 571, dtype: int64)

In [149]:
# Same model as the baseline, now trained on the engineered features
# and scored by accuracy on the matching held-out split.
clf_new = LogisticRegression().fit(X_train_new, y_train_new)
pred_new = clf_new.predict(X_test_new)
accuracy_score(y_test_new, pred_new)

0.6853146853146853

In [150]:
# Mean accuracy over 30 cross-validation folds of the engineered training split.
cross_val_score(
    LogisticRegression(), X_train_new, y_train_new, cv=30, scoring='accuracy'
).mean()

0.7024561403508771

# Applying Feature Splitting

In [151]:
# Reload the file with all of its columns for the feature-splitting example.
df_s = pd.read_csv('train.csv')
df_s.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [152]:
# Look at the raw Name strings before splitting them.
df_s['Name']

Unnamed: 0,Name
0,"Braund, Mr. Owen Harris"
1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,"Heikkinen, Miss. Laina"
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,"Allen, Mr. William Henry"
...,...
886,"Montvila, Rev. Juozas"
887,"Graham, Miss. Margaret Edith"
888,"Johnston, Miss. Catherine Helen ""Carrie"""
889,"Behr, Mr. Karl Howell"


In [153]:
# Names look like "Surname, Title. Given names": keep the text between the
# first ", " and the "." that follows it.
after_surname = df_s['Name'].str.split(', ').str[1]
df_s['Title'] = after_surname.str.split('.').str[0]

In [154]:
# Compare each extracted Title with the Name it was taken from.
df_s[['Title','Name']]

Unnamed: 0,Title,Name
0,Mr,"Braund, Mr. Owen Harris"
1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,Miss,"Heikkinen, Miss. Laina"
3,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,Mr,"Allen, Mr. William Henry"
...,...,...
886,Rev,"Montvila, Rev. Juozas"
887,Miss,"Graham, Miss. Margaret Edith"
888,Miss,"Johnston, Miss. Catherine Helen ""Carrie"""
889,Mr,"Behr, Mr. Karl Howell"


In [158]:
# Mean of Survived for each extracted title, highest first.
(
    df_s
    .groupby('Title')['Survived']
    .mean()
    .sort_values(ascending=False)
)

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
the Countess,1.0
Mlle,1.0
Sir,1.0
Ms,1.0
Lady,1.0
Mme,1.0
Mrs,0.792
Miss,0.697802
Master,0.575
Col,0.5
