In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('train.csv', usecols=['Age', 'Pclass', 'SibSp', 'Parch', 'Survived'])

In [3]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [4]:
df.isnull().mean() * 100

Survived     0.00000
Pclass       0.00000
Age         19.86532
SibSp        0.00000
Parch        0.00000
dtype: float64

In [5]:
df.dropna(inplace=True)

In [6]:
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [7]:
np.mean(cross_val_score(LogisticRegression(), X, y, cv=10))

np.float64(0.6921165884194054)

## Applying Feature Construction

In [9]:
df2 = pd.read_csv('train.csv', usecols=['Age', 'Pclass', 'SibSp', 'Parch', 'Survived'])

In [10]:
df2['Family_type'] = df['SibSp'] + df['Parch']

In [11]:
df2.sample(5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Family_type
795,0,2,39.0,0,0,0.0
813,0,3,6.0,4,2,6.0
778,0,3,,0,0,
280,0,3,65.0,0,0,0.0
405,0,2,34.0,1,0,1.0


In [12]:
df2['Family_type'] = np.where(df2['Family_type'] == 0,
                              0,
                              np.where((df2['Family_type'] >= 1) & (df2['Family_type'] <= 4),
                                       1,
                                       2))

In [13]:
df2.sample(10)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Family_type
722,0,2,34.0,0,0,0
349,0,3,42.0,0,0,0
518,1,2,36.0,1,0,1
856,1,1,45.0,1,1,1
708,1,1,22.0,0,0,0
425,0,3,,0,0,2
590,0,3,35.0,0,0,0
26,0,3,,0,0,2
294,0,3,24.0,0,0,0
812,0,2,35.0,0,0,0


In [14]:
df2.drop(columns=['SibSp', 'Parch'], inplace=True)
df2.dropna(inplace=True)

In [15]:
df2.head()

Unnamed: 0,Survived,Pclass,Age,Family_type
0,0,3,22.0,1
1,1,1,38.0,1
2,1,3,26.0,0
3,1,1,35.0,1
4,0,3,35.0,0


In [16]:
X = df2.iloc[:,1:]
y = df2.iloc[:,0]

In [17]:
np.mean(cross_val_score(LogisticRegression(), X, y, cv=10))

np.float64(0.7005281690140845)

## Feature Splitting

In [19]:
df = pd.read_csv('train.csv')

In [20]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [21]:
df['Title'] = df['Name'].str.split(',', expand=True)[1].str.split('.', expand=True)[0]

In [22]:
df[['Title', 'Name']]

Unnamed: 0,Title,Name
0,Mr,"Braund, Mr. Owen Harris"
1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,Miss,"Heikkinen, Miss. Laina"
3,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,Mr,"Allen, Mr. William Henry"
...,...,...
886,Rev,"Montvila, Rev. Juozas"
887,Miss,"Graham, Miss. Margaret Edith"
888,Miss,"Johnston, Miss. Catherine Helen ""Carrie"""
889,Mr,"Behr, Mr. Karl Howell"


In [23]:
df.groupby('Title')['Survived'].mean().sort_values()

Title
Capt            0.000000
Don             0.000000
Jonkheer        0.000000
Rev             0.000000
Mr              0.156673
Dr              0.428571
Col             0.500000
Major           0.500000
Master          0.575000
Miss            0.697802
Mrs             0.792000
Lady            1.000000
Mme             1.000000
Mlle            1.000000
Ms              1.000000
Sir             1.000000
the Countess    1.000000
Name: Survived, dtype: float64

In [24]:
df['Is_married'] = np.where(df['Title'] == 'Mrs', 1, 0)

In [58]:
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Is_married
155,156,0,1,"Williams, Mr. Charles Duane",male,51.0,0,1,PC 17597,61.3792,,C,Mr,0
273,274,0,1,"Natsch, Mr. Charles H",male,37.0,0,1,PC 17596,29.7,C118,C,Mr,0
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S,Mr,0
369,370,1,1,"Aubart, Mme. Leontine Pauline",female,24.0,0,0,PC 17477,69.3,B35,C,Mme,0
855,856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,392091,9.35,,S,Mrs,0


In [25]:
df['Is_married'].value_counts()

Is_married
0    891
Name: count, dtype: int64