In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import seaborn as sns

In [3]:
df = pd.read_csv('train.csv', usecols=['Age', 'Pclass', "SibSp", 'Parch', "Survived"])
df.head()


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [4]:
df.dropna(inplace=True)

In [5]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [6]:
df.shape

(714, 5)

In [7]:
y = df['Survived']
X = df.iloc[:, 1:]
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch
0,3,22.0,1,0
1,1,38.0,1,0
2,3,26.0,0,0
3,1,35.0,1,0
4,3,35.0,0,0


In [8]:
np.mean(cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=15))

0.696306146572104

## Apply Feature Construction

In [9]:
X['Family_size'] = X['SibSp'] + X['Parch'] + 1

In [10]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size
0,3,22.0,1,0,2
1,1,38.0,1,0,2
2,3,26.0,0,0,1
3,1,35.0,1,0,2
4,3,35.0,0,0,1


In [13]:
def colTrfFunc(num:int):
    if num == 1:
        return 0 # alone
    elif num > 1 and num <= 4:
        return 1 # family
    else:
        return 2 # large family

In [14]:
colTrfFunc(4)

1

In [15]:
X['Family_type'] = X['Family_size'].apply(colTrfFunc)
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size,Family_type
0,3,22.0,1,0,2,1
1,1,38.0,1,0,2,1
2,3,26.0,0,0,1,0
3,1,35.0,1,0,2,1
4,3,35.0,0,0,1,0


In [16]:
X.drop(columns=['SibSp', 'Parch', 'Family_size'], inplace=True)
X.head()

Unnamed: 0,Pclass,Age,Family_type
0,3,22.0,1
1,1,38.0,1
2,3,26.0,0
3,1,35.0,1
4,3,35.0,0


In [18]:
np.mean(cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=20))

0.7003174603174602

## Feature Splitting

In [19]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
df['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [25]:
df['Title'] = df['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

In [26]:
df[['Title', "Name"]]

Unnamed: 0,Title,Name
0,Mr,"Braund, Mr. Owen Harris"
1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,Miss,"Heikkinen, Miss. Laina"
3,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,Mr,"Allen, Mr. William Henry"
...,...,...
886,Rev,"Montvila, Rev. Juozas"
887,Miss,"Graham, Miss. Margaret Edith"
888,Miss,"Johnston, Miss. Catherine Helen ""Carrie"""
889,Mr,"Behr, Mr. Karl Howell"
