In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the training CSV and keep only the columns used in this notebook
# (four feature columns followed by the 'Survived' target column).
df = pd.read_csv('../../train.csv')[
    ['Age', 'Pclass', 'SibSp', 'Parch', 'Survived']
]

In [3]:
# Preview the first rows of the selected columns
df.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [4]:
# (rows, columns) of the frame as loaded, before any rows are removed
df.shape

(891, 5)

In [5]:
# Drop every row that has a missing value in any of the selected columns
df = df.dropna()

In [6]:
# Check the remaining row count and the non-null count of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 714 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Pclass    714 non-null    int64  
 2   SibSp     714 non-null    int64  
 3   Parch     714 non-null    int64  
 4   Survived  714 non-null    int64  
dtypes: float64(1), int64(4)
memory usage: 33.5 KB


In [7]:
# Separating x and y (independent feature columns and dependent target column).
# .copy() makes x an independent frame: new columns are assigned to x later on,
# and doing that on a bare iloc slice of df can raise SettingWithCopyWarning on
# pandas versions that run without copy-on-write.
x=df.iloc[:,0:4].copy()
y=df.iloc[:,-1]

In [8]:
# Preview the feature frame
x.head()

Unnamed: 0,Age,Pclass,SibSp,Parch
0,22.0,3,1,0
1,38.0,1,1,0
2,26.0,3,0,0
3,35.0,1,1,0
4,35.0,3,0,0


In [9]:
# Preview the target series
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [10]:
# Baseline: mean accuracy of a default LogisticRegression over 20
# cross-validation folds, using the features before any construction step.
baseline_scores = cross_val_score(
    LogisticRegression(), x, y, scoring='accuracy', cv=20
)
baseline_scores.mean()

0.6933333333333332

Before any feature construction, the model's mean cross-validated accuracy is 69.3%.

# Applying Feature Construction

In [11]:
# Family size = SibSp + Parch + 1
# (the extra 1 presumably counts the passenger themselves — confirm)
x['Family_size'] = x['SibSp'].add(x['Parch']).add(1)

In [12]:
# Preview x with the new Family_size column
x.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size
0,22.0,3,1,0,2
1,38.0,1,1,0,2
2,26.0,3,0,0,1
3,35.0,1,1,0,2
4,35.0,3,0,0,1


In [14]:
# Map a numeric family size onto a coarse family-type code

def myfunc(num):
    """Return a family-type code for a family size.

    Returns:
        0 when ``num`` equals 1 (alone),
        1 when ``num`` is greater than 1 and at most 4 (small family),
        2 for every other value (large family).
    """
    if num == 1:
        return 0  # alone
    if 1 < num <= 4:
        return 1  # small family
    return 2  # large family

In [15]:
# Trial call: 4 satisfies the `num>1 and num<=4` branch, so the expected result is 1
myfunc(4)

1

In [16]:
# Encode each family size as its family-type code (element-wise via myfunc)
x['Family_type'] = x['Family_size'].map(myfunc)

In [17]:
# Preview x with the new Family_type column
x.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
0,22.0,3,1,0,2,1
1,38.0,1,1,0,2,1
2,26.0,3,0,0,1,0
3,35.0,1,1,0,2,1
4,35.0,3,0,0,1,0


In [20]:
# Keep only Family_type from the family columns: drop the two raw counts
# and the intermediate Family_size it was derived from.
x = x.drop(columns=['SibSp', 'Parch', 'Family_size'])

In [21]:
# Preview the feature columns that remain after the drop
x.head()

Unnamed: 0,Age,Pclass,Family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0


In [24]:
# Re-evaluate after feature construction: mean accuracy of a default
# LogisticRegression over 20 cross-validation folds.
constructed_scores = cross_val_score(
    LogisticRegression(), x, y, scoring='accuracy', cv=20
)
constructed_scores.mean()

0.7003174603174602

# Feature Splitting

In [26]:
# Reload the same CSV with all of its columns for the feature-splitting example
df_again=pd.read_csv('../../train.csv')

In [27]:
# Preview the first rows with every column present
df_again.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [29]:
# We will work on the "Name" column
df_again['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

Each name follows the format: `Surname, Title. First name(s)`

In [36]:
# Split each name on ', ' and take the second piece (everything after the
# surname), then split that piece on '.' and keep the first part as the title.
name_parts = df_again['Name'].str.split(', ', expand=True)
title_parts = name_parts[1].str.split('.', expand=True)
df_again['Title'] = title_parts[0]

In [37]:
# Show each name next to its extracted title
df_again[['Name','Title']]

Unnamed: 0,Name,Title
0,"Braund, Mr. Owen Harris",Mr
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs
2,"Heikkinen, Miss. Laina",Miss
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs
4,"Allen, Mr. William Henry",Mr
...,...,...
886,"Montvila, Rev. Juozas",Rev
887,"Graham, Miss. Margaret Edith",Miss
888,"Johnston, Miss. Catherine Helen ""Carrie""",Miss
889,"Behr, Mr. Karl Howell",Mr


In [42]:
# Mean of 'Survived' per title (i.e. survival rate), highest first
title_survival = df_again.groupby('Title')['Survived'].mean()
title_survival.sort_values(ascending=False)

Title
the Countess    1.000000
Mlle            1.000000
Sir             1.000000
Ms              1.000000
Lady            1.000000
Mme             1.000000
Mrs             0.792000
Miss            0.697802
Master          0.575000
Col             0.500000
Major           0.500000
Dr              0.428571
Mr              0.156673
Jonkheer        0.000000
Rev             0.000000
Don             0.000000
Capt            0.000000
Name: Survived, dtype: float64

Observation: Mrs and Miss have very high survival rates. The titles at the top of the list (the Countess, Mlle, Sir, Ms, Lady, Mme) each cover very few passengers, which is why their survival rate is 100%.