# Feature Construction

### Feature construction is the process of building intermediate features from the original descriptors in a dataset. The aim is to build more effective features for a machine learning or data mining task.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Load only the five columns this notebook works with.
data = pd.read_csv(
    "../1_Features_Transformation/09_Handle_Missing_values/train.csv",
    usecols=["Age", "Pclass", "SibSp", "Parch", "Survived"],
)

In [3]:
# Drop every row that has a missing value in any loaded column.
# Reassigning the result (instead of inplace=True) keeps the step explicit.
data = data.dropna()
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [4]:
# Separate the target (Survived) from the feature matrix (all other columns).
y = data["Survived"]
x = data.drop(columns="Survived")

# Apply the cross-validation score (baseline on the original features)

In [5]:
# Mean accuracy across 20 cross-validation folds using the original features.
cross_val_score(LogisticRegression(), x, y, scoring="accuracy", cv=20).mean()

0.6933333333333332

In [6]:
# Work on an independent copy so later edits do not touch `data`.
data1 = data.copy(deep=True)

# Create new feature: Family_Size

In [7]:
# Family_Size = SibSp + Parch + 1 (the +1 presumably counts the person themselves).
# Both operands are read from data1 so the cell depends on a single frame,
# rather than mixing data1.SibSp with data.Parch.
data1["Family_Size"] = data1["SibSp"] + data1["Parch"] + 1

In [8]:
# Preview the first rows to check the new Family_Size column.
data1.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Family_Size
0,0,3,22.0,1,0,2
1,1,1,38.0,1,0,2
2,1,3,26.0,0,0,1
3,1,1,35.0,1,0,2
4,0,3,35.0,0,0,1


In [9]:
def func(n):
    """Map a family size to an ordinal family-type code.

    Args:
        n: Family size (numeric).

    Returns:
        0 if n == 1 (alone), 1 if 1 < n <= 4 (small family),
        2 for every other value (large family).
    """
    if n == 1:
        return 0  # alone
    if 1 < n <= 4:
        return 1  # small family
    return 2  # large family

# Create new feature: Family_Type

In [10]:
# Bucket each Family_Size value into its ordinal category using func.
data1["Family_Type"] = data1["Family_Size"].apply(func)

In [11]:
# Show the first 10 rows to compare Family_Type against Family_Size.
data1.head(10)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Family_Size,Family_Type
0,0,3,22.0,1,0,2,1
1,1,1,38.0,1,0,2,1
2,1,3,26.0,0,0,1,0
3,1,1,35.0,1,0,2,1
4,0,3,35.0,0,0,1,0
6,0,1,54.0,0,0,1,0
7,0,3,2.0,3,1,5,2
8,1,3,27.0,0,2,3,1
9,1,2,14.0,1,0,2,1
10,1,3,4.0,1,1,3,1


# Now we don't need the SibSp, Parch, and Family_Size columns

In [12]:
# Remove the columns that were used to build Family_Type.
# Reassigning the result (instead of inplace=True) keeps the step explicit.
data1 = data1.drop(columns=["SibSp", "Parch", "Family_Size"])

In [13]:
# Preview the frame after dropping SibSp, Parch and Family_Size.
data1.head()

Unnamed: 0,Survived,Pclass,Age,Family_Type
0,0,3,22.0,1
1,1,1,38.0,1
2,1,3,26.0,0
3,1,1,35.0,1
4,0,3,35.0,0


# Now apply the cross-validation score to the engineered features

In [14]:
# Target and feature matrix taken from the engineered frame.
Y = data1["Survived"]
X = data1.drop(columns="Survived")

In [15]:
# Mean accuracy across 20 cross-validation folds using the engineered features.
cross_val_score(LogisticRegression(), X, Y, scoring="accuracy", cv=20).mean()

0.7003174603174602