# Feature Engineering

## Redundancy and useless information

In [31]:
# We would need these libraries to manage our dataset
# Numpy: used for large, multi-dimensional arrays and matrices, and for high-level mathematical functions
# Pandas: used for data manipulation and analysis
# matplotlib: used for visualisation and plotting graph/image/etc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [32]:
# The iris dataset ships with scikit-learn:
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets
from sklearn.datasets import load_iris

# Load the dataset bundle (measurements, labels and column names).
iris = load_iris()

# Build one pandas DataFrame out of it so it is easier to display:
# the measurement columns come first, the class label is appended as the last column.
# (It is not necessary to understand the details of this step.)
iris_values = np.column_stack([iris['data'], iris['target']])
iris_columns = iris['feature_names'] + ['target']
dataset = pd.DataFrame(data=iris_values, columns=iris_columns)

In [33]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column of `dataset` (the target included) and wrap the
# resulting array back into a DataFrame that keeps the original column names.
nm = MinMaxScaler()
dataset_norm = pd.DataFrame(nm.fit_transform(dataset), columns=dataset.columns)

We can check which variables are collinear with each other using the `corr` function from pandas. This can help us remove unnecessary features. For example, `petal length (cm)` is very strongly correlated with `petal width (cm)` (about 0.96), so we could remove one of them.

In [34]:
# Pairwise Pearson correlation (the pandas default) between all columns.
dataset_norm.corr(method='pearson')

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
sepal length (cm),1.0,-0.11757,0.871754,0.817941,0.782561
sepal width (cm),-0.11757,1.0,-0.42844,-0.366126,-0.426658
petal length (cm),0.871754,-0.42844,1.0,0.962865,0.949035
petal width (cm),0.817941,-0.366126,0.962865,1.0,0.956547
target,0.782561,-0.426658,0.949035,0.956547,1.0


We can check for duplicated rows very easily, again using pandas.

In [36]:
# Number of rows that repeat an earlier row (the first occurrence is not counted).
dataset.duplicated(keep='first').sum()

1

In [37]:
# Keep only the rows that are not a repeat of an earlier row.
is_repeat = dataset.duplicated()
dataset = dataset.loc[~is_repeat]

The `describe` function gives us basic statistics for every column containing numerical information. It includes the `std`, which we can use to determine whether a column carries enough information.

In [38]:
# Summary statistics of the normalised columns (quartiles are the pandas defaults).
# NOTE: `dataset_norm` was built before the duplicated row was dropped from
# `dataset`, so these statistics still cover all 150 rows.
dataset_norm.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,0.428704,0.440556,0.467458,0.458056,0.5
std,0.230018,0.181611,0.299203,0.317599,0.409616
min,0.0,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333,0.0
50%,0.416667,0.416667,0.567797,0.5,0.5
75%,0.583333,0.541667,0.694915,0.708333,1.0
max,1.0,1.0,1.0,1.0,1.0


Check the number of unique values in each column to make sure you don't have any constant column. You can also check the number of unique values per row, if necessary.

In [39]:
# Distinct values per column; a column with a single distinct value would be constant.
dataset.nunique(axis=0)

sepal length (cm)    35
sepal width (cm)     23
petal length (cm)    43
petal width (cm)     22
target                3
dtype: int64

In [40]:
# Distinct values within each row, then the smallest and largest of those counts.
unique = dataset.nunique(axis='columns')
(unique.min(), unique.max())

(4, 5)

Check that there is no `NaN` cell, which is pandas' way of representing missing values.

In [10]:
# Number of missing cells in each column.
dataset.isna().sum(axis=0)

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

In [41]:
# Load the Titanic dataset.
# The dataset must be in CSV format: a delimited text file that uses commas to
# separate values, with one data record per line.
# `header=0` tells pandas that row 0 (the first line of the file) holds the
# name of each column.
dataset = pd.read_csv('./dataset/titanic.csv', header=0)

In [42]:
# Translating the Sex variable to a numerical features
dataset['Sex'] = dataset['Sex'] == 'male'

In [43]:
# Count rows that are exact repeats of an earlier row, comparing all columns.
dataset.duplicated(subset=None, keep='first').sum()

0

In [44]:
# Correlate only the numeric/boolean columns. The frame still contains the
# string column `Name`; recent pandas versions raise on non-numeric columns
# instead of silently leaving them out, so we select the usable columns explicitly.
dataset.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
Survived,1.0,-0.336528,-0.542152,-0.059665,-0.037082,0.080097,0.256179
Pclass,-0.336528,1.0,0.129507,-0.391492,0.085026,0.020252,-0.548919
Sex,-0.542152,0.129507,1.0,0.091875,-0.113249,-0.244337,-0.181137
Age,-0.059665,-0.391492,0.091875,1.0,-0.297669,-0.193741,0.112329
Siblings/Spouses Aboard,-0.037082,0.085026,-0.113249,-0.297669,1.0,0.414244,0.158839
Parents/Children Aboard,0.080097,0.020252,-0.244337,-0.193741,0.414244,1.0,0.21547
Fare,0.256179,-0.548919,-0.181137,0.112329,0.158839,0.21547,1.0


In [45]:
# Distinct (non-missing) values in each column.
dataset.nunique(axis=0, dropna=True)

Survived                     2
Pclass                       3
Name                       887
Sex                          2
Age                         89
Siblings/Spouses Aboard      7
Parents/Children Aboard      7
Fare                       248
dtype: int64

In [46]:
# Distinct passenger classes together with how many rows fall in each of them.
np.unique(dataset['Pclass'], return_index=False, return_inverse=False, return_counts=True)

(array([1, 2, 3]), array([216, 184, 487]))

## Feature selection

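We first remove all the non-numerical features and normalize the dataset. (Note that the numerical `Fare` column is also left out of the list of candidate features below.)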

In [47]:
# Candidate features for the selection procedure, in their evaluation order.
col = [
    'Pclass',
    'Sex',
    'Age',
    'Siblings/Spouses Aboard',
    'Parents/Children Aboard',
]

In [48]:
# Min-max scale the candidate features, then rebuild `dataset` with the
# unscaled `Survived` target as first column followed by the scaled features.
dataset_norm = nm.fit_transform(dataset[col])
dataset = pd.DataFrame(
    np.column_stack([dataset[['Survived']], dataset_norm]),
    columns=['Survived'] + col,
)

We'll use a logistic regression, as it is the only classification model we have seen in class so far.

In [49]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
from sklearn.linear_model import LogisticRegression
# A single classifier instance, created with the library defaults; the cells
# below re-fit this same object for every feature subset they evaluate.
reg = LogisticRegression()

Train one model per feature, each on a dataset composed of that single feature, and record its accuracy.

In [19]:
from sklearn.metrics import accuracy_score

# First round: fit one single-feature model per candidate and score it on the
# very rows it was trained on (training accuracy, not a held-out estimate).
survived = dataset['Survived']
acc = []
for feature in col:
    single = dataset[[feature]]      # one-column frame for this candidate
    reg.fit(single, survived)        # train the model with this feature only
    acc.append(accuracy_score(survived, reg.predict(single)))  # record the accuracy
list(zip(col, acc))

[('Pclass', 0.677564825253664),
 ('Sex', 0.7857948139797069),
 ('Age', 0.6144306651634723),
 ('Siblings/Spouses Aboard', 0.6144306651634723),
 ('Parents/Children Aboard', 0.6054114994363021)]

`Sex` is the best feature, so we remove it from the list of features to evaluate and add it to the list of selected features.

In [50]:
# Start the list of selected features with `Sex` and take it out of the candidates.
# The feature is looked up by name rather than by a hard-coded position, so the
# cell raises instead of silently removing another feature if `col` has changed
# (e.g. because the cell was already run once).
selected = ['Sex']
col.pop(col.index('Sex'))

'Sex'

We repeat the first step, but now, instead of datasets composed of one feature, we are testing datasets composed of two features:
- `Sex`
- one other candidate feature

In [51]:
# Pair every remaining candidate with the features selected so far and record
# the training accuracy of each combination.
acc = []
for candidate in col:
    features = selected + [candidate]  # Sex + the tested feature
    reg.fit(dataset[features], dataset['Survived'])
    predict = reg.predict(dataset[features])
    acc.append(accuracy_score(dataset['Survived'], predict))
list(zip(col, acc))

[('Pclass', 0.7857948139797069),
 ('Age', 0.7857948139797069),
 ('Siblings/Spouses Aboard', 0.7891770011273957),
 ('Parents/Children Aboard', 0.7857948139797069)]

The best feature now is `Siblings/Spouses Aboard`. It doesn't add much information, however.

In [22]:
# Move the best feature of this round from the candidates to the selected list.
# Looked up by name rather than by position, so a re-run or a reordered `col`
# raises instead of silently selecting a different feature.
selected.append(col.pop(col.index('Siblings/Spouses Aboard')))

Again, we repeat the previous step, but now our selected features are:
- `Sex`
- `Siblings/Spouses Aboard`

and we add one candidate feature to test during each iteration.

In [23]:
# Score each remaining candidate when added to the selected features.
# `fit` returns the fitted estimator, so fitting and predicting can be chained.
y = dataset['Survived']
acc = []
for candidate in col:
    features = [*selected, candidate]
    X = dataset[features]
    acc.append(accuracy_score(y, reg.fit(X, y).predict(X)))
list(zip(col, acc))

[('Pclass', 0.7925591882750845),
 ('Age', 0.7891770011273957),
 ('Parents/Children Aboard', 0.7891770011273957)]

In [24]:
# `Pclass` scored best in the previous round: move it from the candidates to
# the selected features. Looked up by name rather than by position, so a re-run
# or a reordered `col` raises instead of silently selecting a different feature.
selected.append(col.pop(col.index('Pclass')))

In [25]:
# Same evaluation as before with the enlarged set of selected features:
# training accuracy of (selected features + one candidate) for each candidate.
acc = []
for name in col:
    features = selected + [name]
    model = reg.fit(dataset[features], dataset['Survived'])
    acc.append(accuracy_score(dataset['Survived'], model.predict(dataset[features])))
list(zip(col, acc))

[('Age', 0.790304396843292), ('Parents/Children Aboard', 0.7925591882750845)]

In [26]:
# `Parents/Children Aboard` scored best in the previous round: move it from the
# candidates to the selected features. Looked up by name rather than by position,
# so a re-run or a reordered `col` raises instead of silently selecting a
# different feature.
selected.append(col.pop(col.index('Parents/Children Aboard')))

In [27]:
# Last round: evaluate the selected features plus each candidate still left.
target = dataset['Survived']
trials = [[*selected, name] for name in col]  # one feature set per candidate
acc = [
    accuracy_score(target, reg.fit(dataset[trial], target).predict(dataset[trial]))
    for trial in trials
]
list(zip(col, acc))

[('Age', 0.8004509582863585)]

### Diabetes

In [52]:
# Load the Pima Indians diabetes dataset.
# The dataset must be in CSV format: a delimited text file that uses commas to
# separate values, with one data record per line.
# `header=None` tells pandas not to read the first line as column names, so the
# columns are simply numbered.
dataset = pd.read_csv('pima-indians-diabetes.csv', header=None)

In [53]:
# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


We can automate the whole selection procedure in a single cell.

In [30]:
def forward_selection(data, target, candidates, estimator, n_rounds):
    """Greedy forward feature selection based on training accuracy.

    In every round, each remaining candidate is added in turn to the features
    chosen so far, `estimator` is fitted on that subset and scored on the same
    rows with `accuracy_score`; the best-scoring candidate is then kept.
    Ties go to the first candidate in list order.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the candidate features and the target column.
    target : column label
        Label of the column of `data` to predict.
    candidates : list of column labels
        Features that may be selected. The list is copied, not modified.
    estimator : object with `fit(X, y)` and `predict(X)`
        Model re-fitted for every subset that is evaluated.
    n_rounds : int
        Number of features to select; capped at the number of candidates.

    Returns
    -------
    (chosen, remaining) : tuple of lists
        The selected features in selection order, and the candidates left over.
    """
    remaining = list(candidates)
    chosen = []
    for _ in range(min(n_rounds, len(remaining))):
        print('Testing:', remaining)
        scores = []
        for name in remaining:  # for each feature to test
            trial = [*chosen, name]  # features selected so far + the one to test
            estimator.fit(data[trial], data[target])
            predicted = estimator.predict(data[trial])
            scores.append(accuracy_score(data[target], predicted))
        print(list(zip(remaining, scores)))
        best = int(np.argmax(scores))  # computed once, reused for printing and popping
        print('max', remaining[best], '-', np.max(scores), end='\n\n')
        chosen.append(remaining.pop(best))  # move the best candidate to the selection
    return chosen, remaining


# All columns except the last one are features; the last column is the target.
col = list(dataset.columns[:-1])
# Find the best set of 6 features; `col` ends up holding the features not selected.
selected, col = forward_selection(dataset, dataset.columns[-1], col, reg, n_rounds=6)
print(selected)

Testing: [0, 1, 2, 3, 4, 5, 6, 7]
[(0, 0.6640625), (1, 0.74609375), (2, 0.6510416666666666), (3, 0.65234375), (4, 0.6575520833333334), (5, 0.6640625), (6, 0.6588541666666666), (7, 0.6484375)]
max 1 - 0.74609375

Testing: [0, 2, 3, 4, 5, 6, 7]
[(0, 0.74609375), (2, 0.74609375), (3, 0.7395833333333334), (4, 0.74609375), (5, 0.7643229166666666), (6, 0.7473958333333334), (7, 0.7369791666666666)]
max 5 - 0.7643229166666666

Testing: [0, 2, 3, 4, 6, 7]
[(0, 0.7669270833333334), (2, 0.7669270833333334), (3, 0.7630208333333334), (4, 0.7604166666666666), (6, 0.7708333333333334), (7, 0.76953125)]
max 6 - 0.7708333333333334

Testing: [0, 2, 3, 4, 7]
[(0, 0.76953125), (2, 0.7708333333333334), (3, 0.76953125), (4, 0.7721354166666666), (7, 0.77734375)]
max 7 - 0.77734375

Testing: [0, 2, 3, 4]
[(0, 0.7799479166666666), (2, 0.77734375), (3, 0.7747395833333334), (4, 0.77734375)]
max 0 - 0.7799479166666666

Testing: [2, 3, 4]
[(2, 0.7708333333333334), (3, 0.7799479166666666), (4, 0.7786458333333334)]
m