# Feature Engineering

## Redundancy and useless information

In [1]:
# We would need these libraries to manage our dataset
# Numpy: used for large, multi-dimensional arrays and matrices, and for high-level mathematical functions
# Pandas: used for data manipulation and analysis
# matplotlib: used for visualisation and plotting graph/image/etc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Fetch the iris dataset that ships with scikit-learn
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets
from sklearn.datasets import load_iris

iris = load_iris()
# Put the feature matrix and the class label side by side in one DataFrame
# (label stored as the last column, named 'target') so it is easy to display
dataset = pd.DataFrame(
    np.c_[iris.data, iris.target],
    columns=[*iris.feature_names, 'target'],
)

In [3]:
# Pairwise correlation between every column of the iris frame, 'target' included
dataset.corr()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
sepal length (cm),1.0,-0.11757,0.871754,0.817941,0.782561
sepal width (cm),-0.11757,1.0,-0.42844,-0.366126,-0.426658
petal length (cm),0.871754,-0.42844,1.0,0.962865,0.949035
petal width (cm),0.817941,-0.366126,0.962865,1.0,0.956547
target,0.782561,-0.426658,0.949035,0.956547,1.0


In [4]:
# Count the rows that exactly repeat an earlier row
dataset.duplicated().sum()

1

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
dataset.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [6]:
# Number of distinct values found in each column
dataset.nunique()

sepal length (cm)    35
sepal width (cm)     23
petal length (cm)    43
petal width (cm)     22
target                3
dtype: int64

In [7]:
# Load the Titanic dataset from a CSV file.
# A CSV file is a delimited text file that uses a comma to separate values;
# each line of the file is one data record.
# header=0: the first row of the file (row index 0) holds the column names
dataset = pd.read_csv('./dataset/titanic.csv', header=0)

In [8]:
# Encode 'Sex' as a boolean feature: True for 'male', False otherwise.
# The dtype guard keeps this cell idempotent: comparing an already-boolean
# column with 'male' would silently turn every value into False on a re-run.
if not pd.api.types.is_bool_dtype(dataset['Sex']):
    dataset['Sex'] = dataset['Sex'] == 'male'

In [9]:
# Count the rows of the Titanic frame that exactly repeat an earlier row
dataset.duplicated().sum()

0

In [10]:
# Correlate only the numeric and boolean columns. The text column ('Name')
# cannot be converted to float, and pandas >= 2.0 raises on it instead of
# silently dropping it as older versions did.
dataset.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
Survived,1.0,-0.336528,-0.542152,-0.059665,-0.037082,0.080097,0.256179
Pclass,-0.336528,1.0,0.129507,-0.391492,0.085026,0.020252,-0.548919
Sex,-0.542152,0.129507,1.0,0.091875,-0.113249,-0.244337,-0.181137
Age,-0.059665,-0.391492,0.091875,1.0,-0.297669,-0.193741,0.112329
Siblings/Spouses Aboard,-0.037082,0.085026,-0.113249,-0.297669,1.0,0.414244,0.158839
Parents/Children Aboard,0.080097,0.020252,-0.244337,-0.193741,0.414244,1.0,0.21547
Fare,0.256179,-0.548919,-0.181137,0.112329,0.158839,0.21547,1.0


In [11]:
# Number of distinct values per column; helps spot identifier-like or constant columns
dataset.nunique()

Survived                     2
Pclass                       3
Name                       887
Sex                          2
Age                         89
Siblings/Spouses Aboard      7
Parents/Children Aboard      7
Fare                       248
dtype: int64

## Feature selection

In [12]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
from sklearn.linear_model import LogisticRegression
# A single classifier built with default hyperparameters; the cells below re-fit this same object
reg = LogisticRegression()

In [13]:
# Candidate features for the forward selection carried out in the next cells
col = [
    'Pclass',
    'Sex',
    'Age',
    'Siblings/Spouses Aboard',
    'Parents/Children Aboard',
]

In [14]:
from sklearn.metrics import accuracy_score

# Score each candidate on its own: fit the classifier on that single column
# and measure accuracy on the same rows it was fitted on (no held-out split).
acc = []
for candidate in col:
    single = dataset[[candidate]]
    reg.fit(single, dataset['Survived'])
    acc.append(accuracy_score(dataset['Survived'], reg.predict(single)))
list(zip(col, acc))

[('Pclass', 0.677564825253664),
 ('Sex', 0.7857948139797069),
 ('Age', 0.6144306651634723),
 ('Siblings/Spouses Aboard', 0.6144306651634723),
 ('Parents/Children Aboard', 0.6065388951521984)]

In [15]:
# Start the selected set with the best-scoring single feature and remove it
# from the candidates. The position comes from the scores computed in the
# previous cell instead of a hard-coded index, so the choice always follows
# the data; the check fails loudly if this cell is re-run on stale scores.
assert len(acc) == len(col), 're-run the scoring cell before selecting a feature'
selected = [col.pop(np.argmax(acc))]
# Show which feature was picked
selected[0]

'Sex'

In [16]:
# Score every remaining candidate together with the feature(s) already
# selected; accuracy is measured on the same rows used for fitting.
acc = []
for candidate in col:
    trial = selected + [candidate]
    reg.fit(dataset[trial], dataset['Survived'])
    acc.append(accuracy_score(dataset['Survived'], reg.predict(dataset[trial])))
list(zip(col, acc))

[('Pclass', 0.7857948139797069),
 ('Age', 0.7857948139797069),
 ('Siblings/Spouses Aboard', 0.790304396843292),
 ('Parents/Children Aboard', 0.7857948139797069)]

In [17]:
# Move the best-scoring candidate of the previous cell into the selected set.
# Using the arg-max of the scores (rather than a hard-coded position) keeps
# the choice tied to the data; the check fails loudly on stale scores.
assert len(acc) == len(col), 're-run the scoring cell before selecting a feature'
selected.append(col.pop(np.argmax(acc)))

In [18]:
# Score every remaining candidate together with the features already
# selected; accuracy is measured on the same rows used for fitting.
acc = []
for candidate in col:
    trial = selected + [candidate]
    reg.fit(dataset[trial], dataset['Survived'])
    acc.append(accuracy_score(dataset['Survived'], reg.predict(dataset[trial])))
list(zip(col, acc))

[('Pclass', 0.7993235625704622),
 ('Age', 0.790304396843292),
 ('Parents/Children Aboard', 0.790304396843292)]

In [19]:
# Move the best-scoring candidate of the previous cell into the selected set.
# Using the arg-max of the scores (rather than a hard-coded position) keeps
# the choice tied to the data; the check fails loudly on stale scores.
assert len(acc) == len(col), 're-run the scoring cell before selecting a feature'
selected.append(col.pop(np.argmax(acc)))

In [20]:
# Score every remaining candidate together with the features already
# selected; accuracy is measured on the same rows used for fitting.
acc = []
for candidate in col:
    trial = selected + [candidate]
    reg.fit(dataset[trial], dataset['Survived'])
    acc.append(accuracy_score(dataset['Survived'], reg.predict(dataset[trial])))
list(zip(col, acc))

[('Age', 0.8049605411499436), ('Parents/Children Aboard', 0.7993235625704622)]

In [21]:
# Move the best-scoring candidate of the previous cell into the selected set.
# Using the arg-max of the scores (rather than a hard-coded position) keeps
# the choice tied to the data; the check fails loudly on stale scores.
assert len(acc) == len(col), 're-run the scoring cell before selecting a feature'
selected.append(col.pop(np.argmax(acc)))

In [22]:
# Score every remaining candidate together with the features already
# selected; accuracy is measured on the same rows used for fitting.
acc = []
for candidate in col:
    trial = selected + [candidate]
    reg.fit(dataset[trial], dataset['Survived'])
    acc.append(accuracy_score(dataset['Survived'], reg.predict(dataset[trial])))
list(zip(col, acc))

[('Parents/Children Aboard', 0.8083427282976324)]

### Diabetes

In [23]:
# Load the Pima Indians diabetes dataset from a CSV file.
# A CSV file is a delimited text file that uses a comma to separate values;
# each line of the file is one data record.
# header=None: the file has no initial line with column names, so pandas
# labels the columns with integers (0, 1, 2, ...)
dataset = pd.read_csv('pima-indians-diabetes.csv', header=None)

In [24]:
# Peek at the first rows; columns are labelled 0..8 because the file was read with header=None
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [28]:
# Greedy forward selection on the diabetes frame.
# Every column except the last one is a candidate; column 8 is the label.
# Each of the 6 rounds fits the classifier on "already selected + one candidate",
# measures accuracy on the same rows used for fitting, and keeps the best candidate.
col = list(dataset.columns[:-1])
selected = []
for _ in range(6):
    print('Testing:', col)
    acc = []
    for candidate in col:
        trial = selected + [candidate]
        reg.fit(dataset[trial], dataset[8])
        acc.append(accuracy_score(dataset[8], reg.predict(dataset[trial])))
    print(list(zip(col, acc)))
    # Position of the highest score (first one wins on ties)
    best = np.argmax(acc)
    print('max', col[best], '-', np.max(acc), end='\n\n')
    # Promote the winner: remove it from the candidates, add it to the selection
    selected.append(col.pop(best))
print(selected)

Testing: [0, 1, 2, 3, 4, 5, 6, 7]
[(0, 0.6640625), (1, 0.74609375), (2, 0.6510416666666666), (3, 0.65234375), (4, 0.6575520833333334), (5, 0.6640625), (6, 0.6588541666666666), (7, 0.6484375)]
max 1 - 0.74609375

Testing: [0, 2, 3, 4, 5, 6, 7]
[(0, 0.74609375), (2, 0.74609375), (3, 0.7395833333333334), (4, 0.74609375), (5, 0.7643229166666666), (6, 0.7473958333333334), (7, 0.7369791666666666)]
max 5 - 0.7643229166666666

Testing: [0, 2, 3, 4, 6, 7]
[(0, 0.7669270833333334), (2, 0.7669270833333334), (3, 0.7630208333333334), (4, 0.7604166666666666), (6, 0.7708333333333334), (7, 0.76953125)]
max 6 - 0.7708333333333334

Testing: [0, 2, 3, 4, 7]
[(0, 0.76953125), (2, 0.7708333333333334), (3, 0.76953125), (4, 0.7721354166666666), (7, 0.77734375)]
max 7 - 0.77734375

Testing: [0, 2, 3, 4]
[(0, 0.7799479166666666), (2, 0.77734375), (3, 0.7747395833333334), (4, 0.77734375)]
max 0 - 0.7799479166666666

Testing: [2, 3, 4]
[(2, 0.7708333333333334), (3, 0.7799479166666666), (4, 0.7786458333333334)]
m