## Feature Engineering Exercises - Regression Module

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from pydataset import data

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer

from feature_engineering import select_k_best
from feature_engineering import rfe

In [2]:
# Load the 'tips' dataset through pydataset's data() helper and preview the first rows.
df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Engineered feature: average spend per diner.
# 'size' needs bracket access because attribute access (df.size) resolves to
# the DataFrame's element-count attribute, not the column.
df['price_per_person'] = df['total_bill'].div(df['size'])
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


In [4]:
# One-hot encode every object-dtype column; drop_first=True omits the first
# category of each column from the generated dummies.
encode_cols = [col for col in df.columns if df[col].dtype == 'O']

# Build all dummy columns with a single get_dummies call instead of
# concatenating onto df once per column inside a loop. Given a DataFrame,
# get_dummies prefixes each dummy with its source column name, so the resulting
# columns and their order match the per-column version.
# The original string columns are kept here; a later cell drops them.
dummie_df = pd.get_dummies(df[encode_cols], drop_first=True)
df = pd.concat([df, dummie_df], axis=1)

df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495,0,0,0,1,0,0
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667,1,0,0,1,0,0
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333,1,0,0,1,0,0
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84,1,0,0,1,0,0
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,0,0,0,1,0,0


In [5]:
# Inspect dtypes and non-null counts now that the dummy columns have been added.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   price_per_person  244 non-null    float64
 8   sex_Male          244 non-null    uint8  
 9   smoker_Yes        244 non-null    uint8  
 10  day_Sat           244 non-null    uint8  
 11  day_Sun           244 non-null    uint8  
 12  day_Thur          244 non-null    uint8  
 13  time_Lunch        244 non-null    uint8  
dtypes: float64(3), int64(1), object(4), uint8(6)
memory usage: 18.6+ KB


In [6]:
# The dummy columns now carry this information, so remove the original
# string columns listed in encode_cols.
df = df.drop(encode_cols, axis='columns')
df.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,2,8.495,0,0,0,1,0,0
2,10.34,1.66,3,3.446667,1,0,0,1,0,0
3,21.01,3.5,3,7.003333,1,0,0,1,0,0
4,23.68,3.31,2,11.84,1,0,0,1,0,0
5,24.59,3.61,4,6.1475,0,0,0,1,0,0


In [7]:
# Two-stage split with a fixed seed so the partitions are reproducible:
# 80% / 20% into train_val / test, then train_val 70% / 30% into train / validate.
split_seed = 123
train_val, test = train_test_split(df, train_size=0.8, random_state=split_seed)
train, validate = train_test_split(train_val, train_size=0.7, random_state=split_seed)
(train.shape, validate.shape, test.shape)

((136, 10), (59, 10), (49, 10))

In [8]:
# Confirm the training split contains only numeric columns with no nulls.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 19 to 167
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        136 non-null    float64
 1   tip               136 non-null    float64
 2   size              136 non-null    int64  
 3   price_per_person  136 non-null    float64
 4   sex_Male          136 non-null    uint8  
 5   smoker_Yes        136 non-null    uint8  
 6   day_Sat           136 non-null    uint8  
 7   day_Sun           136 non-null    uint8  
 8   day_Thur          136 non-null    uint8  
 9   time_Lunch        136 non-null    uint8  
dtypes: float64(3), int64(1), uint8(6)
memory usage: 6.1 KB


In [9]:
# Separate the target from the predictors.
# The double brackets keep y_train as a one-column DataFrame rather than a Series.
y_train = train[['tip']]
x_train = train.drop(columns='tip')
x_train.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
19,16.97,3,5.656667,0,0,0,1,0,0
173,7.25,2,3.625,1,1,0,1,0,0
119,12.43,2,6.215,0,0,0,0,1,1
29,21.7,2,10.85,1,0,1,0,0,0
238,32.83,2,16.415,1,1,1,0,0,0


In [10]:
# Preview the target; its index should line up with x_train.
y_train.head()

Unnamed: 0,tip
19,3.5
173,5.15
119,1.8
29,4.3
238,1.17


In [11]:
# Univariate selection: score each feature against the target with
# f_regression and keep the two highest-scoring features.
kbest = SelectKBest(score_func=f_regression, k=2)
kbest.fit(x_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7f8d5c9d08b0>)

In [12]:
# Tabulate the p-value and F score that SelectKBest computed for each feature.
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=x_train.columns,
)
kbest_results

Unnamed: 0,p,f
total_bill,7.18647e-20,115.984909
size,1.341642e-12,61.259089
price_per_person,0.001310327,10.777792
sex_Male,0.2844794,1.154792
smoker_Yes,0.5579978,0.344909
day_Sat,0.9550468,0.00319
day_Sun,0.1236625,2.400404
day_Thur,0.3013774,1.07638
time_Lunch,0.1821449,1.798647


In [13]:
# get_support() is a boolean mask over the columns; use it to recover the selected feature names.
x_train.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

In [17]:
# Same selection as a plain Python list of column names.
x_train.columns[kbest.get_support()].tolist()

['total_bill', 'size']

In [14]:
# Recursive feature elimination with a linear regression estimator, keeping two features.
# NOTE: binding the selector to the name `rfe` shadows the rfe() helper imported
# from feature_engineering in the import cell; restart the kernel and re-run the
# imports before calling that helper.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=2)
rfe.fit(x_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [15]:
# RFE rank per feature; features ranked 1 are the ones that were kept.
pd.Series(rfe.ranking_, index=x_train.columns, name='rfe_ranking').to_frame()

Unnamed: 0,rfe_ranking
total_bill,5
size,1
price_per_person,3
sex_Male,2
smoker_Yes,7
day_Sat,4
day_Sun,6
day_Thur,1
time_Lunch,8


In [16]:
# Names of the features RFE retained (boolean mask from get_support()).
x_train.columns[rfe.get_support()]

Index(['size', 'day_Thur'], dtype='object')

---

#### Add a Scaler to the above dataset/process
- Restart the kernel

- Run the imports

- Then run top down from here


In [2]:
# Reload the raw 'tips' dataset so this section starts from unmodified data.
df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Average spend per diner; 'size' uses bracket access because df.size is a
# DataFrame attribute rather than the column.
df['price_per_person'] = df['total_bill'].div(df['size'])
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


In [4]:
# One-hot encode every object-dtype column; drop_first=True omits the first
# category of each column from the generated dummies.
encode_cols = [col for col in df.columns if df[col].dtype == 'O']

# A single get_dummies call over the selected columns replaces the
# concat-inside-a-loop pattern. With a DataFrame input each dummy is prefixed
# with its source column name, so column names and order are unchanged.
# The original string columns remain until the following cell drops them.
dummie_df = pd.get_dummies(df[encode_cols], drop_first=True)
df = pd.concat([df, dummie_df], axis=1)

df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495,0,0,0,1,0,0
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667,1,0,0,1,0,0
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333,1,0,0,1,0,0
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84,1,0,0,1,0,0
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,0,0,0,1,0,0


In [5]:
# Remove the original string columns now that their dummy columns exist.
df = df.drop(encode_cols, axis='columns')
df.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,2,8.495,0,0,0,1,0,0
2,10.34,1.66,3,3.446667,1,0,0,1,0,0
3,21.01,3.5,3,7.003333,1,0,0,1,0,0
4,23.68,3.31,2,11.84,1,0,0,1,0,0
5,24.59,3.61,4,6.1475,0,0,0,1,0,0


In [6]:
# Reproducible two-stage split: 80/20 into train_val/test, then 70/30 of
# train_val into train/validate.
split_seed = 123
train_val, test = train_test_split(df, train_size=0.8, random_state=split_seed)
train, validate = train_test_split(train_val, train_size=0.7, random_state=split_seed)
(train.shape, validate.shape, test.shape)

((136, 10), (59, 10), (49, 10))

In [7]:
# Target kept as a one-column DataFrame (double brackets); predictors are everything else.
y_train = train[['tip']]
x_train = train.drop(columns='tip')
x_train.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
19,16.97,3,5.656667,0,0,0,1,0,0
173,7.25,2,3.625,1,1,0,1,0,0
119,12.43,2,6.215,0,0,0,0,1,1
29,21.7,2,10.85,1,0,1,0,0,0
238,32.83,2,16.415,1,1,1,0,0,0


In [8]:
# Fit the min-max scaler on the training predictors only and transform them.
# The transform returns a bare array, so rebuild a DataFrame with the original
# column names and row index.
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_train_scaled_df = pd.DataFrame(
    x_train_scaled,
    index=x_train.index,
    columns=x_train.columns,
)
x_train_scaled_df.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
19,0.307114,0.4,0.150344,0.0,0.0,0.0,1.0,0.0,0.0
173,0.092355,0.2,0.032258,1.0,1.0,0.0,1.0,0.0,0.0
119,0.206805,0.2,0.182796,0.0,0.0,0.0,0.0,1.0,1.0
29,0.411622,0.2,0.452194,1.0,0.0,1.0,0.0,0.0,0.0
238,0.657534,0.2,0.775647,1.0,1.0,1.0,0.0,0.0,0.0


In [9]:
# Repeat the univariate f_regression selection on the scaled predictors.
kbest = SelectKBest(score_func=f_regression, k=2)
kbest.fit(x_train_scaled_df, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fbcce8d18b0>)

In [10]:
# p-values and F scores per feature, for comparison with the unscaled run above.
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=x_train_scaled_df.columns,
)
kbest_results

Unnamed: 0,p,f
total_bill,7.18647e-20,115.984909
size,1.341642e-12,61.259089
price_per_person,0.001310327,10.777792
sex_Male,0.2844794,1.154792
smoker_Yes,0.5579978,0.344909
day_Sat,0.9550468,0.00319
day_Sun,0.1236625,2.400404
day_Thur,0.3013774,1.07638
time_Lunch,0.1821449,1.798647


In [11]:
# Feature names selected by SelectKBest on the scaled data.
x_train_scaled_df.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

In [12]:
# Recursive feature elimination on the scaled predictors, keeping two features.
# NOTE: the name `rfe` is rebound to the selector here, shadowing the rfe()
# helper imported from feature_engineering; restart the kernel before using
# that helper.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=2)
rfe.fit(x_train_scaled_df, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [13]:
# RFE rank per scaled feature; rank 1 marks the retained features.
pd.Series(rfe.ranking_, index=x_train_scaled_df.columns, name='rfe_ranking').to_frame()

Unnamed: 0,rfe_ranking
total_bill,1
size,2
price_per_person,1
sex_Male,3
smoker_Yes,7
day_Sat,5
day_Sun,6
day_Thur,4
time_Lunch,8


In [14]:
# Feature names RFE retained on the scaled data.
x_train_scaled_df.columns[rfe.get_support()]

Index(['total_bill', 'price_per_person'], dtype='object')

---

### Test functions from feature_engineering.py
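- Restart the kernel (the cells above rebind `rfe` to a fitted `RFE` object, which shadows the `rfe` function imported from `feature_engineering`)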

- Run the import cell

- Then run top down from here

In [2]:
# Data acquisition, cleaning, and prep specific to the 'tips' dataset.

# --- Acquire and engineer ---
df = data('tips')
# Bracket access for 'size': df.size is a DataFrame attribute, not the column.
df['price_per_person'] = df.total_bill / df['size']

# --- Encode ---
# One-hot encode all object-dtype columns with a single get_dummies call
# (rather than concatenating once per column in a loop), then drop the
# original string columns. drop_first=True omits each column's first category.
encode_cols = [col for col in df.columns if df[col].dtype == 'O']
dummie_df = pd.get_dummies(df[encode_cols], drop_first=True)
df = pd.concat([df, dummie_df], axis=1)
df = df.drop(columns=encode_cols)

# --- Split ---
# 80/20 into train_val/test, then 70/30 of train_val into train/validate,
# with a fixed seed for reproducibility.
train_val, test = train_test_split(df, train_size = 0.8, random_state=123)
train, validate = train_test_split(train_val, train_size = 0.7, random_state=123)

# Target stays a one-column DataFrame (double brackets).
x_train = train.drop(columns='tip')
y_train = train[['tip']]

# --- Scale ---
# Fit the scaler on the training predictors only; wrap the resulting array
# back into a DataFrame with the original column names and index.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_train_scaled_df = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
x_train_scaled_df.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
19,0.307114,0.4,0.150344,0.0,0.0,0.0,1.0,0.0,0.0
173,0.092355,0.2,0.032258,1.0,1.0,0.0,1.0,0.0,0.0
119,0.206805,0.2,0.182796,0.0,0.0,0.0,0.0,1.0,1.0
29,0.411622,0.2,0.452194,1.0,0.0,1.0,0.0,0.0,0.0
238,0.657534,0.2,0.775647,1.0,1.0,1.0,0.0,0.0,0.0


##### K Best Function

In [3]:
# Run select_k_best from feature_engineering.py; it is called here as (x, y, k).
# Per the output below, it prints the k selected feature names and displays the
# per-feature p-value / F-score table.
select_k_best(x_train_scaled_df, y_train, 2)

The 2 best features = ['total_bill', 'size']


Unnamed: 0,p,f
total_bill,7.18647e-20,115.984909
size,1.341642e-12,61.259089
price_per_person,0.001310327,10.777792
sex_Male,0.2844794,1.154792
smoker_Yes,0.5579978,0.344909
day_Sat,0.9550468,0.00319
day_Sun,0.1236625,2.400404
day_Thur,0.3013774,1.07638
time_Lunch,0.1821449,1.798647


##### Recursive Feature Elimination Function

In [3]:
# Run rfe from feature_engineering.py, called here as (x, y, k).
# This needs a fresh kernel: `rfe` must still be the imported function, not the
# RFE selector object that earlier sections of this notebook bind to the same name.
rfe(x_train_scaled_df, y_train, 2)

The 2 best features = ['total_bill', 'price_per_person']


Unnamed: 0,rfe_ranking
total_bill,1
size,2
price_per_person,1
sex_Male,3
smoker_Yes,7
day_Sat,5
day_Sun,6
day_Thur,4
time_Lunch,8
