<div class="alert alert-block alert-warning">

# Feature Engineering Exercises

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pydataset import data
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE #feature selection objects
from sklearn.linear_model import LinearRegression

<div class="alert alert-block alert-success">

Do your work for this exercise in a jupyter notebook named feature_engineering within the regression-exercises repo. Add, commit, and push your work.


<div class="alert alert-block alert-info">

1. Load the tips dataset.


In [2]:
# Load the tips dataset through pydataset's data() helper and preview the first rows.
df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


<div class="alert alert-block alert-info">

1. 
    * Create a column named price_per_person. This should be the total bill divided by the party size.


In [3]:
# Per-person cost: total bill divided by party size.
# Bracket access is required for 'size' because df.size is the built-in
# DataFrame attribute (total element count), not the column.
df['price_per_person'] = df['total_bill']/df['size']
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 244 entries, 1 to 244
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   price_per_person  244 non-null    float64
dtypes: float64(3), int64(1), object(4)
memory usage: 17.2+ KB


In [5]:
df.describe()

Unnamed: 0,total_bill,tip,size,price_per_person
count,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,7.88823
std,8.902412,1.383638,0.9511,2.91435
min,3.07,1.0,1.0,2.875
25%,13.3475,2.0,2.0,5.8025
50%,17.795,2.9,2.0,7.255
75%,24.1275,3.5625,3.0,9.39
max,50.81,10.0,6.0,20.275


In [6]:
df.isna().sum()

total_bill          0
tip                 0
sex                 0
smoker              0
day                 0
time                0
size                0
price_per_person    0
dtype: int64

In [7]:
# Collect every column whose dtype is not numeric (the candidates for encoding).
non_numeric_columns = df.select_dtypes(exclude=['number']).columns

# Print the distinct values of each one to see how many categories it has.
for column in non_numeric_columns:
    unique_values = df[column].unique()
    print(f"Unique values in column '{column}': {unique_values}")


Unique values in column 'sex': ['Female' 'Male']
Unique values in column 'smoker': ['No' 'Yes']
Unique values in column 'day': ['Sun' 'Sat' 'Thur' 'Fri']
Unique values in column 'time': ['Dinner' 'Lunch']


In [8]:
# Create dummies for the 'day' column without dropping the first category
# (every day keeps its own indicator column).
df_encoded = pd.get_dummies(df, columns=['day'])

# Select non-numeric columns and create dummies with drop_first=True
# NOTE(review): the day_* indicators created above are displayed as True/False,
# i.e. they are not a numeric dtype, so this selection appears to pick them up
# again and encode them a second time -- which would explain the
# 'day_Fri_True'-style names in the output below. Confirm this is intended;
# selecting only the original string columns (sex, smoker, time) would avoid
# the double encoding, but the rename mapping further down is keyed on the
# '*_True' names and would have to change with it.
non_numeric_columns = df_encoded.select_dtypes(exclude=['number']).columns
df_encoded = pd.get_dummies(df_encoded, columns=non_numeric_columns, drop_first=True)

# Print the resulting DataFrame
df_encoded

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Male,smoker_Yes,time_Lunch,day_Fri_True,day_Sat_True,day_Sun_True,day_Thur_True
1,16.99,1.01,2,8.495000,False,False,False,False,False,True,False
2,10.34,1.66,3,3.446667,True,False,False,False,False,True,False
3,21.01,3.50,3,7.003333,True,False,False,False,False,True,False
4,23.68,3.31,2,11.840000,True,False,False,False,False,True,False
5,24.59,3.61,4,6.147500,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
240,29.03,5.92,3,9.676667,True,False,False,False,True,False,False
241,27.18,2.00,2,13.590000,False,True,False,False,True,False,False
242,22.67,2.00,2,11.335000,True,True,False,False,True,False,False
243,17.82,1.75,2,8.910000,True,False,False,False,True,False,False


In [9]:
df_encoded.columns

Index(['total_bill', 'tip', 'size', 'price_per_person', 'sex_Male',
       'smoker_Yes', 'time_Lunch', 'day_Fri_True', 'day_Sat_True',
       'day_Sun_True', 'day_Thur_True'],
      dtype='object')

In [10]:
# Define the new column names.
# Only columns whose names actually change are listed; rename() leaves any
# column that is absent from the mapping (total_bill, tip, size,
# price_per_person) untouched, so identity entries are unnecessary.
new_column_names = {
    'sex_Male': 'sex_male',
    'smoker_Yes': 'smoker',
    'time_Lunch': 'time_lunch',
    'day_Fri_True': 'fri',
    'day_Sat_True': 'sat',
    'day_Sun_True': 'sun',
    'day_Thur_True': 'thur'
}

# Rename columns by rebinding the name rather than using inplace=True:
# the result is the same frame contents, without mutating an object that an
# earlier cell has already displayed.
df_encoded = df_encoded.rename(columns=new_column_names)

df_encoded

Unnamed: 0,total_bill,tip,size,price_per_person,sex_male,smoker,time_lunch,fri,sat,sun,thur
1,16.99,1.01,2,8.495000,False,False,False,False,False,True,False
2,10.34,1.66,3,3.446667,True,False,False,False,False,True,False
3,21.01,3.50,3,7.003333,True,False,False,False,False,True,False
4,23.68,3.31,2,11.840000,True,False,False,False,False,True,False
5,24.59,3.61,4,6.147500,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
240,29.03,5.92,3,9.676667,True,False,False,False,True,False,False
241,27.18,2.00,2,13.590000,False,True,False,False,True,False,False
242,22.67,2.00,2,11.335000,True,True,False,False,True,False,False
243,17.82,1.75,2,8.910000,True,False,False,False,True,False,False


In [11]:
# Rebind df to the encoded frame. This is an alias, not a copy: df and
# df_encoded refer to the same DataFrame object after this assignment.
df = df_encoded
df.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_male,smoker,time_lunch,fri,sat,sun,thur
1,16.99,1.01,2,8.495,False,False,False,False,False,True,False
2,10.34,1.66,3,3.446667,True,False,False,False,False,True,False
3,21.01,3.5,3,7.003333,True,False,False,False,False,True,False
4,23.68,3.31,2,11.84,True,False,False,False,False,True,False
5,24.59,3.61,4,6.1475,False,False,False,False,False,True,False


In [12]:
# Flag weekend rows: True when either the Saturday or the Sunday indicator is set.
df_encoded['weekend'] = (df_encoded['sat'] | df_encoded['sun'])
df_encoded.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_male,smoker,time_lunch,fri,sat,sun,thur,weekend
1,16.99,1.01,2,8.495,False,False,False,False,False,True,False,True
2,10.34,1.66,3,3.446667,True,False,False,False,False,True,False,True
3,21.01,3.5,3,7.003333,True,False,False,False,False,True,False,True
4,23.68,3.31,2,11.84,True,False,False,False,False,True,False,True
5,24.59,3.61,4,6.1475,False,False,False,False,False,True,False,True


In [13]:
# Rescale the continuous predictors with a MinMaxScaler.
mms = MinMaxScaler()

# The target ('tip') is not in this list, so it keeps its original units.
to_scale = ['total_bill', 'size', 'price_per_person']

# The scaler is fit on df_encoded and the scaled values are written back
# into the same columns, replacing the unscaled ones.
df_encoded[to_scale] = mms.fit_transform(df_encoded[to_scale])

df_encoded.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_male,smoker,time_lunch,fri,sat,sun,thur,weekend
1,0.291579,1.01,0.2,0.322989,False,False,False,False,False,True,False,True
2,0.152283,1.66,0.4,0.032854,True,False,False,False,False,True,False,True
3,0.375786,3.5,0.4,0.237261,True,False,False,False,False,True,False,True
4,0.431713,3.31,0.2,0.51523,True,False,False,False,False,True,False,True
5,0.450775,3.61,0.6,0.188075,False,False,False,False,False,True,False,True


In [15]:
# Point df at the encoded, scaled frame (the same object as df_encoded, not a copy).
df = df_encoded

df.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_male,smoker,time_lunch,fri,sat,sun,thur,weekend
1,0.291579,1.01,0.2,0.322989,False,False,False,False,False,True,False,True
2,0.152283,1.66,0.4,0.032854,True,False,False,False,False,True,False,True
3,0.375786,3.5,0.4,0.237261,True,False,False,False,False,True,False,True
4,0.431713,3.31,0.2,0.51523,True,False,False,False,False,True,False,True
5,0.450775,3.61,0.6,0.188075,False,False,False,False,False,True,False,True


<div class="alert alert-block alert-info">

1. 
    * Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?


##### Higher total bill correlates to higher tip amount. 

H0: There is no linear relationship between total bill and tip amount. 

H1: Higher total bill correlates with higher tip amount. 

<div class="alert alert-block alert-info">

1. 
    * Use Select K Best to select the top 2 features for predicting tip amount. What are they?


In [19]:
# Target is the tip amount; every remaining column is a candidate predictor.
y = df['tip']
X = df.drop(columns='tip')

In [24]:
# Keep the 2 features with the highest f_regression scores against the target.
skb = SelectKBest(f_regression, k = 2)

skb.fit(X, y)

In [25]:
# get_support() gives the selection mask over X's columns; index the column
# labels with it to get the names of the selected features.
skb_mask = skb.get_support()
X.columns[skb_mask]

Index(['total_bill', 'size'], dtype='object')

<div class="alert alert-block alert-info">

1. 
    * Use Recursive Feature Elimination to select the top 2 features for tip amount. What are they?


In [28]:
# Recursive Feature Elimination wrapped around a plain linear regression.
lm = LinearRegression()

# Eliminate features until only 2 remain.
rfe = RFE(lm, n_features_to_select=2)

rfe.fit(X, y)

In [29]:
# get_support() gives the selection mask over X's columns; index the column
# labels with it to get the names of the features RFE kept.
rfe_mask = rfe.get_support()
X.columns[rfe_mask]

Index(['total_bill', 'price_per_person'], dtype='object')

<div class="alert alert-block alert-info">

1. 
    * Why do you think Select K Best and Recursive Feature Elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?


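RFE fits a model on all of the remaining features together and repeatedly drops the weakest one, so its ranking accounts for how the features relate to one another, while SelectKBest scores each feature independently against the target.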

<div class="alert alert-block alert-info">

2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [30]:
def select_kbest(X, y, k):
    """
    Return the names of the top k features chosen by SelectKBest.

    Parameters
    ----------
    X : DataFrame of predictors (its column labels are what gets returned).
    y : target values aligned with the rows of X.
    k : number of features to keep.

    Returns
    -------
    The subset of X.columns flagged by the fitted selector's support mask.
    """
    # Score every feature against the target with f_regression and keep the k best.
    selector = SelectKBest(f_regression, k=k).fit(X, y)

    # The boolean support mask lines up with X's columns, so it can index them directly.
    return X.columns[selector.get_support()]

In [32]:
select_kbest(X, y, 2)

Index(['total_bill', 'size'], dtype='object')

<div class="alert alert-block alert-info">

3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top n features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [33]:
def select_rfe(X, y, n):
    """
    Return the names of the top n features chosen by Recursive Feature
    Elimination around a LinearRegression estimator.

    Parameters
    ----------
    X : DataFrame of predictors (its column labels are what gets returned).
    y : target values aligned with the rows of X.
    n : number of features to keep.

    Returns
    -------
    The subset of X.columns flagged by the fitted RFE support mask.
    """
    lm = LinearRegression()

    # Use the caller's n. This was previously hard-coded to 2, so every call
    # returned two features regardless of the argument.
    rfe = RFE(lm, n_features_to_select=n)

    rfe.fit(X, y)

    # The boolean support mask lines up with X's columns, so it can index them directly.
    rfe_mask = rfe.get_support()

    return X.columns[rfe_mask]


In [34]:
select_rfe(X, y, 2)

Index(['total_bill', 'price_per_person'], dtype='object')

<div class="alert alert-block alert-info">

4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both Select K Best and Recursive Feature Elimination (use the functions you just built to help you out).