In [1]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from feature_engineer import select_kbest, rfe_ #imported module for functions

from pydataset import data
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE #feature selection objects
from sklearn.linear_model import LinearRegression

# Exercises

- Do your work for this exercise in a Jupyter notebook named feature_engineering within the regression-exercises repo. Add, commit, and push your work.

1. Load the tips dataset.

In [2]:
# Load the tips dataset from pydataset and preview the first rows.
df = data("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [3]:
# Per-person price: the total bill divided by the party size.
# Bracket access is required for 'size' because df.size is a DataFrame attribute.
df['price_per_person'] = df['total_bill'].div(df['size'])
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

- **I think the most important features for predicting the tip amount would be time, size, and total_bill.**

c. Use Select K Best to select the top 2 features for predicting tip amount. What are they?

In [4]:
# MinMaxScaler cannot fit strings, so one-hot encode the categorical columns.
df = pd.get_dummies(df, columns=['sex', 'smoker', 'day', 'time'])
df.columns = df.columns.str.lower()

# sex and smoker are two-level features, so one dummy per feature is enough.
df = df.drop(columns=['smoker_no', 'sex_female'])

# NOTE: this cell used to end with a rename keyed on 'day_Fri', 'time_Dinner', ...
# Because the columns had already been lower-cased, none of those keys matched and
# the rename silently did nothing. It has been removed; the day_* / time_* names
# shown below are the ones every later cell works with.
df.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_male,smoker_yes,day_fri,day_sat,day_sun,day_thur,time_dinner,time_lunch
1,16.99,1.01,2,8.495,0,0,0,0,1,0,1,0
2,10.34,1.66,3,3.446667,1,0,0,0,1,0,1,0
3,21.01,3.5,3,7.003333,1,0,0,0,1,0,1,0
4,23.68,3.31,2,11.84,1,0,0,0,1,0,1,0
5,24.59,3.61,4,6.1475,0,0,0,0,1,0,1,0


In [5]:
# Scale every predictor column; the target (tip) is left in its original units.
to_scale = df.columns.drop('tip')

mms = MinMaxScaler()
df[to_scale] = mms.fit_transform(df[to_scale])

df.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_male,smoker_yes,day_fri,day_sat,day_sun,day_thur,time_dinner,time_lunch
1,0.291579,1.01,0.2,0.322989,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.152283,1.66,0.4,0.032854,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.375786,3.5,0.4,0.237261,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.431713,3.31,0.2,0.51523,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
5,0.450775,3.61,0.6,0.188075,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


Let's break our data into X and y subsets.

We will move forward with SelectKBest. This technique uses a statistical test to determine how useful features may be.

In [6]:
# Target is the tip amount; predictors are every other column.
y = df['tip']
X = df.drop(columns='tip')

In [7]:
# Keep the 2 features that score highest on the f_regression test against tip.
skb = SelectKBest(score_func=f_regression, k=2)
skb.fit(X, y)

In [8]:
# Boolean mask over X's columns marking the features SelectKBest kept.
skb_mask = skb.get_support(indices=False)
X.columns[skb_mask]

Index(['total_bill', 'size'], dtype='object')

d. Use Recursive Feature Elimination to select the top 2 features for tip amount. What are they?

In [9]:
# Recursive feature elimination around a linear regression, stopping at 2 features.
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=2)
rfe.fit(X, y)

In [10]:
# Boolean mask over X's columns marking the features RFE kept.
rfe_mask = rfe.get_support(indices=False)
X.columns[rfe_mask]

Index(['total_bill', 'price_per_person'], dtype='object')

e. Why do you think Select K Best and Recursive Feature Elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

They use different methodologies. SelectKBest ranks features with a statistical test (chi-squared, ANOVA, etc.), whereas RFE fits a model and eliminates the weakest-performing features over a number of iterations.

2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [11]:
def select_kbest(X, y, k):
    """Show the names of the top ``k`` features chosen by SelectKBest.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; the column labels are used as the feature names.
    y : array-like
        Target values.
    k : int
        Number of features to keep.

    Returns
    -------
    pandas.io.formats.style.Styler
        A one-column table of the selected feature names with the index and
        the column header hidden, intended for display in the notebook.
    """
    # Score every feature against the target with the f_regression test.
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X, y)

    # Boolean mask of the kept columns -> their names, one per row.
    selected_features = pd.DataFrame(X.columns[selector.get_support()].tolist())

    # Styler.hide_index() / hide_columns() were removed in pandas 2.0.
    # Use Styler.hide(axis=...) when it exists and fall back on older pandas.
    styler = selected_features.style
    if hasattr(styler, "hide"):
        return styler.hide(axis="index").hide(axis="columns")
    return styler.hide_index().hide_columns()

In [12]:
# Should match the manual SelectKBest result above.
select_kbest(X, y, k=2)

0
total_bill
size


3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top n features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [13]:
def rfe_(X, y, n):
    """Show the names of the top ``n`` features chosen by RFE.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; the column labels are used as the feature names.
    y : array-like
        Target values.
    n : int
        Number of features to keep.

    Returns
    -------
    pandas.io.formats.style.Styler
        A one-column table of the selected feature names with the index and
        the column header hidden, intended for display in the notebook.
    """
    # Build the estimator here instead of relying on a notebook-global `lm`,
    # so the function works on a fresh kernel and when moved into a module.
    rank = RFE(LinearRegression(), n_features_to_select=n)
    rank.fit(X, y)

    # Boolean mask of the kept columns -> their names, one per row.
    ranked_features = pd.DataFrame(X.columns[rank.get_support()].tolist())

    # Styler.hide_index() / hide_columns() were removed in pandas 2.0.
    # Use Styler.hide(axis=...) when it exists and fall back on older pandas.
    styler = ranked_features.style
    if hasattr(styler, "hide"):
        return styler.hide(axis="index").hide(axis="columns")
    return styler.hide_index().hide_columns()

In [14]:
# Should match the manual RFE result above.
rfe_(X, y, n=2)

0
total_bill
price_per_person


4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both Select K Best and Recursive Feature Elimination (use the functions you just built to help you out).

In [15]:
# Load the swiss dataset (this rebinds df) and preview the first rows.
df = data("swiss")
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [16]:
# Fertility is the target; every other column is a predictor.
ys = df['Fertility']
Xs = df.drop(columns='Fertility')

In [17]:
# Top 3 swiss features according to SelectKBest.
select_kbest(Xs, ys, k=3)

0
Examination
Education
Catholic


In [18]:
# Top 3 swiss features according to RFE.
rfe_(Xs, ys, n=3)

0
Examination
Education
Infant.Mortality


In [19]:
# Load the iris dataset and preview the first rows.
df2 = data("iris")
df2.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [20]:
# One-hot encode Species so every column is numeric.
df2 = pd.get_dummies(data=df2, columns=['Species'])
df2.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
1,5.1,3.5,1.4,0.2,1,0,0
2,4.9,3.0,1.4,0.2,1,0,0
3,4.7,3.2,1.3,0.2,1,0,0
4,4.6,3.1,1.5,0.2,1,0,0
5,5.0,3.6,1.4,0.2,1,0,0


In [21]:
# The versicolor indicator is the target; every other column is a predictor.
yr = df2['Species_versicolor']
Xr = df2.drop(columns='Species_versicolor')

In [22]:
# Top 3 iris features according to SelectKBest.
select_kbest(Xr, yr, k=3)

0
Sepal.Width
Species_setosa
Species_virginica


In [23]:
# Top 3 iris features according to RFE.
rfe_(Xr, yr, n=3)

0
Petal.Width
Species_setosa
Species_virginica
