In [30]:
import numpy as np
import pandas as pd
import seaborn as sns
from pydataset import data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

Load the tips dataset.

Create a column named price_per_person. This should be the total bill divided by the party size.

Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

Use Select K Best to select the top 2 features for predicting tip amount. What are they?

Use Recursive Feature Elimination to select the top 2 features for tip amount. What are they?

Why do you think Select K Best and Recursive Feature Elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

In [2]:
df = sns.load_dataset("tips")
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
# Create a column named price_per_person
df['price_per_person'] = df['total_bill'] / df['size']

In [4]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,8.495
1,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
2,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


In [5]:
# I think time and total bill will be important features.

In [6]:
# Define the label -> integer encoding used for each categorical column
time_mapping = {'Lunch': 0, 'Dinner': 1}
sex_mapping = {'Male': 0, 'Female': 1}
day_mapping = {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun':3}
smoker_mapping = {'No': 0, 'Yes': 1}
column_mappings = {
    'time': time_mapping,
    'sex': sex_mapping,
    'day': day_mapping,
    'smoker': smoker_mapping,
}
# Replace each categorical column with its mapped integers.
# Columns that are already numeric are skipped, so re-running this cell is a
# no-op instead of mapping the integers a second time (which would turn every
# value into NaN because the integers are not keys of the mappings).
for column, mapping in column_mappings.items():
    if not pd.api.types.is_numeric_dtype(df[column]):
        # astype(int) yields a plain integer column (not a categorical one) and
        # raises if a label is missing from the mapping rather than leaving NaN.
        df[column] = df[column].map(mapping).astype(int)

In [7]:
df.sample(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
218,7.74,1.44,0,1,2,1,2,3.87
95,40.17,4.73,0,1,1,1,4,10.0425
136,10.33,2.0,1,0,0,0,2,5.165
21,20.29,2.75,1,0,2,1,2,10.145
43,9.68,1.32,0,0,3,1,2,4.84
159,16.49,2.0,0,0,3,1,4,4.1225
224,13.42,1.58,0,1,1,0,2,6.71
151,13.13,2.0,0,0,3,1,2,6.565
83,32.68,5.0,0,1,0,0,2,16.34
70,12.02,1.97,0,0,2,1,2,6.01


In [9]:
# Two-stage split: first hold out 20% of the rows as the test set, then split
# the remaining 80% into train (70% of it) and validate (30% of it).
# Overall that is roughly 56% train / 24% validate / 20% test.
# The same random_state is passed to both calls so the split is reproducible.
train_val, test = train_test_split(df, train_size=0.8, random_state=1349)
train, validate = train_test_split(train_val, train_size=0.7, random_state=1349)

In [10]:
train.shape, validate.shape, test.shape

((136, 8), (59, 8), (49, 8))

In [11]:
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
51,10.29,2.6,1,0,3,1,2,5.145
46,22.23,5.0,0,0,3,1,2,11.115
107,25.21,4.29,0,1,2,1,2,12.605
86,13.03,2.0,0,0,0,0,2,6.515
58,11.24,1.76,0,1,2,1,2,5.62


In [12]:
kbest_0 = SelectKBest(f_regression, k=2)

In [13]:
kbest_0

In [14]:
X_train, y_train = train.drop(columns='tip'), train.tip

In [15]:
X_train.columns

Index(['total_bill', 'sex', 'smoker', 'day', 'time', 'size',
       'price_per_person'],
      dtype='object')

In [16]:
# fit the object that we just created:
kbest_0.fit(X_train, y_train)

In [17]:
kbest_0.scores_

array([98.8998824 ,  1.42850653,  1.97543109,  4.39663297,  2.09207312,
       32.78883378, 23.49641256])

In [18]:
kbest_0.get_feature_names_out()

array(['total_bill', 'size'], dtype=object)

Recursive Feature Elimination:

Start with all of the features, fit the model, and repeatedly eliminate the weakest feature until only the requested number of features remains.

In [20]:
# make a model object for our wrapper:
model = LinearRegression()

In [21]:
# now that we have this linear regression model,
# we will also create an object for RFE
rfe = RFE(model, n_features_to_select=2)

In [22]:
rfe

In [23]:
# just like with every other sklearn object, we have created it, 
# and now we need to fit it
rfe.fit(X_train, y_train)

In [24]:
rfe.ranking_

array([5, 6, 4, 3, 2, 1, 1])

In [25]:
pd.DataFrame(
{
    'feature': X_train.columns.to_list(),
    'rfe_ranking': rfe.ranking_
    
})

Unnamed: 0,feature,rfe_ranking
0,total_bill,5
1,sex,6
2,smoker,4
3,day,3
4,time,2
5,size,1
6,price_per_person,1


In [None]:
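# Yes, changing the number of features to select changes which features are returned.
# The two methods can disagree because they rank features differently:
# Select K Best scores each feature on its own against the target (a univariate F-test),
# while RFE fits a model on all of the features together and repeatedly drops the weakest one,
# so its ranking depends on how the features behave in combination.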

Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [26]:
def select_kbest(X,y,k):
    """Return the names of the top k features chosen by SelectKBest.

    X: the predictors
    y: the target
    k: the number of features to select

    Features are scored with f_regression; the result is the array
    produced by get_feature_names_out() on the fitted selector.
    """
    # fit() returns the fitted selector itself, so the calls can be chained
    selector = SelectKBest(f_regression, k=k).fit(X, y)
    return selector.get_feature_names_out()

In [27]:
select_kbest(X_train, y_train, 2)

array(['total_bill', 'size'], dtype=object)

Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top n features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [28]:
def rfe(X,y,n):
    """Rank the features of X for predicting y using Recursive Feature Elimination.

    X: the predictors (its .columns are used as the feature names)
    y: the target
    n: the number of features to select

    Returns a dataframe with one row per feature and two columns:
    'feature' (the column name) and 'rfe_ranking' (the RFE ranking).
    The n selected features are the rows whose rfe_ranking is 1.
    """
    # wrap a linear regression in RFE and fit it; fit() returns the fitted selector
    selector = RFE(LinearRegression(), n_features_to_select=n).fit(X, y)

    # pair every feature name with its ranking so the result is easy to read
    ranking_table = pd.DataFrame(
        {
            'feature': list(X.columns),
            'rfe_ranking': selector.ranking_,
        }
    )
    return ranking_table

In [29]:
rfe(X_train, y_train, 2)

Unnamed: 0,feature,rfe_ranking
0,total_bill,5
1,sex,6
2,smoker,4
3,day,3
4,time,2
5,size,1
6,price_per_person,1


Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both Select K Best and Recursive Feature Elimination (use the functions you just built to help you out).

In [35]:
df_swiss = data("swiss")
df_swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [33]:
# Two-stage split of df_swiss: hold out 20% as test, then split the remaining
# 80% into train (70% of it) and validate (30% of it).
# NOTE(review): this rebinds train_val, test, train and validate, which were
# already assigned from the tips data earlier in the notebook; after this cell
# those names refer to the swiss splits.
train_val, test = train_test_split(df_swiss, train_size=0.8, random_state=1349)
train, validate = train_test_split(train_val, train_size=0.7, random_state=1349)

In [34]:
train.shape, validate.shape, test.shape

((25, 6), (12, 6), (10, 6))

In [38]:
X_train_S, y_train_S = train.drop(columns='Fertility'), train.Fertility

In [39]:
X_train_S.columns

Index(['Agriculture', 'Examination', 'Education', 'Catholic',
       'Infant.Mortality'],
      dtype='object')

In [40]:
select_kbest(X_train_S, y_train_S, 3)

array(['Agriculture', 'Examination', 'Education'], dtype=object)

In [41]:
rfe(X_train_S, y_train_S, 3)

Unnamed: 0,feature,rfe_ranking
0,Agriculture,2
1,Examination,1
2,Education,1
3,Catholic,3
4,Infant.Mortality,1
