In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from pydataset import data
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

Load the tips dataset.

- Create a column named price_per_person. This should be the total bill divided by the party size.
- Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
- Use select k best to select the top 2 features for predicting tip amount. What are they?
- Use recursive feature elimination to select the top 2 features for tip amount. What are they?
- Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?
- Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

- Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

- Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [2]:
# Load the 'tips' dataset through pydataset's data() helper and preview it.
tips = data('tips')

tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Each guest's share of the bill: total bill divided by party size,
# rounded to two decimal places.
tips['price_per_person'] = tips['total_bill'].div(tips['size']).round(2)

tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,3.45
3,21.01,3.5,Male,No,Sun,Dinner,3,7.0
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.15


In [4]:
# Encode the categorical columns numerically so the models can consume them.
# `time` and `day` are one-hot encoded; drop_first=False keeps every level,
# so the dummy columns of each variable are redundant with one another.
dummy_df = pd.get_dummies(
    tips[['time', 'day']],
    dummy_na=False,
    drop_first=False,
)

# Append the dummy columns, then add 0/1 codes for the two binary columns.
tips = pd.concat([tips, dummy_df], axis=1).assign(
    sex_encoded=lambda df: df['sex'].map({'Male': 1, 'Female': 0}),
    smoker_encoded=lambda df: df['smoker'].map({'Yes': 1, 'No': 0}),
)

tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,time_Dinner,time_Lunch,day_Fri,day_Sat,day_Sun,day_Thur,sex_encoded,smoker_encoded
1,16.99,1.01,Female,No,Sun,Dinner,2,8.49,1,0,0,0,1,0,0,0
2,10.34,1.66,Male,No,Sun,Dinner,3,3.45,1,0,0,0,1,0,1,0
3,21.01,3.5,Male,No,Sun,Dinner,3,7.0,1,0,0,0,1,0,1,0
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84,1,0,0,0,1,0,1,0
5,24.59,3.61,Female,No,Sun,Dinner,4,6.15,1,0,0,0,1,0,0,0


In [5]:
# The raw categorical columns are now covered by their encoded counterparts.
tips = tips.drop(['time', 'day', 'smoker', 'sex'], axis='columns')

tips.head()

Unnamed: 0,total_bill,tip,size,price_per_person,time_Dinner,time_Lunch,day_Fri,day_Sat,day_Sun,day_Thur,sex_encoded,smoker_encoded
1,16.99,1.01,2,8.49,1,0,0,0,1,0,0,0
2,10.34,1.66,3,3.45,1,0,0,0,1,0,1,0
3,21.01,3.5,3,7.0,1,0,0,0,1,0,1,0
4,23.68,3.31,2,11.84,1,0,0,0,1,0,1,0
5,24.59,3.61,4,6.15,1,0,0,0,1,0,0,0


In [7]:
# Split the tips dataframe into train / validate / test.
# First hold out 20% of the rows as the test set ...
train_validate, test = train_test_split(tips, test_size=.2, random_state=123)
# ... then hold out 30% of what remains as the validate set.
# random_state is fixed on both calls so the split is reproducible.
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)
train.shape, validate.shape, test.shape

((136, 12), (59, 12), (49, 12))

In [9]:
# Separate the predictors (x_*) from the target column `tip` (y_*) per split.
x_train, y_train = train.drop(columns='tip'), train['tip']

x_validate, y_validate = validate.drop(columns='tip'), validate['tip']

x_test, y_test = test.drop(columns='tip'), test['tip']

In [10]:

# Score each feature against the target with f_regression and keep the best 2.
kbest = SelectKBest(score_func=f_regression, k=2)
kbest.fit(X=x_train, y=y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x13e7d2f70>)

In [11]:
# Tabulate each feature's p-value and F statistic, labelled by feature name.
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=x_train.columns,
)
kbest_results

Unnamed: 0,p,f
total_bill,7.18647e-20,115.984909
size,1.341642e-12,61.259089
price_per_person,0.001306594,10.783502
time_Dinner,0.1821449,1.798647
time_Lunch,0.1821449,1.798647
day_Fri,0.4068238,0.692418
day_Sat,0.9550468,0.00319
day_Sun,0.1236625,2.400404
day_Thur,0.3013774,1.07638
sex_encoded,0.2844794,1.154792


In [12]:
# Names of the columns SelectKBest kept, picked out with its support mask.
x_train.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

In [13]:
# transform() returns a bare array; re-attach the row index and the names of
# the selected columns so the result is a labelled dataframe again.
x_train_transformed = pd.DataFrame(
    data=kbest.transform(x_train),
    columns=x_train.columns[kbest.get_support()],
    index=x_train.index,
)
x_train_transformed.head()


Unnamed: 0,total_bill,size
19,16.97,3.0
173,7.25,2.0
119,12.43,2.0
29,21.7,2.0
238,32.83,2.0


### Recursive Feature Elimination

In [14]:
# Recursive feature elimination around a linear regression, keeping 2 features.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=2)
rfe.fit(X=x_train, y=y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [15]:
# RFE rank of every feature; the selected features are the ones ranked 1.
pd.DataFrame({'rfe_ranking': rfe.ranking_}, index=x_train.columns)

Unnamed: 0,rfe_ranking
total_bill,5
size,1
price_per_person,3
time_Dinner,9
time_Lunch,10
day_Fri,6
day_Sat,4
day_Sun,8
day_Thur,1
sex_encoded,2


In [16]:
# Names of the columns RFE kept, picked out with its support mask.
x_train.columns[rfe.get_support()]

Index(['size', 'day_Thur'], dtype='object')

In [17]:
# Same re-labelling as for SelectKBest: wrap the transformed array in a
# dataframe carrying the original row index and the selected column names.
x_train_transformed = pd.DataFrame(
    data=rfe.transform(x_train),
    columns=x_train.columns[rfe.get_support()],
    index=x_train.index,
)
x_train_transformed.head()

Unnamed: 0,size,day_Thur
19,3.0,0.0
173,2.0,0.0
119,2.0,1.0
29,2.0,0.0
238,2.0,0.0


 Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [23]:
def select_kbest(x, y, k):
    """Return the names of the top ``k`` features chosen by SelectKBest.

    Each predictor is scored against the target with ``f_regression`` and the
    ``k`` highest-scoring predictors are kept.

    Parameters
    ----------
    x : pandas.DataFrame
        Predictors. The column labels are used to name the selected features.
    y : pandas.Series or single-column pandas.DataFrame
        Target values, one per row of ``x``.
    k : int
        Number of features to select.

    Returns
    -------
    pandas.Index
        Labels of the selected columns, in their original column order.
    """
    # scikit-learn wants a 1-D target. A single-column dataframe still works
    # but emits a DataConversionWarning on every call, so squeeze it up front.
    if isinstance(y, pd.DataFrame) and y.shape[1] == 1:
        y = y.iloc[:, 0]

    selector = SelectKBest(f_regression, k=k)
    selector.fit(x, y)
    return x.columns[selector.get_support()]


In [24]:
# Check the helper against the manual SelectKBest run on the tips training set.
select_kbest(x_train, y_train, 2)

Index(['total_bill', 'size'], dtype='object')

In [26]:
def select_rfe(x, y, k, estimator=None):
    """Return the names of the top ``k`` features chosen by RFE.

    Parameters
    ----------
    x : pandas.DataFrame
        Predictors. The column labels are used to name the selected features.
    y : pandas.Series or single-column pandas.DataFrame
        Target values, one per row of ``x``.
    k : int
        Number of features to select.
    estimator : estimator object, optional
        Model that RFE fits to rank the features. Defaults to a fresh
        ``LinearRegression()``, which was the original hard-coded choice.

    Returns
    -------
    pandas.Index
        Labels of the selected columns, in their original column order.
    """
    # Build the default per call so no fitted model is shared between calls.
    if estimator is None:
        estimator = LinearRegression()

    # Squeeze a single-column dataframe target to 1-D, matching select_kbest.
    if isinstance(y, pd.DataFrame) and y.shape[1] == 1:
        y = y.iloc[:, 0]

    selector = RFE(estimator, n_features_to_select=k)
    selector.fit(x, y)
    return x.columns[selector.get_support()]

In [27]:
# Check the helper against the manual RFE run on the tips training set.
select_rfe(x_train, y_train, 2)

Index(['size', 'day_Thur'], dtype='object')

In [28]:
# Load the 'swiss' dataset through pydataset's data() helper and preview it.
swiss = data('swiss')

swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [30]:
# Hold out 20% of the swiss rows as a test set; random_state fixes the split.
swiss_train, swiss_test = train_test_split(swiss, test_size=.2, random_state=123)
# No separate validate split is made for this dataset.
swiss_train.shape, swiss_test.shape

((37, 6), (10, 6))

In [35]:
# Separate the predictors from the target column `Fertility` in each split.
x_train_swiss = swiss_train.drop(columns=['Fertility'])
# Single brackets give a 1-D Series, which is what scikit-learn expects for a
# target; a one-column dataframe makes it emit a DataConversionWarning.
y_train_swiss = swiss_train['Fertility']

x_test_swiss = swiss_test.drop(columns=['Fertility'])
y_test_swiss = swiss_test['Fertility']

In [36]:
# Top 3 predictors of Fertility according to SelectKBest.
select_kbest(x_train_swiss, y_train_swiss, 3)

  y = column_or_1d(y, warn=True)


Index(['Examination', 'Education', 'Catholic'], dtype='object')

In [37]:
# Top 3 predictors of Fertility according to recursive feature elimination.
select_rfe(x_train_swiss, y_train_swiss, 3)

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')