In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

## 1. Load the tips dataset

In [6]:
from pydataset import data
tips = data('tips')
tips.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3


### a) Create a column named price_per_person. This should be the total bill divided by the party size.

In [11]:
tips['price_per_person'] = tips.total_bill/tips['size']
tips.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667


### b) Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

Total Bill - pretty obvious since most tips are based off a percentage of this.

Time - Dinner is usually a bigger meal and probably has higher tips as well.

### c) Use select k best to select the top 2 features for predicting tip amount. What are they?

In [13]:
from splitter import splitter
train, validate, test = splitter(tips)
train.head(2)

Train = 136 rows (56.0%) | Validate = 59 rows (24.0%) | Test = 49 rows (20.0%)
You did not stratify.  If looking to stratify, ensure to add argument: "target = variable to stratify on".


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
19,16.97,3.5,Female,No,Sun,Dinner,3,5.656667
173,7.25,5.15,Male,Yes,Sun,Dinner,2,3.625


In [16]:
train_d = pd.get_dummies(train[['sex','smoker','day','time']], dummy_na=False, drop_first=False)
validate_d = pd.get_dummies(validate[['sex','smoker','day','time']], dummy_na=False, drop_first=False)
test_d = pd.get_dummies(test[['sex','smoker','day','time']], dummy_na=False, drop_first=False)

In [19]:
train = pd.concat([train.drop(columns = ['sex','smoker','day','time']), train_d], axis=1)
validate = pd.concat([validate.drop(columns = ['sex','smoker','day','time']), validate_d], axis=1)
test = pd.concat([test.drop(columns = ['sex','smoker','day','time']), test_d], axis=1)
train

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,...,sex_Female.1,sex_Male.1,smoker_No.1,smoker_Yes.1,day_Fri.1,day_Sat.1,day_Sun,day_Thur,time_Dinner,time_Lunch
19,16.97,3.5,3,5.656667,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,1,0
173,7.25,5.15,2,3.625,0,1,0,1,0,0,...,0,1,0,1,0,0,1,0,1,0
119,12.43,1.8,2,6.215,1,0,1,0,0,0,...,1,0,1,0,0,0,0,1,0,1
29,21.7,4.3,2,10.85,0,1,1,0,0,1,...,0,1,1,0,0,1,0,0,1,0
238,32.83,1.17,2,16.415,0,1,0,1,0,1,...,0,1,0,1,0,1,0,0,1,0
208,38.73,3.0,4,9.6825,0,1,0,1,0,1,...,0,1,0,1,0,1,0,0,1,0
184,23.17,6.5,4,5.7925,0,1,0,1,0,0,...,0,1,0,1,0,0,1,0,1,0
61,20.29,3.21,2,10.145,0,1,0,1,0,1,...,0,1,0,1,0,1,0,0,1,0
42,17.46,2.54,2,8.73,0,1,1,0,0,0,...,0,1,1,0,0,0,1,0,1,0
161,21.5,3.5,4,5.375,0,1,1,0,0,0,...,0,1,1,0,0,0,1,0,1,0


In [20]:
X_train = train.drop(columns='tip')
y_train = train.tip

X_validate = validate.drop(columns='tip')
y_validate = validate.tip

X_test = test.drop(columns='tip')
y_test = test.tip

In [22]:
from sklearn.feature_selection import SelectKBest, f_regression

# parameters: f_regression stats test, give me 8 features
f_selector = SelectKBest(f_regression, k=2)

# find the top 8 X's correlated with y
f_selector.fit(X_train, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

# get list of top K features. 
f_feature = X_train.iloc[:,feature_mask].columns.tolist()
f_feature

['total_bill', 'size']

Total Bill and Size.

### d) Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select=2)

# fit the data using RFE
rfe.fit(X_train,y_train)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_train.iloc[:,feature_mask].columns.tolist()
rfe_feature

['size', 'day_Thur']

Interestingly, the to two features here are size and it being a Thursday, not total_amount!

In [25]:
# view list of columns and their ranking

# get the ranks
var_ranks = rfe.ranking_
# get the variable names
var_names = X_train.columns.tolist()
# combine ranks and names into a df for clean viewing
rfe_ranks_df = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
# sort the df by rank
rfe_ranks_df.sort_values('Rank')

Unnamed: 0,Var,Rank
1,size,1
20,day_Thur,1
14,sex_Male,2
2,price_per_person,3
8,day_Sat,4
0,total_bill,5
13,sex_Female,6
10,day_Thur,7
17,day_Fri,8
16,smoker_Yes,9


### e) Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

K-best is a 'one-time' correlation analysis: it looks at everything once and picks the best from that. A 'top-down' approach.

RFE is recursive, meaning it updates and repeats itself.  So that once a feature is removed, it take another look at the data, and over and over again.  A more 'bottom-up' approach.

## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [26]:
# import numpy as np
# import pandas as pd
# from sklearn.feature_selection import SelectKBest, f_regression
def select_kbest(X_train,y_train,k):
    # parameters: f_regression stats test, give me k features
    f_selector = SelectKBest(f_regression, k=k)
    f_selector.fit(X_train, y_train) 
    feature_mask = f_selector.get_support()
    f_feature = X_train.iloc[:,feature_mask].columns.tolist()
    return f_feature
 
select_kbest(X_train,y_train,2)

['total_bill', 'size']

## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [34]:
# import numpy as np
# import pandas as pd
# from sklearn.linear_model import LinearRegression
# from sklearn.feature_selection import RFE
def rfe(X_train, y_train, k, model_type = LinearRegression()):
    rfe = RFE(model_type, n_features_to_select=k)
    rfe.fit(X_train,y_train)  
    feature_mask = rfe.support_
    rfe_feature = X_train.iloc[:,feature_mask].columns.tolist()
    return rfe_feature

rfe(X_train, y_train,2)

['size', 'day_Thur']

## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [37]:
from pydataset import data
swiss = data('swiss')
train,validate,test = splitter(swiss)

Train = 25 rows (56.0%) | Validate = 12 rows (24.0%) | Test = 10 rows (20.0%)
You did not stratify.  If looking to stratify, ensure to add argument: "target = variable to stratify on".


In [41]:
train

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Rolle,60.5,60.8,16,10,7.72,16.3
Lavaux,65.1,73.0,19,9,2.84,20.0
Nyone,56.6,50.9,22,12,15.14,16.7
Conthey,75.5,85.9,3,2,99.71,15.1
Yverdon,65.4,49.5,15,8,6.1,22.5
Oron,72.5,71.2,12,1,2.4,21.0
Cossonay,61.7,69.3,22,5,2.82,18.7
St Maurice,65.0,75.9,9,9,99.06,17.8
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Orbe,57.4,54.1,20,6,4.2,15.3


In [42]:
X_train = train.drop(columns='Fertility')
y_train = train.Fertility

X_validate = validate.drop(columns='Fertility')
y_validate = validate.Fertility

X_test = test.drop(columns='Fertility')
y_test = test.Fertility

In [43]:
select_kbest(X_train,y_train,3)

['Examination', 'Catholic', 'Infant.Mortality']

In [44]:
rfe(X_train, y_train,3)

['Agriculture', 'Examination', 'Infant.Mortality']