In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import math

import wrangle
import pydataset

### 1
- Load the tips dataset.
    - Create a column named `price_per_person`. This should be the total bill divided by the party size.
    - Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
    - Use select k best to select the top 2 features for predicting tip amount. What are they?
    - Use recursive feature elimination to select the top 2 features for tip amount. What are they?
    - Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?



In [21]:
tips = pydataset.data('tips')
tips['price_per_person'] = tips.total_bill/tips['size']
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


In [24]:
tips.columns.tolist()

['total_bill',
 'tip',
 'sex',
 'smoker',
 'day',
 'time',
 'size',
 'price_per_person']

> I think `total_bill` and `price_per_person` will be the best features

In [22]:
tips['smoker'] = (tips.smoker == 'Yes')
tips = pd.get_dummies(tips, columns=[ 'sex',
                                'smoker',
                                'day',
                                'time',])

In [27]:
tips_y = tips.tip
tips_x = tips.drop(columns=['tip'])

In [43]:
tips_x.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Female,sex_Male,smoker_False,smoker_True,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
1,16.99,2,8.495,1,0,1,0,0,0,1,0,1,0
2,10.34,3,3.446667,0,1,1,0,0,0,1,0,1,0
3,21.01,3,7.003333,0,1,1,0,0,0,1,0,1,0
4,23.68,2,11.84,0,1,1,0,0,0,1,0,1,0
5,24.59,4,6.1475,1,0,1,0,0,0,1,0,1,0


In [49]:
from sklearn.preprocessing import MinMaxScaler

cols_to_scale = ['total_bill','size','price_per_person']

scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(tips_x), index=tips_x.index, columns=tips_x.columns )
X.head()


Unnamed: 0,total_bill,size,price_per_person,sex_Female,sex_Male,smoker_False,smoker_True,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
1,0.291579,0.2,0.322989,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.152283,0.4,0.032854,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.375786,0.4,0.237261,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.431713,0.2,0.51523,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
5,0.450775,0.6,0.188075,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


#### Select K Best

In [56]:
from sklearn.feature_selection import SelectKBest, f_regression, RFE

kbest = SelectKBest(f_regression, k=3)

kbest.fit(X,tips_y)

tips_x.columns[kbest.get_support()]

Index(['total_bill', 'size', 'price_per_person'], dtype='object')

Index(['total_bill', 'size'], dtype='object')

#### Recursive Feature Elimination

In [55]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
selector  = RFE(model, n_features_to_select=2)

selector.fit(X, tips_y)
X.columns[selector.support_]

Index(['total_bill', 'price_per_person'], dtype='object')

> I think they give different answers because one is experimental in nature, and the other is algorithmic

### 2
Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.



In [57]:
from sklearn.feature_selection import SelectKBest, f_regression
def select_kbest(X, y, k):
    kbest = SelectKBest(k=k)
    kbest.fit(X, y)
    mask = kbest.get_support()
    return X.columns[mask]

In [58]:
y = tips_y

select_kbest(X,y,3)

Index(['total_bill', 'size', 'price_per_person'], dtype='object')

### 3
Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [59]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

def rfe(X, y, k, model=LinearRegression()):
    rfe = RFE(model, n_features_to_select=k)
    rfe.fit(X, y)
    mask = rfe.get_support()
    return X.columns[mask]

In [61]:
rfe(X,y,2)

Index(['total_bill', 'price_per_person'], dtype='object')


### 4
Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).



In [63]:
swiss = pydataset.data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [64]:
from sklearn.preprocessing import MinMaxScaler
def min_max_scale_df(df, cols=None):
    
    if cols == None:
        cols = df.columns.tolist()
    df = df[cols]
    
    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(df), index=df.index, columns=df.columns )
    
    return X



In [68]:
y = swiss.Fertility
x = swiss.drop(columns=['Fertility'])

X = min_max_scale_df(x)

In [70]:
print(select_kbest(X, y, 3))
print(rfe(X, y, 3))

Index(['Agriculture', 'Examination', 'Education'], dtype='object')
Index(['Agriculture', 'Education', 'Infant.Mortality'], dtype='object')
