In [120]:
import pandas as pd
import numpy as np
import pydataset

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
warnings.simplefilter("ignore")

In [121]:
tips = pydataset.data("tips")

In [122]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [123]:
tips.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

#### Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [124]:
tips["tip_percentage"] = tips.tip / tips.total_bill

In [125]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


#### Create a column named price_per_person. This should be the total bill divided by the party size.

In [126]:
tips.size

1952

In [127]:
tips["price_per_person"] = tips.total_bill / tips["size"]

In [128]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


#### Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

In [129]:
# im going to guess the order will be 1. tip_percentage, 2. total_bill, 3. price_per_person

#### Use select k best and recursive feature elimination to select the top 2 features for predicting tip amount. What are they?

In [130]:
tips['smoker'] = (tips.smoker == 'Yes').astype(int)
tips['dinner'] = (tips.time == 'Dinner').astype(int)

In [131]:
X = tips[['total_bill',"dinner","smoker","price_per_person"]]
y = tips.tip

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [132]:
rfe = RFE(estimator = LinearRegression(), n_features_to_select = 2)
rfe.fit(X_train_scaled,y_train)
rfe.get_support()

array([ True, False, False,  True])

In [133]:
rfe.ranking_

array([1, 3, 2, 1])

In [134]:
X_train.columns[rfe.get_support()]

Index(['total_bill', 'price_per_person'], dtype='object')

In [135]:
kbest = SelectKBest(f_regression, k=2)
kbest.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fa8c0c55940>)

In [136]:
X_train.columns[kbest.get_support()]

Index(['total_bill', 'price_per_person'], dtype='object')

#### Use select k best and recursive feature elimination to select the top 2 features for predicting tip percentage. What are they?

In [137]:
tips['smoker'] = (tips.smoker == 'Yes').astype(int)
tips['dinner'] = (tips.time == 'Dinner').astype(int)

In [138]:
X = tips[['total_bill',"dinner","smoker","price_per_person"]]
y = tips.tip_percentage

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [139]:
kbest = SelectKBest(f_regression, k=3)
kbest.fit(X_train_scaled, y_train)

SelectKBest(k=3, score_func=<function f_regression at 0x7fa8c0c55940>)

In [140]:
X_train.columns[kbest.get_support()]

Index(['total_bill', 'dinner', 'price_per_person'], dtype='object')

In [141]:
rfe = RFE(estimator = LinearRegression(), n_features_to_select = 3)
rfe.fit(X_train_scaled,y_train)
rfe.get_support()

array([ True,  True, False,  True])

In [142]:
rfe.ranking_

array([1, 1, 2, 1])

In [143]:
X_train.columns[rfe.get_support()]

Index(['total_bill', 'dinner', 'price_per_person'], dtype='object')

#### Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

In [144]:
# i think the difference is the way the two functions work, select k best takes the top features at that point in time,
# while the rfe regenerates the dataset each time and checks the features each time it selects a different feature.

#### Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [152]:
def select_kbest(X,y,k):
    kbest = SelectKBest(f_regression, k=k)
    kbest.fit(X,y)
    return X.columns[kbest.get_support()].to_list()

#### Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [158]:
def rfe(X,y,n):
    recursive_feature_elimination = RFE(estimator = LinearRegression(), n_features_to_select = n)
    recursive_feature_elimination.fit(X,y)
    return X_train.columns[recursive_feature_elimination.get_support()].to_list()

#### Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [159]:
df = pydataset.data("swiss")

In [160]:
X = df[["Agriculture","Examination","Education","Catholic","Infant.Mortality"]]
y = df.Fertility

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [161]:
select_kbest(X_train,y_train,3)

['Examination', 'Education', 'Catholic']

In [162]:
rfe(X_train,y_train,3)

['Examination', 'Education', 'Infant.Mortality']