In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import pydataset
import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
tips = pydataset.data('tips')
tips['smoker'] = (tips.smoker == 'Yes').astype(int)
tips['dinner'] = (tips.time == 'Dinner').astype(int)
tips['tip_percentage'] = tips['tip']/tips['total_bill']
tips['price_per_person'] = tips['total_bill']/tips['size']

In [3]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,dinner,tip_percentage,price_per_person
1,16.99,1.01,Female,0,Sun,Dinner,2,1,0.059447,8.495
2,10.34,1.66,Male,0,Sun,Dinner,3,1,0.160542,3.446667
3,21.01,3.5,Male,0,Sun,Dinner,3,1,0.166587,7.003333
4,23.68,3.31,Male,0,Sun,Dinner,2,1,0.13978,11.84
5,24.59,3.61,Female,0,Sun,Dinner,4,1,0.146808,6.1475


In [4]:
# Convert non-numeric columns to numeric
for column in tips.columns:
    if tips[column].dtype == np.number:
        continue
    tips[column] = LabelEncoder().fit_transform(tips[column])

In [5]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,dinner,tip_percentage,price_per_person
1,16.99,1.01,0,0,2,0,1,1,0.059447,8.495
2,10.34,1.66,1,0,2,0,2,1,0.160542,3.446667
3,21.01,3.5,1,0,2,0,2,1,0.166587,7.003333
4,23.68,3.31,1,0,2,0,1,1,0.13978,11.84
5,24.59,3.61,0,0,2,0,3,1,0.146808,6.1475


In [19]:
# Using K best to predict tip amount
X = tips.drop(columns=['tip','tip_percentage'])
y = tips.tip

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

kbest = SelectKBest(f_regression, k=2)
kbest.fit(X_train_scaled, y_train)
X_train.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

In [20]:
# Using RFE to predict tip amount
rfe = RFE(estimator=LinearRegression(), n_features_to_select=2)
rfe.fit(X_train_scaled, y_train)
X_train.columns[rfe.get_support()]

Index(['total_bill', 'price_per_person'], dtype='object')

In [21]:
# Using K best to predict tip_percentage
X = tips.drop(columns=['tip','tip_percentage'])
y = tips.tip_percentage

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

kbest = SelectKBest(f_regression, k=2)
kbest.fit(X_train_scaled, y_train)
X_train.columns[kbest.get_support()]

Index(['total_bill', 'price_per_person'], dtype='object')

In [22]:
# Using RFE to predict tip_percentage
rfe = RFE(estimator=LinearRegression(), n_features_to_select=2)
rfe.fit(X_train_scaled, y_train)
X_train.columns[rfe.get_support()]

Index(['size', 'price_per_person'], dtype='object')

K best and RFE give slightly different answers when k/n_features = 2 - 3 but are the same when > 4

In [23]:
# Write a function named select_kbest
def select_kbest(X,y,k):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    kbest = SelectKBest(f_regression, k=k)
    kbest.fit(X_train_scaled, y_train)
    return X_train.columns[kbest.get_support()]

In [24]:
select_kbest(X,y,2)

Index(['total_bill', 'price_per_person'], dtype='object')

In [25]:
# Write a function named rfe
def rfe(X,y,k):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    rfe = RFE(estimator=LinearRegression(), n_features_to_select=k)
    rfe.fit(X_train_scaled, y_train)
    return X_train.columns[rfe.get_support()]

In [26]:
rfe(X,y,2)

Index(['size', 'price_per_person'], dtype='object')

In [27]:
swiss = pydataset.data('swiss')

In [28]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [29]:
X = swiss.drop(columns='Fertility')
y = swiss.Fertility

In [30]:
select_kbest(X,y,3)

Index(['Examination', 'Education', 'Catholic'], dtype='object')

In [31]:
rfe(X,y,3)

Index(['Agriculture', 'Education', 'Catholic'], dtype='object')