In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Data handling
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn modules and features
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.model_selection import train_test_split

# Data acquisition
from pydataset import data
from sklearn.preprocessing import MinMaxScaler



# 1

Load the tips dataset.

    a - Create a column named price_per_person. This should be the total bill divided by the party size.
    b - Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
    c - Use select k best to select the top 2 features for predicting tip amount. What are they?
    d - Use recursive feature elimination to select the top 2 features for tip amount. What are they?
    e - Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?



In [2]:
df = data('tips')

In [3]:
# a - price per person = total bill divided by party size.
# Bracket access is required for 'size': `df.size` is the DataFrame's own
# attribute (number of elements), not the column.
df['price_per_person'] = (df.total_bill/df['size'])

In [4]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


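#b
Before running any feature-selection method, my guess for the most important predictors of tip amount:
total_bill 
size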


# 1 c
clean:

In [5]:
# One-hot encode the four categorical columns (dropping the first level of
# each so the dummies are not redundant) and append them next to the
# original columns.
dummy_df = pd.get_dummies(
    df.loc[:, ['sex', 'smoker', 'day', 'time']],
    drop_first=True,
    dummy_na=False,
)
df = pd.concat(objs=[df, dummy_df], axis=1)

In [6]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495,0,0,0,1,0,0
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667,1,0,0,1,0,0
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333,1,0,0,1,0,0
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84,1,0,0,1,0,0
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,0,0,0,1,0,0


In [7]:
df = df.drop(columns = ['sex', 'smoker', 'day', 'time'])

In [8]:
df.head(3)

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,2,8.495,0,0,0,1,0,0
2,10.34,1.66,3,3.446667,1,0,0,1,0,0
3,21.01,3.5,3,7.003333,1,0,0,1,0,0


In [9]:
# Two-stage split with a fixed seed so the partitions are reproducible:
#   1) 80% train+validate / 20% test
#   2) 70% of train+validate becomes train, the remaining 30% validate
# Net effect: roughly 56% train / 24% validate / 20% test of the original rows.
train_validate, test = train_test_split(df, random_state=123, train_size=.8)
train, validate = train_test_split(train_validate, random_state=123, train_size=.7)

In [10]:
# Separate predictors (everything except 'tip') from the target ('tip')
# for each of the three partitions.
X_train, y_train = train.drop(columns='tip'), train['tip']
X_validate, y_validate = validate.drop(columns='tip'), validate['tip']
X_test, y_test = test.drop(columns='tip'), test['tip']

In [11]:
# Following the class review, rescale total_bill and price_per_person with a
# min-max scaler fit on the training split only. 'size' and the dummy columns
# are left as they are, and X_validate / X_test are not transformed here.
mms = MinMaxScaler()
X_train[['total_bill', 'price_per_person']] = mms.fit_transform(
    X_train[['total_bill', 'price_per_person']]
)

X_train.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
19,0.307114,3,0.150344,0,0,0,1,0,0
173,0.092355,2,0.032258,1,1,0,1,0,0
119,0.206805,2,0.182796,0,0,0,0,1,1
29,0.411622,2,0.452194,1,0,1,0,0,0
238,0.657534,2,0.775647,1,1,1,0,0,0


In [12]:
# Select hyperparameters and fit to the training data.
# f_regression scores each predictor on its own against the target; k=2 keeps
# the two highest-scoring predictors.
kbest = SelectKBest(f_regression, k=2)
kbest.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x13d9ff9d0>)

In [13]:
# Top two features

In [14]:
X_train.columns[kbest.get_support()] 

Index(['total_bill', 'size'], dtype='object')

In [15]:
#class also chose to use dummy variables for size

In [16]:
kbest.get_support()

array([ True,  True, False, False, False, False, False, False, False])

# d - 
Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [17]:
from sklearn.linear_model import LinearRegression

In [18]:
# Recursive feature elimination wrapped around a linear regression:
# starting from all predictors, features are pruned until only
# n_features_to_select (here 2) remain.
model = LinearRegression()
rfe = RFE(model, n_features_to_select=2)
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [19]:
# Top two

In [20]:
X_train.columns[rfe.get_support()]

Index(['total_bill', 'price_per_person'], dtype='object')

# e - 
Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting? Yes, the two methods can also disagree for other values of k (see the k=3 comparison below).

The way they sort through features is different. Select k best evaluates each feature individually against the target variable and picks the ones that most closely correlate with the target.

RFE starts with all the features combined, then drops one feature at a time. It is a greedy algorithm, so it can't think ahead; it only considers the single step in front of it. It doesn't test every combination of features and give you the best one; it just drops the least valuable feature as it goes.

In [21]:
# Same RFE setup as before, now keeping three features, to see how the
# selection changes with the requested feature count.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=3).fit(X_train, y_train)
X_train.columns[rfe.support_]

Index(['total_bill', 'price_per_person', 'sex_Male'], dtype='object')

In [22]:
# Same SelectKBest setup as before, now keeping three features, for a
# side-by-side comparison with the RFE result.
kbest = SelectKBest(score_func=f_regression, k=3).fit(X_train, y_train)
X_train.columns[kbest.get_support()]

Index(['total_bill', 'size', 'price_per_person'], dtype='object')

In [23]:
# Pair every predictor name with the rank RFE assigned to it
# (rank 1 = kept; larger ranks were eliminated earlier).
var_names = list(X_train.columns)
var_ranks = rfe.ranking_
rfe_ranks_df = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
# Display the table ordered from most to least important.
rfe_ranks_df.sort_values(by='Rank')

Unnamed: 0,Var,Rank
0,total_bill,1
2,price_per_person,1
3,sex_Male,1
7,day_Thur,2
5,day_Sat,3
1,size,4
6,day_Sun,5
4,smoker_Yes,6
8,time_Lunch,7


# 2
Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [24]:
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

def select_kbest(x, y, n):
    """Return the column labels of the top ``n`` predictors chosen by SelectKBest.

    Parameters
    ----------
    x : predictors; must expose ``.columns`` (e.g. a pandas DataFrame).
    y : target values aligned with the rows of ``x``.
    n : number of features to keep (passed to SelectKBest as ``k``).

    Returns
    -------
    The entries of ``x.columns`` flagged by the fitted selector's support mask.
    """
    selector = SelectKBest(score_func=f_regression, k=n)
    selector.fit(x, y)
    support_mask = selector.get_support()
    return x.columns[support_mask]



In [25]:
#  test

In [26]:
select_kbest(X_train, y_train, 2)

Index(['total_bill', 'size'], dtype='object')

# 3
Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [27]:
model = LinearRegression()

def rfe_lin_reg(x, y, k):
    """Return the column labels of the ``k`` predictors kept by RFE.

    Recursive feature elimination is run with a fresh LinearRegression
    as the underlying estimator.

    Parameters
    ----------
    x : predictors; must expose ``.columns`` (e.g. a pandas DataFrame).
    y : target values aligned with the rows of ``x``.
    k : number of features to keep (``n_features_to_select``).

    Returns
    -------
    The entries of ``x.columns`` flagged by the fitted RFE support mask.
    """
    eliminator = RFE(estimator=LinearRegression(), n_features_to_select=k)
    eliminator.fit(x, y)
    return x.columns[eliminator.get_support()]

In [28]:
rfe_lin_reg(X_train, y_train, 3)

Index(['total_bill', 'price_per_person', 'sex_Male'], dtype='object')

# 4
Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [29]:
df = data('swiss')

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [31]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [32]:
# Same seeded two-stage split as used for tips: 80/20 into train+validate and
# test, then 70/30 of train+validate into train and validate.
# NOTE: swiss has only 47 rows (see df.info() above), so each partition is small.
train_validate, test = train_test_split(df, random_state=123, train_size=.8)
train, validate = train_test_split(train_validate, random_state=123, train_size=.7)

In [33]:
# Separate predictors (everything except 'Fertility') from the target
# ('Fertility') for each of the three partitions.
X_train, y_train = train.drop(columns='Fertility'), train['Fertility']
X_validate, y_validate = validate.drop(columns='Fertility'), validate['Fertility']
X_test, y_test = test.drop(columns='Fertility'), test['Fertility']

In [34]:
# The class solution scaled the swiss predictors as well; this attempt is left
# disabled because it raised an error.
# Likely cause (TODO confirm): fit_transform returns a NumPy array by default,
# so the reassigned X_train no longer has .head(), and the selector helpers
# used below need x.columns. Assigning the result back into a DataFrame that
# keeps X_train's columns and index should avoid this.

# mms2 =  MinMaxScaler()

# X_train = mms2.fit_transform(X_train)

# X_train.head()

In [35]:
select_kbest(X_train, y_train, 3)

Index(['Examination', 'Catholic', 'Infant.Mortality'], dtype='object')

In [36]:
rfe_lin_reg(X_train, y_train, 3)

Index(['Agriculture', 'Examination', 'Infant.Mortality'], dtype='object')

# Extra

In [37]:
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

def select_kbest(x, y, n):
    """Top-``n`` feature labels according to SelectKBest with f_regression.

    This re-declares the helper of the same name defined earlier in the
    notebook, with the same behaviour.

    x must expose ``.columns``; the return value is the subset of those
    labels selected by the fitted selector.
    """
    fitted = SelectKBest(score_func=f_regression, k=n).fit(x, y)
    return x.columns[fitted.get_support()]



In [38]:
def rfe_lin_reg(x, y, k):
    """Top-``k`` feature labels according to RFE over a LinearRegression.

    This re-declares the helper of the same name defined earlier in the
    notebook, with the same behaviour.

    x must expose ``.columns``; the return value is the subset of those
    labels kept by the fitted RFE.
    """
    fitted = RFE(LinearRegression(), n_features_to_select=k).fit(x, y)
    return x.columns[fitted.get_support()]

In [39]:
# Retained so the module-level name still exists for any cell that reads it;
# select_kbest below no longer appends to it.
features = []

def select_kbest(x, y, n):
    """Return the SelectKBest selections for every k from 1 up to ``n``.

    NOTE: this definition shadows the earlier single-result ``select_kbest``
    in this notebook and returns a list of selections rather than one.

    Fixes relative to the previous version of this cell:
      * results are collected in a fresh local list, so repeated calls no
        longer accumulate into shared module-level state;
      * k runs over 1..n instead of 0..n-1, so the empty k=0 selection is
        skipped and the requested k=n is actually produced;
      * the "too big" warning is only printed when ``n`` really exceeds the
        number of available columns, and k is capped at that number.

    Parameters
    ----------
    x : predictors; must expose ``.columns`` and ``.shape``
        (e.g. a pandas DataFrame).
    y : target values aligned with the rows of ``x``.
    n : largest number of features to select.

    Returns
    -------
    list
        One entry per k (ascending), each the subset of ``x.columns``
        selected by SelectKBest(f_regression, k=k). Empty when ``n`` < 1.
    """
    n_columns = x.shape[1]
    if n > n_columns:
        # SelectKBest cannot keep more features than exist; cap instead of failing.
        print('Number is too big')

    selections = []
    for k in range(1, min(n, n_columns) + 1):
        kbest = SelectKBest(f_regression, k=k)
        kbest.fit(x, y)
        selections.append(x.columns[kbest.get_support()])
    return selections
