In [1]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt

from sklearn.linear_model import LinearRegression
# not needed for this lesson : 
  # from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE

from regprepare import train_val_test

# dataset and functions
from pydataset import data

# turn off pink warning boxes
import warnings
warnings.filterwarnings("ignore")


In [2]:
# load the `tips` dataset from pydataset into a DataFrame

tips = data('tips')

In [3]:
# column dtypes and non-null counts
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


In [4]:
# summary statistics of the numeric columns, transposed so each column is a row
tips.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.785943,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9511,1.0,2.0,2.0,3.0,6.0


In [5]:
# peek at the first three rows
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3


### 1.a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [6]:
# preview only: price per person = total bill / party size (nothing is stored yet)
tips['total_bill'] / tips['size']

1       8.495000
2       3.446667
3       7.003333
4      11.840000
5       6.147500
         ...    
240     9.676667
241    13.590000
242    11.335000
243     8.910000
244     9.390000
Length: 244, dtype: float64

In [7]:
# store price per person (total bill / party size) as a new column on tips

tips['price_per_person'] = tips['total_bill'] / tips['size']

In [8]:
# rename `size` to `party_size`
# (presumably to avoid confusion with the DataFrame.size attribute)
tips = tips.rename(columns = {'size' : 'party_size'})

In [9]:
# verify that the new (price_per_person) and renamed (party_size) columns exist

tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,party_size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


In [10]:
# split into train, validate, and test sets with the project helper from regprepare
# NOTE(review): the helper's split ratios and random seed are not visible in this
# notebook — confirm the split is seeded so the results below are reproducible

train, val, test = train_val_test(tips)
train.shape, val.shape, test.shape


((146, 8), (49, 8), (49, 8))

### 1.b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? 

Tip amount = target variable, y-value.  

- time
- total_bill
- party_size

### 1.c. Use select k best to select the top 2 features for predicting tip amount. What are they?

Total_bill & party_size.

In [11]:
# min-max scale the numeric features of the training split, writing the scaled
# values back into the same columns; the scaler is fit on train only

numeric_cols = ['total_bill', 'price_per_person', 'party_size']

mms = MinMaxScaler()
train[numeric_cols] = mms.fit_transform(train[numeric_cols])

train.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,party_size,price_per_person
195,0.240346,4.0,Male,Yes,Thur,Lunch,0.2,0.311207
77,0.270084,3.08,Male,Yes,Sat,Dinner,0.2,0.349713
42,0.259876,2.54,Male,No,Sun,Dinner,0.2,0.336494


In [12]:
# split the (already scaled) training data into
#   X_train_scaled : the three continuous candidate features
#   y_train        : the target, tip

X_train_scaled = train[['total_bill', 'price_per_person', 'party_size']]
y_train = train['tip']

In [13]:
# build a SelectKBest selector that scores each feature with the f_regression
# test and keeps the top k = 2 of our 3 continuous features

f_selector = SelectKBest(f_regression, k = 2)

In [14]:
# fit the selector : .fit(features, target variable)

f_selector.fit(X_train_scaled, y_train)


In [15]:
# get_support() returns a boolean mask over the input columns :
# True = one of the k selected features, False = not selected

f_selector.get_support()

array([ True, False,  True])

In [16]:
# keep the boolean mask in a variable so it can be applied to the columns

f_selector_mask = f_selector.get_support()

# names of the selected columns only

X_train_scaled.columns[f_selector_mask]

Index(['total_bill', 'party_size'], dtype='object')

In [17]:
# .iloc[all rows, boolean mask] applies the mask column-wise, keeping only the selected columns

X_train_scaled.iloc[:, f_selector_mask].head(3)

Unnamed: 0,total_bill,party_size
195,0.240346,0.2
77,0.270084,0.2
42,0.259876,0.2


### 1.d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

Total_bill & price_per_person.

In [18]:
# drop the target ('tip') so that X_train holds predictors only

X_train = train.drop(columns = 'tip')
X_train.head(1)

Unnamed: 0,total_bill,sex,smoker,day,time,party_size,price_per_person
195,0.240346,Male,Yes,Thur,Lunch,0.2,0.311207


In [19]:
# one-hot encode the listed columns and reassign the result to X_train
# NOTE(review): party_size is numeric and was min-max scaled earlier, so encoding
# it here produces one dummy per scaled value (e.g. party_size_0.2) — confirm this
# is intended rather than keeping party_size as a single numeric feature

X_train = pd.get_dummies(X_train, columns = ['sex', 'smoker', 'day', 'time', 'party_size'])

In [20]:
X_train.head(1)

# every feature is now numeric and lies between 0 and 1, so it can be fed to a regression algorithm

Unnamed: 0,total_bill,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,party_size_0.0,party_size_0.2,party_size_0.4000000000000001,party_size_0.6000000000000001,party_size_0.8,party_size_1.0000000000000002
195,0.240346,0.311207,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,0


In [21]:
# RFE wraps a machine learning model (here, linear regression) and is asked
#   to keep the 2 features that model finds most useful
# NOTE: a function also named `rfe` is defined later in this notebook; once that
#   cell runs, the name `rfe` refers to the function, not this fitted object

lm = LinearRegression()

rfe = RFE(lm, n_features_to_select = 2)

In [22]:
# fit RFE on the encoded predictors and the target

rfe.fit(X_train, y_train)

In [23]:
# RFE rank of every column in X_train (rank 1 = selected), plus the matching column names

ranking = rfe.ranking_

features = X_train.columns.tolist()

In [24]:
# pair each rank with its column name in a DataFrame
# (dict keys become the column names, dict values fill the columns),
# then show the features ordered from best rank to worst

feature_rankings = pd.DataFrame({'ranking' : ranking, 'feature' : features})

feature_rankings.sort_values('ranking')

Unnamed: 0,ranking,feature
0,1,total_bill
1,1,price_per_person
16,2,party_size_0.8
12,3,party_size_0.0
13,4,party_size_0.2
17,5,party_size_1.0000000000000002
2,6,sex_Female
14,7,party_size_0.4000000000000001
11,8,time_Lunch
4,9,smoker_No


### 1.e. Why do you think SelectKBest and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

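- In this notebook SelectKBest was only given the three continuous features, whereas RFE was given every feature (with the categorical ones dummy-encoded). The two methods also work differently: SelectKBest scores each feature on its own with a univariate F-test, while RFE repeatedly fits a model on all remaining features and drops the weakest one, so correlated features such as total_bill and price_per_person can be ranked differently.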

- It does change: the more features we ask each method to select, the more variables can be taken into account, so the selected sets shift as the number of features changes.

### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [25]:
# # SelectKBest function

# # df = dataframe,
# # scaled_cols = columns to scale, entered as a list, i.e, ['variable_name'],
# # target_var = target variable, entered as a string, i.e, 'variable_name'
# # kk = the k number of features to select / return

# def select_best(df, scaled_cols, target_var, kk):
 
#     '''
#     This function takes in the predictors (X), the target (y) and 
#     the number of features to select (k) and returns the names of
#     the top k selected features based on the SelectKBest class. 
#     '''
    
#     # scaling the data in specific columns, reassigning the scaled numbers to the column names
#     mms = MinMaxScaler()
#     df[['total_bill', 'price_per_person', 'party_size']] = mms.fit_transform(df[['total_bill', 
#                                                                          'price_per_person', 'party_size']])

#     # feature array / df with continuous features (X, ...) and target (y, tip)
#     X_train_scaled = df[['total_bill', 'price_per_person', 'party_size']]
#     y_train = df['tip']

#     # create an instance of the SelectKBest object
#     df = SelectKBest(f_regression, k = kk)

#     # fit the object to data : .fit(features, target variable)
#     df.fit(X_train_scaled, y_train)
    
#     # masking the values by assigning a variable to the T/F in order to apply it to the columns
#     df = df.get_support()

#     # see only the column names relevant to our analysis
#     df = X_train_scaled.columns[df]
    
#     return df

In [26]:
# SelectKBest function

# df = dataframe,
# scaled_cols = columns to scale, entered as a list, i.e, ['variable_name'],
# target_var = target variable, entered as a string, i.e, 'variable_name'
# kk = the k number of features to select / return

def select_best(df, scaled_cols, target_var, kk):

    '''
    Return the names of the top-k features chosen by SelectKBest.

    The columns in ``scaled_cols`` are min-max scaled and scored against
    ``target_var`` with the ``f_regression`` test. Scaling is done in a
    separate frame, so the caller's ``df`` is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both the candidate features and the target.
    scaled_cols : list of str
        Candidate feature columns; these are scaled and handed to the selector.
    target_var : str
        Name of the target column.
    kk : int
        Number of features to keep (SelectKBest's ``k``).

    Returns
    -------
    pandas.Index
        Names of the ``kk`` selected columns, in the order given in ``scaled_cols``.
    '''

    # scale into a fresh frame instead of assigning back into df, so the
    # caller's data is not silently overwritten with scaled values
    mms = MinMaxScaler()
    X_train_scaled = pd.DataFrame(mms.fit_transform(df[scaled_cols]),
                                  columns = scaled_cols,
                                  index = df.index)
    y_train = df[target_var]

    # fit the selector : .fit(features, target variable)
    f_selector = SelectKBest(f_regression, k = kk)
    f_selector.fit(X_train_scaled, y_train)

    # boolean mask over scaled_cols : True = selected
    f_selector_mask = f_selector.get_support()

    return X_train_scaled.columns[f_selector_mask]

In [27]:
# feature selection belongs on the training split (as in the manual run above);
# a copy is passed so the call cannot overwrite `train` with re-scaled values
select_best(train.copy(), ['total_bill', 'price_per_person', 'party_size'], 'tip', 2)

Index(['total_bill', 'party_size'], dtype='object')

### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [42]:
# RFE function

# df = dataframe,
# target_var = target variable (y-value), the column to drop, entered as a string, i.e, 'variable_name'
# fts = n_features_to_select
# dummy_columns = columns of which to make dummies, entered as a list, i.e ['variable_name']

def rfe(df, fts, target_var, dummy_columns):

    '''Rank features with recursive feature elimination (RFE).

    The target column is split off from ``df``, the columns listed in
    ``dummy_columns`` are one-hot encoded, and an RFE selector wrapped around
    a LinearRegression is fit on the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both the predictors and the target.
    fts : int
        Number of features RFE should select (``n_features_to_select``).
    target_var : str
        Name of the target column; it is removed from the predictors.
    dummy_columns : list of str
        Columns to one-hot encode before fitting.

    Returns
    -------
    pandas.DataFrame
        One row per encoded feature with columns ``ranking`` and ``feature``,
        sorted by ``ranking``; rank 1 marks the selected features.
    '''

    # take the target from df itself; reading a notebook-level y_train here
    # would tie the result to whatever data that global was built from
    y_train = df[target_var]

    # predictors : everything except the target, with the categoricals encoded
    X_train = df.drop(columns = target_var)
    X_train = pd.get_dummies(X_train, columns = dummy_columns)

    # named `selector` so the local does not shadow this function's own name
    selector = RFE(LinearRegression(), n_features_to_select = fts)
    selector.fit(X_train, y_train)

    # pair each rank with its column name
    feature_rankings = pd.DataFrame({'ranking' : selector.ranking_,
                                     'feature' : X_train.columns.tolist()})

    return feature_rankings.sort_values('ranking')

In [43]:
# signature : rfe(df, fts, target_var, dummy_columns)
# rank the features of the tips training split, asking RFE to select 4

rfe(train, 4, 'tip', 
    ['sex', 'smoker', 'day', 'time', 'party_size']).head()

Unnamed: 0,ranking,feature
0,1,total_bill
1,1,price_per_person
12,1,party_size_0.0
16,1,party_size_0.8
13,2,party_size_0.2


### 4. Load the ```swiss``` dataset and use all the other features to predict Fertility. Find the top 3 features using both SelectKBest and RFE. (Use the functions you just built to help you out.)

In [30]:
# load the `swiss` dataset from pydataset and preview it
suisse = data('swiss')
suisse.head(3)

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2


In [31]:
# column dtypes and non-null counts
suisse.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [32]:
# replace the dot in the column name with an underscore
suisse = suisse.rename(columns = {'Infant.Mortality' : 'Infant_Mortality'})

In [33]:
# splitting in to train, validate, test

train_s, val_s, test_s = train_val_test(suisse)
train_s.shape, val_s.shape, test_s.shape

((28, 6), (9, 6), (10, 6))

In [34]:
# peek at the swiss training split
train_s.head(3)

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant_Mortality
Sarine,82.9,45.2,16,13,91.38,24.4
Grandson,71.7,34.0,17,8,3.3,20.0
Yverdon,65.4,49.5,15,8,6.1,22.5


In [35]:
# SelectKBest on the swiss training split : top 3 features for Fertility

# signature : select_best(df, scaled_cols, target_var, kk)
# df = dataframe,
# scaled_cols = columns to scale, entered as a list, i.e, ['variable_name'],
# target_var = target variable, entered as a string, i.e, 'variable_name'
# kk = the k number of features to select / return

# NOTE(review): check whether select_best modifies train_s in place — the RFE
# cell further down reuses train_s, so its input depends on what happens here

select_best(train_s, ['Agriculture', 'Catholic', 'Examination', 'Education','Infant_Mortality'], 
            'Fertility', 3)

Index(['Catholic', 'Examination', 'Education'], dtype='object')

In [47]:
# RFE function

# df = dataframe,
# target_var = target variable (y-value), the column to drop, entered as a string, i.e, 'variable_name'
# fts = n_features_to_select
# dummy_columns = columns of which to make dummies, entered as a list, i.e ['variable_name']

def rfe(df, fts, target_var, dummy_columns):

    '''Rank the predictors in df with recursive feature elimination.

    df            : dataframe holding the predictors and the target
    fts           : number of features for RFE to select
    target_var    : name of the target column, entered as a string
    dummy_columns : list of columns to one-hot encode before fitting

    Returns a dataframe with one row per (encoded) feature and the columns
    'ranking' and 'feature', sorted by ranking; rank 1 marks the selected
    features.
    '''

    # target on one side, encoded predictors on the other
    target = df[target_var]
    predictors = pd.get_dummies(df.drop(columns = target_var),
                                columns = dummy_columns)

    # RFE around a linear regression, keeping `fts` features
    eliminator = RFE(LinearRegression(), n_features_to_select = fts)
    eliminator.fit(predictors, target)

    # line each rank up with its column name, best ranks first
    ranked = pd.DataFrame({'ranking' : eliminator.ranking_,
                           'feature' : predictors.columns.tolist()})

    return ranked.sort_values('ranking')

In [56]:
# RFE on the swiss training split : top 3 features for Fertility

# signature : rfe(df, fts, target_var, dummy_columns)
# df = dataframe,
# fts = n_features_to_select
# target_var = target variable (y-value), entered as a string
# dummy_columns = columns of which to make dummies, entered as a list, i.e ['variable_name']
#                 (empty here : every swiss predictor is numeric)

# RFE with linear regression ranks features by coefficient size, which depends
# on feature scale, so the predictors are scaled explicitly on a copy instead of
# relying on an earlier cell having changed train_s as a side effect
swiss_predictors = train_s.columns.drop('Fertility').tolist()
train_s_scaled = train_s.copy()
train_s_scaled[swiss_predictors] = MinMaxScaler().fit_transform(train_s_scaled[swiss_predictors])

rfe(train_s_scaled, 3, 'Fertility', []).head()

Unnamed: 0,ranking,feature
0,1,Agriculture
2,1,Education
4,1,Infant_Mortality
3,2,Catholic
1,3,Examination
