In [1]:
import pandas as pd
import numpy as np
import math
import wrangle
from env import get_db_url
import seaborn as sns
from pydataset import data

#new for this lesson
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

In [2]:
df = sns.load_dataset('tips')

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# With the tips dataset:


- Create a column named price_per_person. This should be the total bill divided by the party size.
- Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
- Use select k best to select the top 2 features for predicting tip amount. What are they?
- Use recursive feature elimination to select the top 2 features for tip amount. What are they?
- Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

## a) Create a column named price_per_person. This should be total bill divided by the part size.

In [4]:
df.rename(columns={'size': 'party_size'}, inplace = True)


In [5]:
df['price_per_person'] = (df.total_bill/df.party_size).round()

In [6]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,party_size,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,8.0
1,10.34,1.66,Male,No,Sun,Dinner,3,3.0
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0
3,23.68,3.31,Male,No,Sun,Dinner,2,12.0
4,24.59,3.61,Female,No,Sun,Dinner,4,6.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   total_bill        244 non-null    float64 
 1   tip               244 non-null    float64 
 2   sex               244 non-null    category
 3   smoker            244 non-null    category
 4   day               244 non-null    category
 5   time              244 non-null    category
 6   party_size        244 non-null    int64   
 7   price_per_person  244 non-null    float64 
dtypes: category(4), float64(3), int64(1)
memory usage: 9.3 KB


## b) Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

- i think that total bill, day, and time will be the best indicators of tip amount

### split data

In [8]:
train, validate, test = wrangle.split_data(df)

In [9]:
train.shape

(136, 8)

In [10]:
df.shape

(244, 8)

## c) Use select k best to select the top 2 features for predicting tip amount. What are they?

In [11]:
# create dummies to turn strings into numerics

In [12]:
dummy_df = pd.get_dummies(train[['sex', 'smoker', 'day', 'time']],dummy_na = False, drop_first=False)

In [13]:
train = pd.concat([train, dummy_df], axis = 1)

In [14]:
train.drop(columns = ['party_size','sex', 'smoker', 'day', 'time'], inplace = True)

In [15]:
train.head()

Unnamed: 0,total_bill,tip,price_per_person,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
54,25.56,4.34,6.0,1,0,0,1,0,0,0,1,0,1
158,13.39,2.61,7.0,0,1,0,1,0,0,0,1,0,1
151,13.13,2.0,7.0,1,0,0,1,0,0,0,1,0,1
230,24.01,2.0,6.0,1,0,1,0,0,0,1,0,0,1
68,20.23,2.01,10.0,1,0,0,1,0,0,1,0,0,1


In [16]:
X_train, y_train = train[['total_bill', 
                      'price_per_person', 
                      'sex_Male', 
                      'sex_Female', 
                      'smoker_Yes', 
                      'smoker_No', 
                      'day_Thur', 
                      'day_Fri', 
                      'day_Sat', 
                      'day_Sun', 
                      'time_Lunch', 
                      'time_Dinner']], train.tip 

In [17]:
#make the thing
f_selector= SelectKBest(f_regression, k = 2)
#fit the thing
kbest = f_selector.fit(X_train, y_train)

kbest.get_support()

array([ True,  True, False, False, False, False, False, False, False,
       False, False, False])

In [18]:
#pre-renaming
kbest.scores_

kbest.pvalues_

array([3.86477551e-22, 3.49104293e-06, 6.76036108e-01, 6.76036108e-01,
       4.30799695e-01, 4.30799695e-01, 1.06802664e-01, 3.52782823e-01,
       4.75670572e-01, 1.64277304e-01, 5.32839785e-02, 5.32839785e-02])

In [19]:
kbest_support_mask = kbest.get_support()

In [20]:
kbest_feature = f_feature = X_train.loc[:,kbest_support_mask].columns.tolist()

print(kbest_feature)

['total_bill', 'price_per_person']


In [21]:
#post
kbest_results = pd.DataFrame(
    dict(p=kbest.pvalues_, f=kbest.scores_),
                             index = X_train.columns)

In [22]:
kbest_results

Unnamed: 0,p,f
total_bill,3.8647760000000003e-22,136.100652
price_per_person,3.491043e-06,23.444393
sex_Male,0.6760361,0.17539
sex_Female,0.6760361,0.17539
smoker_Yes,0.4307997,0.624436
smoker_No,0.4307997,0.624436
day_Thur,0.1068027,2.636209
day_Fri,0.3527828,0.869457
day_Sat,0.4756706,0.511648
day_Sun,0.1642773,1.955763


In [23]:
X_train.columns[kbest.get_support()]

Index(['total_bill', 'price_per_person'], dtype='object')

In [24]:
kbest_results#sort_values#(ascending = False)

Unnamed: 0,p,f
total_bill,3.8647760000000003e-22,136.100652
price_per_person,3.491043e-06,23.444393
sex_Male,0.6760361,0.17539
sex_Female,0.6760361,0.17539
smoker_Yes,0.4307997,0.624436
smoker_No,0.4307997,0.624436
day_Thur,0.1068027,2.636209
day_Fri,0.3527828,0.869457
day_Sat,0.4756706,0.511648
day_Sun,0.1642773,1.955763


In [25]:
kbest_mask=kbest.get_support()

## d) Use recursive feature elimination to select the top 2 features for tip amount. What are they?


In [26]:
model = LinearRegression()

In [27]:
# make thing
rfe = RFE(model, n_features_to_select=2)
# fit thing
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [28]:
rfe.ranking_

array([ 6,  8,  5,  7,  9, 10,  3, 11,  1,  1,  4,  2])

In [29]:
RFE_df =pd.DataFrame(
{
    'rfe_ranking': rfe.ranking_
}, index = X_train.columns)

In [30]:
RFE_df.rank(ascending=False)

Unnamed: 0,rfe_ranking
total_bill,6.0
price_per_person,4.0
sex_Male,7.0
sex_Female,5.0
smoker_Yes,3.0
smoker_No,2.0
day_Thur,9.0
day_Fri,1.0
day_Sat,11.5
day_Sun,11.5


In [31]:
rfe.get_support()

array([False, False, False, False, False, False, False, False,  True,
        True, False, False])

In [32]:
kbest_mask

array([ True,  True, False, False, False, False, False, False, False,
       False, False, False])

## ELECTIVE BONUS: SFS

In [33]:
model

LinearRegression()

In [34]:
sfs = SequentialFeatureSelector(model, n_features_to_select=2)

In [35]:
sfs.fit(X_train, y_train)

SequentialFeatureSelector(estimator=LinearRegression(), n_features_to_select=2)

In [36]:
SFS = pd.DataFrame(
sfs.transform(X_train),
index = X_train.index,
columns = X_train.columns[sfs.support_])

In [37]:
SFS.head(5)

Unnamed: 0,total_bill,smoker_Yes
54,25.56,0.0
158,13.39,0.0
151,13.13,0.0
230,24.01,1.0
68,20.23,0.0


## e) Why do you think select k best and recursive feature elimination might give different answers for the top features? 
- Maybe because of how small the dataset is?


## Does this change as you change the number of features you are selecting?
- Not yet. I'll have to chase some rabbits to answer the question of "how are the ranking orders related before I mess with additional features.

# 2. Write select_kbest function 

In [38]:
def select_k_best_function(df, target, k):
    ''' This function is my pride and joy. 
    Takes in a dataframe, a target variable, and the number of top attributes
    and SO LONG AS ALL COLUMNS ARE NUMERIC
    will return a dataframe '''
    
    
    
    
    # define X_train and y_train:
    y_train = df[target]
    X_train = df.drop(columns=target)
    
    #make the thing
    kbest= SelectKBest(f_regression, k = k)
    
    #fit the thing
    _ = kbest.fit(X_train, y_train)
    
    # makes it pretty
    kbest_results = pd.DataFrame(
    dict(p=kbest.pvalues_, f=kbest.scores_),
                             index = X_train.columns).sort_values('f').head(k)
    
    return kbest_results

In [39]:
swiss = data('swiss')

In [40]:
select_k_best_function(swiss, 'Fertility', 3)

Unnamed: 0,p,f
Agriculture,0.014917,6.408884
Infant.Mortality,0.003585,9.447708
Catholic,0.001029,12.325096


In [41]:
select_k_best_function(train, 'tip', 2)

Unnamed: 0,p,f
sex_Female,0.676036,0.17539
sex_Male,0.676036,0.17539


In [42]:
train.head()

Unnamed: 0,total_bill,tip,price_per_person,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
54,25.56,4.34,6.0,1,0,0,1,0,0,0,1,0,1
158,13.39,2.61,7.0,0,1,0,1,0,0,0,1,0,1
151,13.13,2.0,7.0,1,0,0,1,0,0,0,1,0,1
230,24.01,2.0,6.0,1,0,1,0,0,0,1,0,0,1
68,20.23,2.01,10.0,1,0,0,1,0,0,1,0,0,1


In [43]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 54 to 82
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        136 non-null    float64
 1   tip               136 non-null    float64
 2   price_per_person  136 non-null    float64
 3   sex_Male          136 non-null    uint8  
 4   sex_Female        136 non-null    uint8  
 5   smoker_Yes        136 non-null    uint8  
 6   smoker_No         136 non-null    uint8  
 7   day_Thur          136 non-null    uint8  
 8   day_Fri           136 non-null    uint8  
 9   day_Sat           136 non-null    uint8  
 10  day_Sun           136 non-null    uint8  
 11  time_Lunch        136 non-null    uint8  
 12  time_Dinner       136 non-null    uint8  
dtypes: float64(3), uint8(10)
memory usage: 5.6 KB


In [44]:
def k_best_2(df, target):
    ''' This function is my pride and joy. 
    Takes in a dataframe, a target variable, and the number of top attributes
    and SO LONG AS ALL COLUMNS ARE NUMERIC
    will return a dataframe '''
    
    
    
    
    # define X_train and y_train:
    y_train = df[target]
    X_train = df.drop(columns=target)
    
    #make the thing
    kbest= SelectKBest(f_regression, k = 2)
    
    #fit the thing
    _ = kbest.fit(X_train, y_train)
    
    # makes it pretty
    kbest_results = pd.DataFrame(
    dict(p=kbest.pvalues_, f=kbest.scores_),index = X_train.columns)#.sort_values('f').head(k)
    
    
    
    return kbest_results

In [45]:
k_best_2(train, 'tip')

Unnamed: 0,p,f
total_bill,3.8647760000000003e-22,136.100652
price_per_person,3.491043e-06,23.444393
sex_Male,0.6760361,0.17539
sex_Female,0.6760361,0.17539
smoker_Yes,0.4307997,0.624436
smoker_No,0.4307997,0.624436
day_Thur,0.1068027,2.636209
day_Fri,0.3527828,0.869457
day_Sat,0.4756706,0.511648
day_Sun,0.1642773,1.955763


In [46]:
frame=pd.DataFrame = list(X_train.loc[:,kbest_support_mask])

In [47]:
kbest_feature = f_feature = X_train.loc[:,kbest_support_mask].columns.tolist()

print(kbest_feature)

['total_bill', 'price_per_person']


frame_2 = pd.DataFrame=list(
    dict(p=kbest.pvalues_, f=kbest.scores_),
    index = X_train.columns, 
    X_train.loc[:,kbest_support_mask])

In [48]:
def test_func(df, target, k):
    # define X_train and y_train:
    y_train = df[target]
    X_train = df.drop(columns=target)
    
    #make the thing
    kbest= SelectKBest(f_regression, k = 2)
    
    #fit the thing
    _ = kbest.fit(X_train, y_train)
    
    # makes it pretty
    kbest_results = pd.DataFrame(
    dict(p=kbest.pvalues_, f=kbest.scores_),index = X_train.columns.sort_values('f').head(k))
    
    
    
    return kbest_results

In [49]:
#test_func(train, 'tip', 2)

# 3. RFE function

### Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [50]:
def rfe(pred, target, k=2):
    rf = RFE(LinearRegression(), n_features_to_select=k)
    rf.fit(pred, target)
    mask = rf.get_support()
    return pred.columns[mask]

In [51]:
rfe(X_train, y_train, 2)

Index(['day_Sat', 'day_Sun'], dtype='object')

# 4. Swiss data for fertility

### Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out)

In [52]:
swiss = data('swiss')

In [53]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [54]:
select_k_best_function(swiss, 'Fertility', 3)

TypeError: 'list' object is not callable

In [57]:
def select_k_best_function(df, target, k):
    ''' This function is my pride and joy. 
    Takes in a dataframe, a target variable, and the number of top attributes
    and SO LONG AS ALL COLUMNS ARE NUMERIC
    will return a dataframe '''
    
    
    
    
    # define X_train and y_train:
    y_train = df[target]
    X_train = df.drop(columns=target)
    
    #make the thing
    kbest= SelectKBest(f_regression, k = k)
    
    #fit the thing
    _ = kbest.fit(X_train, y_train)
    
    # makes it pretty
    kbest_results = pd.DataFrame(
    dict(p=kbest.pvalues_, f=kbest.scores_),
                             index = X_train.columns).sort_values('f').head(k)
    
    return kbest_results

- This function that I wrote for #2 works when called just above being used on swiss data, but somewhere in this notebook between here and there I must have reset X_train to a list because I cannot call the function without getting the above error

In [59]:
def new_select_kbest(X,y,k=2):
    kbest = SelectKBest(f_regression, k=k)
    kbest.fit(X, y)
    mask = kbest.get_support()
    return X.columns[mask]

In [60]:
X = swiss.drop(columns=['Fertility'])
y = swiss['Fertility']

In [61]:
new_select_kbest(X,y,3)

Index(['Examination', 'Education', 'Catholic'], dtype='object')

### The above features are different from the ones when I ran it beneatht the function I created in number two. Those features were, 'Agriculture', 'Infant_Mortality', and 'Catholic'

In [62]:
rfe(X,y,3)

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')

### I quit