In [1]:
import numpy as np
import pandas as pd
import mirz_wrangle as w
import seaborn as sns
from pydataset import data

from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

In [2]:
# Load the 'tips' example dataset through seaborn; every later cell builds on this frame.
df = sns.load_dataset('tips')

In [3]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [4]:
# Engineered feature: total_bill divided by size, computed row by row.
df['price_per_person'] = df['total_bill']/df['size']

In [5]:
# Overall mean of price_per_person, broadcast to every row as a constant column.
# Series.mean() replaces the manual sum()/len(df); unlike the manual form it
# does not count missing values in the denominator.
df['avg_price_per_person'] = df['price_per_person'].mean()

In [6]:
# One-hot encode the four categorical columns. No level is dropped, so each
# category of every variable gets its own indicator column.
df_dummy = pd.get_dummies(
    df[['sex', 'smoker', 'day', 'time']],
    dummy_na=False,
)

In [7]:
df_dummy

Unnamed: 0,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,0,1,0,1,0,0,0,1,0,1
1,1,0,0,1,0,0,0,1,0,1
2,1,0,0,1,0,0,0,1,0,1
3,1,0,0,1,0,0,0,1,0,1
4,0,1,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
239,1,0,0,1,0,0,1,0,0,1
240,0,1,1,0,0,0,1,0,0,1
241,1,0,1,0,0,0,1,0,0,1
242,1,0,0,1,0,0,1,0,0,1


In [8]:
# Attach the indicator columns to the main frame. Any indicator columns that
# are already present are dropped first, so re-running this cell does not
# append a second copy of them.
df = pd.concat(
    [df.drop(columns=df_dummy.columns, errors='ignore'), df_dummy],
    axis=1,
)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,avg_price_per_person,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,Female,No,Sun,Dinner,2,8.495,7.88823,0,1,0,1,0,0,0,1,0,1
1,10.34,1.66,Male,No,Sun,Dinner,3,3.446667,7.88823,1,0,0,1,0,0,0,1,0,1
2,21.01,3.5,Male,No,Sun,Dinner,3,7.003333,7.88823,1,0,0,1,0,0,0,1,0,1
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,7.88823,1,0,0,1,0,0,0,1,0,1
4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,7.88823,0,1,0,1,0,0,0,1,0,1


In [9]:
# Target is the tip amount; predictors are everything except the target,
# the raw categorical columns and the constant avg_price_per_person column.
y = df['tip']
x = df.drop(
    columns=['tip', 'sex', 'smoker', 'day', 'time', 'avg_price_per_person'],
)

# select_kbest

In [10]:
# Manual SelectKBest walk-through: score each column of x against y with
# f_regression and keep the two best.
# fit() returns the fitted selector, so chaining it avoids the throwaway
# `_ = ...` assignment, which would rebind the `_` name IPython uses for the
# previous cell's output.
kbest = SelectKBest(f_regression, k=2).fit(x, y)

In [11]:
#kbest.scores_

In [12]:
#kbest.pvalues_

In [13]:
# Per-feature p-values and F scores from the fitted selector, labelled by column name.
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=x.columns,
)

In [14]:
kbest_results

Unnamed: 0,p,f
total_bill,6.692471e-34,203.357723
size,4.300543e-16,76.175426
price_per_person,2.502102e-08,33.213257
sex_Male,0.1664562,1.926155
sex_Female,0.1664562,1.926155
smoker_Yes,0.9265932,0.008506
smoker_No,0.9265932,0.008506
day_Thur,0.135324,2.245302
day_Fri,0.38837,0.746727
day_Sat,0.9654161,0.001884


In [15]:
# Rebuild a labelled frame from the selector output: transform() returns a
# bare array, so the surviving column names and the row index are re-attached.
x_transformed = pd.DataFrame(
    kbest.transform(x),
    index=x.index,
    columns=x.columns[kbest.get_support()],
)

In [16]:
x_transformed.head(3)

Unnamed: 0,total_bill,size
0,16.99,2.0
1,10.34,3.0
2,21.01,3.0


In [17]:
# combine manual kbest process into function
def selectkbest(pred_vars, target_var, k_features):
    """Return the columns of ``pred_vars`` kept by SelectKBest with f_regression.

    Parameters
    ----------
    pred_vars : pandas.DataFrame
        Candidate predictors; its ``columns`` and ``index`` are used to label
        the result.
    target_var : array-like
        Target values, passed to ``SelectKBest.fit`` alongside ``pred_vars``.
    k_features : int
        Forwarded as ``k`` to ``SelectKBest`` (number of features to keep).

    Returns
    -------
    pandas.DataFrame
        The selector's transformed output, labelled with the selected column
        names and the original row index.
    """
    selector = SelectKBest(f_regression, k=k_features)
    selector.fit(pred_vars, target_var)
    # transform() yields an unlabelled array, so column names and index are
    # re-attached from the input frame.
    return pd.DataFrame(
        selector.transform(pred_vars),
        columns=pred_vars.columns[selector.get_support()],
        index=pred_vars.index,
    )

In [18]:
# working as intended
selectkbest(x,y,2)

Unnamed: 0,total_bill,size
0,16.99,2.0
1,10.34,3.0
2,21.01,3.0
3,23.68,2.0
4,24.59,4.0
...,...,...
239,29.03,3.0
240,27.18,2.0
241,22.67,2.0
242,17.82,2.0


# rfe

In [19]:
# Estimator handed to RFE in the next cell.
model = LinearRegression()

In [20]:
# Recursive feature elimination down to two features; fit() returns the
# fitted selector, so construction and fitting are chained.
rfe = RFE(model, n_features_to_select=2).fit(x, y)
rfe.ranking_

array([ 3,  5,  4, 12, 10,  2,  9,  7,  1,  6,  1,  8, 11])

In [21]:
# Pair each predictor name with its RFE rank (rank 1 marks a selected feature).
pd.DataFrame({'rfe_ranking': rfe.ranking_}, index=x.columns)

Unnamed: 0,rfe_ranking
total_bill,3
size,5
price_per_person,4
sex_Male,12
sex_Female,10
smoker_Yes,2
smoker_No,9
day_Thur,7
day_Fri,1
day_Sat,6


In [22]:
rfe.get_support()

array([False, False, False, False, False, False, False, False,  True,
       False,  True, False, False])

In [23]:
# Labelled frame of the columns RFE kept: transform() output plus the
# matching column names and the original row index.
x_transformed = pd.DataFrame(
    rfe.transform(x),
    columns=x.columns[rfe.support_],
    index=x.index,
)

In [24]:
x_transformed.head()

Unnamed: 0,day_Fri,day_Sun
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


In [25]:
# combine manual rfe process into function
def rfe(pred_vars, target_var, n_features):
    """Return the columns of ``pred_vars`` kept by RFE with LinearRegression.

    Parameters
    ----------
    pred_vars : pandas.DataFrame
        Candidate predictors; its ``columns`` and ``index`` label the result.
    target_var : array-like
        Target values, passed to ``RFE.fit`` alongside ``pred_vars``.
    n_features : int
        Forwarded as ``n_features_to_select`` to ``RFE``.

    Returns
    -------
    pandas.DataFrame
        The selector's transformed output with the surviving column names and
        the original row index.
    """
    # The local is deliberately not called `rfe`, so it does not shadow this
    # function's own name.
    selector = RFE(LinearRegression(), n_features_to_select=n_features)
    selector.fit(pred_vars, target_var)

    kept_columns = pred_vars.columns[selector.support_]
    return pd.DataFrame(
        selector.transform(pred_vars),
        columns=kept_columns,
        index=pred_vars.index,
    )
    

In [26]:
# working as intended
rfe(x,y,2)

Unnamed: 0,day_Fri,day_Sun
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
...,...,...
239,0.0,0.0
240,0.0,0.0
241,0.0,0.0
242,0.0,0.0


In [27]:
# Load the 'swiss' dataset via pydataset's data() helper and preview the first rows.
df2 = data('swiss')
df2.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [29]:
# Fertility is the target; the five remaining swiss columns are the predictors.
target = df2['Fertility']
predictors = df2[
    ['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']
]

In [None]:
# Run the selector on the swiss predictors/target defined in the previous cell;
# the earlier call on the tips data (x, y) is already shown above.
selectkbest(predictors, target, 2)