In [1]:
import numpy as np
import pandas as pd
import mirz_wrangle as w
import seaborn as sns
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load seaborn's bundled "tips" example dataset.
df = sns.load_dataset("tips")

In [3]:
# Display the loaded frame (the notebook truncates the middle rows).
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [4]:
# Bill amount divided by party size, row by row.
df['price_per_person'] = df['total_bill'].div(df['size'])

In [5]:
# Sum of price_per_person divided by the row count; the same scalar is
# broadcast to every row, so this column is constant.
df['avg_price_per_person'] = df['price_per_person'].sum() / df.shape[0]

In [7]:
# Experiment: min-max scale price_per_person to see whether RFE ranks it differently.
# NOTE(review): the scaler is fitted on the whole frame, before the train/validate/test
# split further down, so held-out rows contribute to the fitted range — consider
# fitting on the training split only.
scaler = MinMaxScaler()
df['ppp_scaled'] = scaler.fit_transform(df[['price_per_person']])

In [8]:
# One-hot encode the categorical columns. Every level is kept (no drop_first),
# so two-level variables yield complementary pairs such as sex_Male / sex_Female.
df_dummy = pd.get_dummies(
    df[['sex', 'smoker', 'day', 'time']],
    dummy_na=False,
)

In [9]:
# Inspect the encoded indicator columns.
df_dummy

Unnamed: 0,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,0,1,0,1,0,0,0,1,0,1
1,1,0,0,1,0,0,0,1,0,1
2,1,0,0,1,0,0,0,1,0,1
3,1,0,0,1,0,0,0,1,0,1
4,0,1,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
239,1,0,0,1,0,0,1,0,0,1
240,0,1,1,0,0,0,1,0,0,1
241,1,0,1,0,0,0,1,0,0,1
242,1,0,0,1,0,0,1,0,0,1


In [10]:
# Append the one-hot columns to the main frame. Any dummy columns already present
# are dropped first, so re-running this cell does not create duplicate columns.
df = pd.concat(
    [df.drop(columns=df_dummy.columns, errors='ignore'), df_dummy],
    axis=1,
)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,avg_price_per_person,ppp_scaled,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,Female,No,Sun,Dinner,2,8.495,7.88823,0.322989,0,1,0,1,0,0,0,1,0,1
1,10.34,1.66,Male,No,Sun,Dinner,3,3.446667,7.88823,0.032854,1,0,0,1,0,0,0,1,0,1
2,21.01,3.5,Male,No,Sun,Dinner,3,7.003333,7.88823,0.237261,1,0,0,1,0,0,0,1,0,1
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,7.88823,0.51523,1,0,0,1,0,0,0,1,0,1
4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,7.88823,0.188075,0,1,0,1,0,0,0,1,0,1


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
#x = train.drop(columns=['tip','sex','smoker','day','time', 'avg_price_per_person'])
#y = train['tip']

In [13]:
# Hold out 20% of the rows as the test set; the seed is fixed so the split is repeatable.
train_validate, test = train_test_split(
    df,
    test_size=0.2,
    random_state=777,
)

In [14]:
# Split the remaining rows 70/30 into train and validate, using the same fixed seed.
train, validate = train_test_split(
    train_validate,
    test_size=0.3,
    random_state=777,
)

In [15]:
# Target: the tip amount on the training split.
y = train['tip']
# Predictors: everything except the target, the raw categorical columns
# (their one-hot versions are kept) and the constant avg_price_per_person column.
x = train.drop(
    columns=[
        'tip',
        'sex',
        'smoker',
        'day',
        'time',
        'avg_price_per_person',
    ]
)

In [16]:
# Preview the predictor matrix.
x.head()

Unnamed: 0,total_bill,size,price_per_person,ppp_scaled,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
219,30.14,4,7.535,0.267816,0,1,1,0,0,0,1,0,0,1
197,43.11,4,10.7775,0.454167,0,1,1,0,1,0,0,0,1,0
126,8.52,2,4.26,0.079598,1,0,0,1,1,0,0,0,1,0
52,34.81,4,8.7025,0.334914,0,1,0,1,0,0,0,1,0,1
148,9.78,2,4.89,0.115805,1,0,0,1,1,0,0,0,1,0


>I predicted that total_bill and time will be the top prediction variables

# select_kbest

In [17]:
# Manual SelectKBest walk-through: keep the two predictors with the best
# univariate f_regression scores against the target.
kbest = SelectKBest(score_func=f_regression, k=2)
_ = kbest.fit(x, y)

In [18]:
#kbest.scores_

In [19]:
#kbest.pvalues_

In [20]:
# Tabulate the p-value and F-score of every candidate predictor.
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=x.columns,
)

In [21]:
# Show the p-value and F-score for each candidate predictor.
kbest_results

Unnamed: 0,p,f
total_bill,7.486440999999999e-19,107.464922
size,3.839601e-12,58.266185
price_per_person,0.000613763,12.310364
ppp_scaled,0.000613763,12.310364
sex_Male,0.8375949,0.042177
sex_Female,0.8375949,0.042177
smoker_Yes,0.6516957,0.204687
smoker_No,0.6516957,0.204687
day_Thur,0.3579151,0.851044
day_Fri,0.1285736,2.338434


In [22]:
# Rebuild a labelled frame from the selector output: same row index as x,
# column names taken from the support mask.
x_transformed = pd.DataFrame(
    data=kbest.transform(x),
    index=x.index,
    columns=x.columns[kbest.get_support()],
)

In [23]:
# First three rows of the k-best-selected features.
x_transformed.head(3)

Unnamed: 0,total_bill,size
219,30.14,4.0
197,43.11,4.0
126,8.52,2.0


In [24]:
# combine manual kbest process into function
def selectkbest(pred_vars, target_var, k_features):
    """Return the ``k_features`` predictors with the best univariate F-scores.

    Fits ``SelectKBest`` with ``f_regression`` on the supplied data and returns
    the retained columns as a labelled DataFrame.

    Parameters
    ----------
    pred_vars : pandas.DataFrame
        Candidate predictor columns.
    target_var : array-like
        Target values, one per row of ``pred_vars``.
    k_features : int
        Number of top-scoring features to keep.

    Returns
    -------
    pandas.DataFrame
        The selected columns, carrying ``pred_vars``'s index and the original
        column names. Values come from the selector's ``transform`` output.
    """
    kbest = SelectKBest(f_regression, k=k_features)
    kbest.fit(pred_vars, target_var)
    return pd.DataFrame(
        kbest.transform(pred_vars),
        columns=pred_vars.columns[kbest.get_support()],
        index=pred_vars.index,
    )

In [25]:
# Sanity check: the helper picks the same two columns as the manual run (total_bill, size).
selectkbest(pred_vars=x, target_var=y, k_features=2)

Unnamed: 0,total_bill,size
219,30.14,4.0
197,43.11,4.0
126,8.52,2.0
52,34.81,4.0
148,9.78,2.0
62,11.02,2.0
199,13.51,2.0
3,23.68,2.0
111,7.25,1.0
225,16.27,2.0


# rfe

In [26]:
# Base estimator handed to RFE in the next cell.
model = LinearRegression()

In [27]:
# Recursive feature elimination down to two features, then show each column's rank
# (rank 1 marks a selected feature).
# NOTE(review): the name `rfe` is rebound to a function further down the notebook,
# so this cell and the ones that read `rfe` must run before that definition.
rfe = RFE(estimator=model, n_features_to_select=2).fit(x, y)
rfe.ranking_

array([ 7,  1, 12, 13,  9, 11,  3,  8,  4, 10,  1,  2,  6,  5])

In [28]:
# Label each RFE rank with its column name.
pd.DataFrame({'rfe_ranking': rfe.ranking_}, index=x.columns)

Unnamed: 0,rfe_ranking
total_bill,7
size,1
price_per_person,12
ppp_scaled,13
sex_Male,9
sex_Female,11
smoker_Yes,3
smoker_No,8
day_Thur,4
day_Fri,10


In [29]:
# Boolean mask over x's columns: True marks the features RFE kept.
rfe.get_support()

array([False,  True, False, False, False, False, False, False, False,
       False,  True, False, False, False])

In [30]:
# Labelled frame of the RFE-selected features (replaces the k-best x_transformed above).
x_transformed = pd.DataFrame(
    data=rfe.transform(x),
    columns=x.columns[rfe.get_support()],
    index=x.index,
)

In [31]:
# Preview the RFE-selected features.
x_transformed.head()

Unnamed: 0,size,day_Sat
219,4.0,1.0
197,4.0,0.0
126,2.0,0.0
52,4.0,0.0
148,2.0,0.0


In [32]:
# combine manual rfe process into function
def rfe(pred_vars, target_var, n_features):
    """Select ``n_features`` predictors by recursive feature elimination.

    Wraps a fresh ``LinearRegression`` in ``RFE``, fits it on the supplied data
    and returns the retained columns as a labelled DataFrame.

    Note: this function shares its name with the fitted RFE object created in
    an earlier cell; defining it replaces that object in the notebook namespace.

    Parameters
    ----------
    pred_vars : pandas.DataFrame
        Candidate predictor columns.
    target_var : array-like
        Target values, one per row of ``pred_vars``.
    n_features : int
        Number of features to keep.

    Returns
    -------
    pandas.DataFrame
        The selected columns, carrying ``pred_vars``'s index and the original
        column names. Values come from the selector's ``transform`` output.
    """
    # Named `selector` so the local does not shadow this function's own name.
    selector = RFE(LinearRegression(), n_features_to_select=n_features)
    selector.fit(pred_vars, target_var)
    return pd.DataFrame(
        selector.transform(pred_vars),
        index=pred_vars.index,
        columns=pred_vars.columns[selector.support_],
    )
    

In [33]:
# Sanity check: the helper picks the same two columns as the manual run (size, day_Sat).
rfe(pred_vars=x, target_var=y, n_features=2)

Unnamed: 0,size,day_Sat
219,4.0,1.0
197,4.0,0.0
126,2.0,0.0
52,4.0,0.0
148,2.0,0.0
62,2.0,1.0
199,2.0,0.0
3,2.0,0.0
111,1.0,1.0
225,2.0,0.0


# selectkbest() and rfe() on swiss

In [34]:
# Load the "swiss" dataset from pydataset and list its columns and dtypes.
df2 = data("swiss")
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [35]:
# Fertility is the target; the other five columns are the candidate predictors.
target = df2['Fertility']
predictors = df2[
    [
        'Agriculture',
        'Examination',
        'Education',
        'Catholic',
        'Infant.Mortality',
    ]
]

In [36]:
# Top two swiss predictors by univariate F-score.
selectkbest(pred_vars=predictors, target_var=target, k_features=2).head()

Unnamed: 0,Examination,Education
Courtelary,15.0,12.0
Delemont,6.0,9.0
Franches-Mnt,5.0,5.0
Moutier,12.0,7.0
Neuveville,17.0,15.0


In [37]:
# Top two swiss predictors by recursive feature elimination.
rfe(pred_vars=predictors, target_var=target, n_features=2).head()

Unnamed: 0,Education,Infant.Mortality
Courtelary,12.0,22.2
Delemont,9.0,22.2
Franches-Mnt,5.0,20.2
Moutier,7.0,20.3
Neuveville,15.0,20.6
