In [104]:
import pandas as pd
import numpy as np
import pydataset
import evaluate
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the 'tips' dataset through pydataset; later cells add derived columns to it.
tips = pydataset.data('tips')

In [105]:
# Two derived columns, each rounded to 2 decimal places:
#   tip_percentage   = tip / total_bill
#   price_per_person = total_bill / size
tips['tip_percentage'] = tips['tip'].div(tips['total_bill']).round(2)
tips['price_per_person'] = tips['total_bill'].div(tips['size']).round(2)
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.06,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,0.16,3.45
3,21.01,3.5,Male,No,Sun,Dinner,3,0.17,7.0
4,23.68,3.31,Male,No,Sun,Dinner,2,0.14,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.15,6.15


#### Which features do you think would be most important for predicting the tip amount?
- total_bill and tip_percentage

In [106]:
# Data prep: the feature matrix is every non-object column except the target `tip`.
# NOTE(review): `tip_percentage` is computed from `tip` itself in an earlier
# cell, so keeping it in X gives the selectors information derived from the
# target -- confirm this is intended.
X = tips.loc[:, tips.dtypes != object].drop(columns='tip')

# Target: the tip amount.
y = tips['tip']

In [107]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [108]:
# Score every feature against the target with f_regression and keep the top 2.
kbest = SelectKBest(score_func=f_regression, k=2).fit(X_train_scaled, y_train)

# The selector was fit on the scaler output, so the chosen column labels are
# recovered by applying its support mask to X_train's columns.
kbest_features = X_train.columns[kbest.get_support()]

In [109]:
# Show the fitted selector's repr as the cell output.
kbest

SelectKBest(k=2, score_func=<function f_regression at 0x7ff2bf601280>)

#### What are the 2 best features?

In [97]:
features = list(X_train.columns)

# One line per column position with the score SelectKBest computed for it.
for position, score in enumerate(kbest.scores_):
    print('Feature %d: %f' % (position, score))

# Cell output: the labels of the selected columns.
f'The top {len(kbest_features)} features from kbest are: {kbest_features}'

Feature 0: 172.913650
Feature 1: 65.274770
Feature 2: 29.764992
Feature 3: 26.729285


"The top 2 features from kbest are: Index(['total_bill', 'size'], dtype='object')"

### Recursive Feature Elimination

In [98]:
# Repeats the split and scaling from the SelectKBest section with identical
# arguments and seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [99]:
# RFE around a LinearRegression, narrowing the feature set down to 2.
rfe = RFE(estimator=LinearRegression(), n_features_to_select=2).fit(
    X_train_scaled, y_train
)

# Map the support mask back onto X_train's column labels.
rfe_features = X_train.columns[rfe.get_support()]

#### What are the 2 best features?

In [100]:
features = X_train.columns.tolist()

# Cell output: the labels of the columns RFE kept.
f'The top {len(rfe_features)} features from RFE are: {rfe_features}'

"The top 2 features from RFE are: Index(['total_bill', 'tip_percentage'], dtype='object')"

In [103]:
# Side-by-side view: the two selectors picked different feature pairs.
print(
    f'The top {len(kbest_features)} features from kbest are: {kbest_features}',
    f'The top {len(rfe_features)} features from RFE are: {rfe_features}',
    sep='\n',
)

The top 2 features from kbest are: Index(['total_bill', 'size'], dtype='object')
The top 2 features from RFE are: Index(['total_bill', 'tip_percentage'], dtype='object')


In [110]:
def kbest_features(x_scaled, x, y, k):
    '''
    Pick the k best-scoring features using SelectKBest with f_regression.

    x_scaled : feature values the selector is fit on (callers in this
               notebook pass the scaled training features), target excluded
    x : unscaled dataframe of the same features; only its column labels are
        read, and they are matched by position against the selector's mask,
        so the column order has to agree with x_scaled
    y : target values
    k : number of features to select

    Returns the entries of x.columns that the selector kept.
    '''
    # NOTE: this def rebinds the name `kbest_features`, which an earlier cell
    # assigned to an Index of selected columns.

    # build the selector and fit it in one step (fit hands back the selector)
    selector = SelectKBest(f_regression, k=k).fit(x_scaled, y)

    # boolean support mask -> column labels
    return x.columns[selector.get_support()]

In [112]:
# Same top-2 selection as above, now through the helper.
kbest_features(x_scaled=X_train_scaled, x=X_train, y=y_train, k=2)

Index(['total_bill', 'size'], dtype='object')

In [115]:
def rfe_features(x_scaled, x, y, n, model=None):
    '''
    Pick n features by recursive feature elimination (RFE).

    x_scaled : feature values the RFE object is fit on (callers in this
               notebook pass the scaled training features), target excluded
    x : unscaled dataframe of the same features; only its column labels are
        read, and they are matched by position against the RFE mask, so the
        column order has to agree with x_scaled
    y : target values
    n : number of features to whittle down to
    model : estimator handed to RFE; when left as None a new
            LinearRegression() is built for this call

    Returns the entries of x.columns that RFE kept.
    '''
    # NOTE: this def rebinds the name `rfe_features`, which an earlier cell
    # assigned to an Index of selected columns.

    # Build the default inside the call: a default written in the signature
    # is evaluated once, when the def runs, and that single estimator object
    # would then be shared by every call that omits `model`.
    if model is None:
        model = LinearRegression()

    # creating RFE object with {n} selected features
    rfe = RFE(estimator=model, n_features_to_select=n)

    # fitting object
    rfe.fit(x_scaled, y)

    # assigning features to var
    features = x.columns[rfe.get_support()]

    return features

In [116]:
# Same top-2 RFE selection as above, now through the helper (default estimator).
rfe_features(x_scaled=X_train_scaled, x=X_train, y=y_train, n=2)

Index(['total_bill', 'tip_percentage'], dtype='object')

In [118]:
# Load the 'swiss' dataset through pydataset and preview the first rows.
swiss = pydataset.data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [121]:
# Features: every non-object column except the target `Fertility`.
X = swiss.loc[:, swiss.dtypes != object].drop(columns='Fertility')

# Target: Fertility.
y = swiss['Fertility']

In [122]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [123]:
# Top 3 features for Fertility according to SelectKBest.
kbest_features(x_scaled=X_train_scaled, x=X_train, y=y_train, k=3)

Index(['Examination', 'Education', 'Catholic'], dtype='object')

In [124]:
# Top 3 features for Fertility according to RFE (default estimator).
rfe_features(x_scaled=X_train_scaled, x=X_train, y=y_train, n=3)

Index(['Agriculture', 'Education', 'Catholic'], dtype='object')