In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydataset
from sklearn.base import clone
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import evaluate as ev

# Load the 'tips' dataset from pydataset; the cells below index it by column
# name (tip, total_bill, size, ...).
tips = pydataset.data('tips')

In [2]:
# Add two derived columns, each rounded to 2 decimal places:
#   tip_percentage   -> tip divided by total_bill
#   price_per_person -> total_bill divided by the size column
tips['tip_percentage'] = (tips['tip'] / tips['total_bill']).round(2)
tips['price_per_person'] = (tips['total_bill'] / tips['size']).round(2)
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.06,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,0.16,3.45
3,21.01,3.5,Male,No,Sun,Dinner,3,0.17,7.0
4,23.68,3.31,Male,No,Sun,Dinner,2,0.14,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.15,6.15


#### Which features do you think would be most important for predicting the tip amount?
- total_bill and tip_percentage

In [27]:
# Little bit of data prep
X = tips[[col for col in tips.columns if tips[col].dtype != object]].drop(columns = 'tip')
y = tips.tip

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=123
)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Score every scaled training feature against y_train with f_regression and
# keep the 2 highest-scoring ones.
kbest = SelectKBest(score_func=f_regression, k=2).fit(X_train_scaled, y_train)

# Positions of the selected features, used to look up both their names and
# their scores.
kbest_features = X_train.columns[kbest.get_support(indices=True)]
kbest_scores = kbest.scores_[kbest.get_support(indices=True)]


In [6]:
# Display the scores of the selected features alongside their column names.
kbest_scores, kbest_features

(array([172.91364957,  65.27477   ]),
 Index(['total_bill', 'size'], dtype='object'))

#### What are the 2 best features?

In [7]:
# Column names of the training frame (kept for reference; not used below).
features = X_train.columns.tolist()

# Print the f_regression score of every candidate feature, in column order.
for i, score in enumerate(kbest.scores_):
    print('Feature %d: %f' % (i, score))

# Cell output: summary string naming the features SelectKBest kept.
f'The top {len(kbest_features)} features from kbest are: {kbest_features}'

Feature 0: 172.913650
Feature 1: 65.274770
Feature 2: 29.764992
Feature 3: 26.729285


"The top 2 features from kbest are: Index(['total_bill', 'size'], dtype='object')"

### Recursive Feature Elimination

In [28]:
# Repeats the earlier split/scale cell with identical arguments and seed, so
# given the same X and y it recreates the same train/test variables.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
# Scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
# Recursive feature elimination with a linear regression estimator, keeping
# 2 features of the scaled training data.
rfe = RFE(
    estimator=LinearRegression(),
    n_features_to_select=2,
).fit(X_train_scaled, y_train)

# Map the boolean selection mask back onto the original column names.
rfe_features = X_train.columns[rfe.support_]

#### What are the 2 best features?

In [10]:
# Column names of the training frame (assigned but not used in this cell).
features = list(X_train.columns)
# Cell output: summary string naming the features RFE kept.
f'The top {len(rfe_features)} features from RFE are: {rfe_features}'

"The top 2 features from RFE are: Index(['total_bill', 'tip_percentage'], dtype='object')"

In [11]:
# Different feats
print(f'The top {len(kbest_features)} features from kbest are: {list(kbest_features)}')
print(f'The top {len(rfe_features)} features from RFE are: {list(rfe_features)}')

The top 2 features from kbest are: ['total_bill', 'size']
The top 2 features from RFE are: ['total_bill', 'tip_percentage']


### Functions

In [12]:
def kbest_features(df, target, k, show_scores = False, stratify = False, scaler_type = StandardScaler()):
    '''
    Takes a dataframe and uses SelectKBest (with the f_regression score
    function) to select the most relevant drivers of target.

    Parameters:
    -----------
    df : Unscaled dataframe that contains the target column

    target : Name of the target column in df

    k : Number of features to select

    show_scores : False by default. If truthy, returns a dataframe of the
            selected features and their F-scores instead of a list

    stratify : No stratification by default. If truthy, stratifies on
            the target during the train/test split

    scaler_type : Default is StandardScaler, determines the type
            of scaling applied to the training split. The object passed
            in is cloned, so it is never fitted or modified by this call
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Output:
    -------------
    show_scores falsy : a list of the (k) feature names SelectKBest selected,
            in their original column order.
    show_scores truthy : a dataframe with columns 'Feat_names' and
            'F_Scores', sorted by score (descending), then name.
    '''

    # Separate the target first (so a non-numeric target cannot break the
    # drop), then keep only numeric/boolean predictors; category, datetime
    # and string columns cannot be scaled and are left out.
    X = df.drop(columns = target).select_dtypes(include = ['number', 'bool'])
    y = df[target]

    # train/test split; only the training portion is used for selection.
    # Any truthy stratify value stratifies on the target, anything else
    # performs a plain random split.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=.2,
                                              random_state=123,
                                              stratify=y if stratify else None)

    # Work on a fresh, unfitted copy of the scaler so neither the shared
    # default instance nor a caller-supplied scaler is mutated.
    scaler = clone(scaler_type)
    X_train_scaled = scaler.fit_transform(X_train)

    # creating SelectKBest object with {k} selected features and fitting it
    kbest = SelectKBest(f_regression, k= k)
    kbest.fit(X_train_scaled, y_train)

    # boolean mask of the selected columns, reused for names and scores
    support = kbest.get_support()
    features = X.columns[support]

    if show_scores:
        # pair each selected feature with its F-score and rank them
        fs_df = pd.DataFrame({'Feat_names': features,
                              'F_Scores': kbest.scores_[support]})
        return fs_df.sort_values(['F_Scores','Feat_names'], ascending = [False, True])

    return list(features)

In [13]:
def rfe_features(df, target, n, stratify = False, est_model = LinearRegression(), scaler_type = StandardScaler()):
    '''
    Takes a dataframe and uses Recursive Feature Elimination to select
    the most relevant drivers of target.

    Parameters:
    -----------
    df : Unscaled dataframe that contains the target column

    target : Name of the target column in df

    n : Number of features to select

    stratify : No stratification by default. If truthy, stratifies on
            the target during the train/test split

    est_model : Default is LinearRegression, determines the estimator
            used by the RFE function

    scaler_type : Default is StandardScaler, determines the type
            of scaling applied to the training split. The object passed
            in is cloned, so it is never fitted or modified by this call
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Output:
    -------------
    A list of features of (n) length that RFE has selected to be the
    main drivers of the target, in their original column order.
    '''
    # Separate the target first (so a non-numeric target cannot break the
    # drop), then keep only numeric/boolean predictors; category, datetime
    # and string columns cannot be scaled and are left out.
    X = df.drop(columns = target).select_dtypes(include = ['number', 'bool'])
    y = df[target]

    # train/test split; only the training portion is used for selection.
    # Any truthy stratify value stratifies on the target, anything else
    # performs a plain random split.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=.2,
                                              random_state=123,
                                              stratify=y if stratify else None)

    # Work on a fresh, unfitted copy of the scaler so neither the shared
    # default instance nor a caller-supplied scaler is mutated.
    scaler = clone(scaler_type)
    X_train_scaled = scaler.fit_transform(X_train)

    # creating RFE object with {n} selected features and fitting it
    rfe = RFE(estimator= est_model, n_features_to_select=n)
    rfe.fit(X_train_scaled, y_train)

    # map the selection mask back onto the column names
    return list(X.columns[rfe.get_support()])

### Swiss Dataset

In [14]:
# Load the 'swiss' dataset from pydataset and preview the first rows.
swiss = pydataset.data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [15]:
# Top 3 SelectKBest features for predicting Fertility, shown with their F-scores.
kbest_features(swiss, 'Fertility', 3, show_scores= True)

Unnamed: 0,Feat_names,F_Scores
1,Education,15.085392
0,Examination,14.806314
2,Catholic,9.737175


In [16]:
# Top 3 RFE features for predicting Fertility (default LinearRegression estimator).
rfe_features(swiss, 'Fertility', 3)

['Agriculture', 'Education', 'Catholic']

In [17]:
# Same selection via the imported evaluate module (ev).
# NOTE(review): presumably ev.kbest_features mirrors the notebook function above - confirm.
ev.kbest_features(swiss, 'Fertility', 3)

['Examination', 'Education', 'Catholic']

In [18]:
# Same selection via the imported evaluate module (ev).
# NOTE(review): presumably ev.rfe_features mirrors the notebook function above - confirm.
ev.rfe_features(swiss, 'Fertility', 3)

['Agriculture', 'Education', 'Catholic']