<h2 id="exercises">Exercises</h2>
<p>Do your work for this exercise in a jupyter notebook named <code>feature_engineering</code> within the <code>regression-exercises</code> repo. Add, commit, and push your work.</p>


In [1]:
from pydataset import data
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import sklearn.linear_model
import sklearn.feature_selection
import sklearn.preprocessing

<ol>
<li>
<p>Load the <code>tips</code> dataset.</p>
<ol>


In [2]:
tips = data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
tips.shape

(244, 7)

In [4]:
tips = tips.rename(columns={'size': 'number_of_people'})

a. Create a column named <code>tip_percentage</code>. This should be the tip amount divided by the total bill.

In [5]:
tips["tip_percentage"] = tips.tip / tips.total_bill


In [6]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,number_of_people,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


b. Create a column named <code>price_per_person</code>. This should be the total bill divided by the party size.


In [8]:
tips["price_per_person"] = tips.total_bill / tips.number_of_people
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,number_of_people,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?


In [10]:
# Select only numeric features

tips = tips[['total_bill', 'tip', 'number_of_people', 'tip_percentage', 'price_per_person']]
tips.head()

Unnamed: 0,total_bill,tip,number_of_people,tip_percentage,price_per_person
1,16.99,1.01,2,0.059447,8.495
2,10.34,1.66,3,0.160542,3.446667
3,21.01,3.5,3,0.166587,7.003333
4,23.68,3.31,2,0.13978,11.84
5,24.59,3.61,4,0.146808,6.1475


In [11]:
tips.dtypes

total_bill          float64
tip                 float64
number_of_people      int64
tip_percentage      float64
price_per_person    float64
dtype: object

d. Use select k best and recursive feature elimination to select the top 2 features for predicting tip amount. What are they?

In [12]:
def split(df):
    '''
    take in a DataFrame and return train, validate, and test DataFrames.
    return train, validate, test DataFrames.
    '''
    train_validate, test = train_test_split(df, test_size=.2, random_state=123)
    train, validate = train_test_split(train_validate, 
                                       test_size=.3, 
                                       random_state=123)
    return train, validate, test

In [13]:
# Split the data
train, validate, test = split(tips)

In [15]:
print(train.shape)
print(validate.shape)
print(test.shape)

(136, 5)
(59, 5)
(49, 5)


In [16]:
target = "tip"


In [17]:
# split train into X /Y

X_train = train.drop(columns=[target])
y_train = train[target]

In [18]:
# Same validata and test

X_validate = validate.drop(columns=[target])
y_validate = validate[target]
X_test = test.drop(columns=[target])
y_test = test[target]

In [19]:
X_train.head()

Unnamed: 0,total_bill,number_of_people,tip_percentage,price_per_person
19,16.97,3,0.206246,5.656667
173,7.25,2,0.710345,3.625
119,12.43,2,0.144811,6.215
29,21.7,2,0.198157,10.85
238,32.83,2,0.035638,16.415


In [26]:
#SCALE

scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

In [27]:
X_train_scaled


array([[0.30711445, 0.4       , 0.25286274, 0.15034389],
       [0.09235528, 0.2       , 1.        , 0.03225806],
       [0.20680513, 0.2       , 0.1618078 , 0.1827957 ],
       [0.41162174, 0.2       , 0.24087288, 0.45219413],
       [0.65753425, 0.2       , 0.        , 0.77564661],
       [0.78789218, 0.6       , 0.06198426, 0.38433595],
       [0.44410075, 0.6       , 0.36296815, 0.15823888],
       [0.3804684 , 0.2       , 0.18166098, 0.41121767],
       [0.31794079, 0.2       , 0.16279257, 0.32897414],
       [0.40720283, 0.6       , 0.18845606, 0.13397268],
       [0.1979673 , 0.2       , 0.13198349, 0.17117117],
       [0.19774635, 0.2       , 0.19009056, 0.17088056],
       [0.34710561, 0.2       , 0.18394107, 0.36733508],
       [0.32744145, 0.2       , 0.11287299, 0.3414705 ],
       [0.31816173, 0.2       , 0.24411394, 0.32926475],
       [0.69730446, 0.2       , 0.09911586, 0.82795699],
       [0.71122404, 0.6       , 0.15735076, 0.33391456],
       [0.33517455, 0.2       ,

In [28]:
k = 2

In [29]:
# Select K Best

kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=2)

kbest.fit(X_train, y_train)

kbest_features = X_train.columns[kbest.get_support()].tolist()

print(kbest_features)


['total_bill', 'number_of_people']


e. Use select k best and recursive feature elimination to select the top 2 features for predicting tip percentage. What are they?

In [30]:
lm = sklearn.linear_model.LinearRegression()
rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=2)

rfe.fit(X_train, y_train)

rfe_columns = X_train.columns[rfe.support_].tolist()
rfe_columns


['number_of_people', 'tip_percentage']

f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

2. Write a function named <code>select_kbest</code> that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the <code>SelectKBest</code> class. Test your function with the <code>tips</code> dataset. You should see the same results as when you did the process manually.

In [31]:
def select_kbest(X, y, k):
    kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=k)
    kbest.fit(X, y)
    return X.columns[kbest.get_support()].tolist()

select_kbest(X_train, y_train, 2)

['total_bill', 'number_of_people']

3. Write a function named <code>rfe</code> that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the <code>RFE</code> class. Test your function with the <code>tips</code> dataset. You should see the same results as when you did the process manually.

In [32]:
def show_features_rankings(X_train, rfe):
    """
    Takes in a dataframe and a fit RFE object in order to output the rank of all features
    """
    # rfe here is reference rfe from cell 15
    var_ranks = rfe.ranking_
    var_names = X_train.columns.tolist()
    ranks = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
    ranks = ranks.sort_values(by="Rank", ascending=True)
    return ranks

In [33]:
def select_rfe(predictors, target, k):
    # make the thing
    lm = sklearn.linear_model.LinearRegression()
    rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=k)

    # Fit the thing
    rfe.fit(predictors, target)
    
    # use the thing
    features_to_use = predictors.columns[rfe.support_].tolist()
    
    # we need to send show_feature_rankings a trained/fit RFE object
    all_rankings = show_features_rankings(predictors, rfe)
    
    return features_to_use, all_rankings

In [34]:
select_rfe(X_train, y_train, 2)

(['number_of_people', 'tip_percentage'],
                 Var  Rank
 1  number_of_people     1
 2    tip_percentage     1
 0        total_bill     2
 3  price_per_person     3)

4. Load the <code>swiss</code> dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [35]:
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [36]:
train, validate, test = split(swiss)


In [37]:
X_train = train.drop(columns='Fertility')
y_train = train.Fertility

X_validate = validate.drop(columns='Fertility')
y_validate = validate.Fertility

X_test = test.drop(columns='Fertility')
y_test = test.Fertility

In [45]:
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_validate_scaled = pd.DataFrame(scaler.transform(X_validate), columns=X_validate.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [46]:
X_train_scaled.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
0,0.647561,0.40625,0.290323,0.054508,0.122449
1,0.796341,0.5,0.258065,0.004508,0.5
2,0.526829,0.59375,0.354839,0.130533,0.163265
3,0.953659,0.0,0.032258,0.997029,0.0
4,0.509756,0.375,0.225806,0.03791,0.755102


In [47]:
X_validate_scaled.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
0,0.578049,0.34375,0.064516,0.021721,0.744898
1,0.791463,0.46875,0.032258,0.223361,0.622449
2,0.35122,0.28125,0.193548,0.321414,0.530612
3,0.635366,0.59375,0.290323,0.028996,0.295918
4,0.937805,0.0,0.064516,0.994467,0.122449


In [48]:
X_test_scaled.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
0,0.556098,0.28125,0.193548,0.976127,0.602041
1,0.662195,0.5625,0.354839,0.062705,0.142857
2,-0.079268,1.0625,1.677419,0.409221,0.295918
3,0.091463,0.875,0.612903,-0.002561,-0.438776
4,0.134146,0.6875,0.193548,0.064037,0.44898


In [49]:
print(X_train_scaled.shape)
print(X_validate_scaled.shape)
print(X_test_scaled.shape)

(25, 5)
(12, 5)
(10, 5)


In [50]:
select_kbest(X_train_scaled, y_train, 3)

['Examination', 'Catholic', 'Infant.Mortality']

In [51]:
selected_features, all_rankings = select_rfe(X_train, y_train, 3)
print(selected_features)
all_rankings

['Agriculture', 'Examination', 'Infant.Mortality']


Unnamed: 0,Var,Rank
0,Agriculture,1
1,Examination,1
4,Infant.Mortality,1
2,Education,2
3,Catholic,3
