# Exercises
Do your work for this exercise in a jupyter notebook named feature_engineering within the regression-exercises repo. Add, commit, and push your work.

In [1]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pydataset import data
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor

import evaluate  # presumably the repo's local evaluate.py helper module — confirm

1. Load the tips dataset.



In [2]:
# Load the tips dataset from pydataset.
tips_df = data('tips')

In [3]:
# Display the documentation bundled with the tips dataset (column meanings and source).
data('tips', show_doc=True) 

tips

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Tipping data

### Description

One waiter recorded information about each tip he received over a period of a
few months working in one restaurant. He collected several variables:

### Usage

    data(tips)

### Format

A data frame with 244 rows and 7 variables

### Details

  * tip in dollars, 

  * bill in dollars, 

  * sex of the bill payer, 

  * whether there were smokers in the party, 

  * day of the week, 

  * time of day, 

  * size of the party. 

In all he recorded 244 tips. The data was reported in a collection of case
studies for business statistics (Bryant & Smith 1995).

### References

Bryant, P. G. and Smith, M (1995) _Practical Data Analysis: Case Studies in
Business Statistics_. Homewood, IL: Richard D. Irwin Publishing:




In [4]:
tips_df.head()  # preview the first rows of the raw data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
# a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.
# Note: the result is stored as a fraction of the bill (e.g. 0.16), not multiplied by 100.
tips_df['tip_percentage'] = tips_df['tip']/tips_df['total_bill'] # creating tip_percentage column

In [6]:
tips_df.head() # confirm tip_percentage now appears as a new column

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [7]:
# b. Create a column named price_per_person. This should be the total bill divided by the party size.

# The bill split evenly across everyone in the party.
tips_df['price_per_person'] = tips_df['total_bill']/tips_df['size'] # Making a price per person column

In [8]:
tips_df.head() # confirm price_per_person now appears as a new column

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


In [9]:
# c. Before using any of the methods discussed in the lesson, 
# which features do you think would be most important for predicting the tip amount? The tip percentage?

#Answer: I would think total bill and party size would be most important for tip amount and tip percentage.


In [10]:
# d. Use all the other numeric features to predict tip amount. 
#Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [11]:
tips_df.info()  # check dtypes and null counts to see which columns are numeric

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   tip_percentage    244 non-null    float64
 8   price_per_person  244 non-null    float64
dtypes: float64(4), int64(1), object(4)
memory usage: 19.1+ KB


In [12]:
def train_validate_test(df, target):
    """
    Split a dataframe into train, validate and test samples, then separate
    each sample into its predictors (X) and its target (y).

    Share of the full dataframe in each sample:
      - test:     20%
      - validate: 24% (30% of the remaining 80%)
      - train:    56% (70% of the remaining 80%)
    Both splits use random_state=123.

    Returns, in this order:
    X_train, y_train, X_validate, y_validate, X_test, y_test
    where each X is the sample with the `target` column dropped (dataframe)
    and each y is the `target` column of that sample (series).
    """
    # carve off the 20% test sample first, then split what is left 70/30
    remainder, test_sample = train_test_split(df, test_size=0.2, random_state=123)
    train_sample, validate_sample = train_test_split(
        remainder, test_size=0.3, random_state=123
    )

    # build the flat (X, y) sequence in train, validate, test order
    pieces = []
    for sample in (train_sample, validate_sample, test_sample):
        pieces.append(sample.drop(columns=[target]))
        pieces.append(sample[target])

    return tuple(pieces)

In [13]:
# Split with tip as the target; the engineered columns stay in X as candidate features.
X_train, y_train, X_validate, y_validate, X_test, y_test= train_validate_test(tips_df, 'tip')

In [14]:
X_train.head()  # sanity check: the tip column should no longer be among the predictors

Unnamed: 0,total_bill,sex,smoker,day,time,size,tip_percentage,price_per_person
19,16.97,Female,No,Sun,Dinner,3,0.206246,5.656667
173,7.25,Male,Yes,Sun,Dinner,2,0.710345,3.625
119,12.43,Female,No,Thur,Lunch,2,0.144811,6.215
29,21.7,Male,No,Sat,Dinner,2,0.198157,10.85
238,32.83,Male,Yes,Sat,Dinner,2,0.035638,16.415


In [15]:
# Columns with object (string) dtype; these are filtered out to get the numeric predictors.
object_cols= ['sex', 'smoker', 'day', 'time']

In [16]:
# keep every predictor that is not one of the object-dtype columns
numeric_cols = [name for name in X_train.columns if name not in object_cols]

numeric_cols

['total_bill', 'size', 'tip_percentage', 'price_per_person']

In [26]:
def min_max_scale(X_train, X_validate, X_test, numeric_cols):
    """
    Min-max scale the numeric columns of three dataframes.

    The scaler is fit on X_train[numeric_cols] only (that is where the mins
    and maxes come from); the fitted scaler is then used to transform
    X_train, X_validate and X_test.

    numeric_cols is a list of numeric column names, because the scaler can
    only work with numeric columns.

    Returns three dataframes (train, validate, test) that contain only
    numeric_cols, hold the scaled values, and keep the row index values of
    the dataframe they were built from.
    """
    # copy=True: the scaler works on a copy rather than normalizing in place
    scaler = MinMaxScaler(copy=True)
    scaler.fit(X_train[numeric_cols])

    def _scaled_frame(frame):
        # re-attach column names and the original row labels to the transformed values
        return pd.DataFrame(
            scaler.transform(frame[numeric_cols]),
            columns=numeric_cols,
            index=frame.index.values,
        )

    return _scaled_frame(X_train), _scaled_frame(X_validate), _scaled_frame(X_test)

In [27]:
# Scale the numeric predictors; the scaler is fit on the train split only.
X_train_scaled, X_validate_scaled, X_test_scaled= min_max_scale(X_train, X_validate, X_test, numeric_cols)

In [28]:
# SelectKBest: score each scaled feature against the tip amount with f_regression
# and keep the 2 highest-scoring ones.
f_selector = SelectKBest(score_func=f_regression, k=2)
f_selector.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fc13002f3a0>)

In [29]:
# Use the selector's support mask to read off the names of the selected columns.
mask = f_selector.get_support()
X_train_scaled.columns[mask]

Index(['total_bill', 'size'], dtype='object')

In [30]:
# Reduce the training predictors to the two SelectKBest features.
X_train_kbest = f_selector.transform(X_train_scaled)

# Fit a linear model on just those two features.
model = LinearRegression().fit(X_train_kbest, y_train)

In [31]:
# Recursive Feature Elimination (RFE)

model = LinearRegression().fit(X_train_scaled, y_train)
model.coef_

array([ 6.06110509,  0.75422556,  8.43886118, -0.16428038])

In [32]:
# Seed the tree so that tie-breaking between equally good splits, and therefore
# the reported importances, is reproducible across runs (same seed as the splits).
model = DecisionTreeRegressor(random_state=123).fit(X_train_scaled, y_train)
model.feature_importances_

array([0.54995094, 0.02376776, 0.40864854, 0.01763276])

In [33]:
# Recursive feature elimination with a linear model as the estimator,
# stopping once 2 features remain.
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=2)
rfe.fit(X_train_scaled, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [34]:
rfe.support_  # boolean mask over the columns: True marks a feature RFE kept

array([ True, False,  True, False])

In [35]:
X_train_scaled.columns[rfe.support_]  # names of the features RFE kept

Index(['total_bill', 'tip_percentage'], dtype='object')

In [36]:
# Answer: SelectKBest picks total_bill and size; recursive feature elimination (RFE) picks total_bill and tip_percentage.

In [37]:
# e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature 
# elimination to select the top 2 features. What are they?

In [None]:
y_col = 'tip_percentage'

# train / validate / test only exist as locals inside train_validate_test, so they
# cannot be referenced here. Build the tip_percentage splits through the helper,
# using "2"-suffixed names so the splits for the tip target are left untouched.
X_train2, y_train2, X_validate2, y_validate2, X_test2, y_test2 = train_validate_test(tips_df, y_col)

In [54]:
# Split again with tip_percentage as the target; the "2" suffix keeps the tip-target splits intact.
X_train2, y_train2, X_validate2, y_validate2, X_test2, y_test2= train_validate_test(tips_df, 'tip_percentage')

In [55]:
# Object (string) dtype columns to exclude when collecting the numeric predictors.
object_cols= ['sex', 'smoker', 'day', 'time']

In [56]:
# Numeric predictors for the tip_percentage target (tip_percentage itself is no longer in X).
numeric_cols2 = [col for col in X_train2.columns.values if col not in object_cols]

numeric_cols2

['total_bill', 'tip', 'size', 'price_per_person']

In [57]:
X_train2.head()  # tip is now a predictor and tip_percentage has been removed as the target

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
19,16.97,3.5,Female,No,Sun,Dinner,3,5.656667
173,7.25,5.15,Male,Yes,Sun,Dinner,2,3.625
119,12.43,1.8,Female,No,Thur,Lunch,2,6.215
29,21.7,4.3,Male,No,Sat,Dinner,2,10.85
238,32.83,1.17,Male,Yes,Sat,Dinner,2,16.415


In [None]:
# f. Why do you think select k best and recursive feature elimination
# might give different answers for the top features? Does this change as
# you change the number of features you are selecting?

2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).