In [1]:
#Importing required packages and files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data


#These imports were constructed
from wrangle import train_validate

#Tools to build machine learning models and reports
import sklearn.preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression, RFE, SelectKBest

#Removes warnings and imporves asthenics
import warnings
warnings.filterwarnings("ignore")

#Sets an option to show all columns
pd.options.display.max_columns = None
pd.options.display.width = 100
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 200


1. Load the tips dataset.

In [2]:
tips = sns.load_dataset('tips')

In [3]:
#Found schema on kaggle site
schema = pd.read_excel('zillow_data_dictionary.xlsx')
#schema
schema.loc[[4, 3, 11, 51, 50, 54, 17]]

Unnamed: 0,Feature,Description
4,'bedroomcnt',Number of bedrooms in home
3,'bathroomcnt',Number of bathrooms in home including fractional bathrooms
11,'calculatedfinishedsquarefeet',Calculated total finished living area of the home
51,'taxvaluedollarcnt',The total tax assessed value of the parcel
50,'yearbuilt',The Year the principal residence was built
54,'taxamount',The total property tax assessed for that assessment year
17,'fips',Federal Information Processing Standard code - see https://en.wikipedia.org/wiki/FIPS_county_code for more details


In [4]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


    a. Create a column named `price_per_person`. This should be the total bill divided by the party size.

In [6]:
tips['price_per_person'] = tips['total_bill'] / tips['size']

In [7]:
#Create dummy columns of my categorical features
tips_dummy = pd.get_dummies(columns=['sex', 'smoker', 'day', 'time', 'size'], data=tips)
tips_dummy.head()

Unnamed: 0,total_bill,tip,price_per_person,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner,size_1,size_2,size_3,size_4,size_5,size_6
0,16.99,1.01,8.495,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,0
1,10.34,1.66,3.446667,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0
2,21.01,3.5,7.003333,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0
3,23.68,3.31,11.84,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0
4,24.59,3.61,6.1475,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0


In [8]:
tips_dummy.drop(columns=['sex_Female', 'smoker_No'], inplace=True)
tips_dummy.head()

Unnamed: 0,total_bill,tip,price_per_person,sex_Male,smoker_Yes,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner,size_1,size_2,size_3,size_4,size_5,size_6
0,16.99,1.01,8.495,0,0,0,0,0,1,0,1,0,1,0,0,0,0
1,10.34,1.66,3.446667,1,0,0,0,0,1,0,1,0,0,1,0,0,0
2,21.01,3.5,7.003333,1,0,0,0,0,1,0,1,0,0,1,0,0,0
3,23.68,3.31,11.84,1,0,0,0,0,1,0,1,0,1,0,0,0,0
4,24.59,3.61,6.1475,0,0,0,0,0,1,0,1,0,0,0,1,0,0


In [9]:
train, val, test = train_validate(tips_dummy)

In [10]:
train.columns

Index(['total_bill', 'tip', 'price_per_person', 'sex_Male', 'smoker_Yes', 'day_Thur', 'day_Fri',
       'day_Sat', 'day_Sun', 'time_Lunch', 'time_Dinner', 'size_1', 'size_2', 'size_3', 'size_4',
       'size_5', 'size_6'],
      dtype='object')

In [11]:
cols = ['tip', 'sex', 'smoker', 'day', 'time', 'size']

In [12]:
scale_cols = ['total_bill', 'price_per_person']

In [13]:
train_scaled = train[['tip', 'sex_Male', 'smoker_Yes', 'day_Thur', 'day_Fri',
       'day_Sat', 'day_Sun', 'time_Lunch', 'time_Dinner', 'size_1', 'size_2', 'size_3', 'size_4',
       'size_5', 'size_6']]
val_scaled = val[['tip', 'sex_Male', 'smoker_Yes', 'day_Thur', 'day_Fri',
       'day_Sat', 'day_Sun', 'time_Lunch', 'time_Dinner', 'size_1', 'size_2', 'size_3', 'size_4',
       'size_5', 'size_6']]
test_scaled = test[['tip', 'sex_Male', 'smoker_Yes', 'day_Thur', 'day_Fri',
       'day_Sat', 'day_Sun', 'time_Lunch', 'time_Dinner', 'size_1', 'size_2', 'size_3', 'size_4',
       'size_5', 'size_6']]

In [14]:
#Create the scaler object
mm_scaler = sklearn.preprocessing.MinMaxScaler()

#Fit the scaler to the data
#Using 'fit' and 'transform' seperatly was causing 59 rows to become null. Unsure why.
train_scaled[['total_bill', 'price_per_person']] = mm_scaler.fit_transform(train[['total_bill', 'price_per_person']])
val_scaled[['total_bill', 'price_per_person']] = mm_scaler.fit_transform(val[['total_bill', 'price_per_person']])
test_scaled[['total_bill', 'price_per_person']] = mm_scaler.fit_transform(test[['total_bill', 'price_per_person']])


In [15]:
train_scaled.head()

Unnamed: 0,tip,sex_Male,smoker_Yes,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner,size_1,size_2,size_3,size_4,size_5,size_6,total_bill,price_per_person
129,2.18,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0.442802,0.271935
108,3.76,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0.323995,0.358908
228,2.72,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0.195331,0.216379
203,2.5,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0.276265,0.306034
98,3.0,1,1,0,1,0,0,0,1,0,1,0,0,0,0,0.39585,0.438506


    b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

My guess is tips['total_bill']

    c. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [16]:
def train_val_test(train, val, test):
    """
    Seperates out the target variable and creates a series with only the target variable to test accuracy.
    """
    #Seperating out the target variable
    X_train = train.drop(columns=['tip'])
    y_train = train.tip

    X_val = val.drop(columns = ['tip'])
    y_val = val.tip

    X_test = test.drop(columns = ['tip'])
    y_test = test.tip
    return X_train, y_train, X_val, y_val, X_test, y_test

In [17]:
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test(train_scaled, val_scaled, test_scaled)

In [18]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 129 to 80
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   sex_Male          136 non-null    uint8  
 1   smoker_Yes        136 non-null    uint8  
 2   day_Thur          136 non-null    uint8  
 3   day_Fri           136 non-null    uint8  
 4   day_Sat           136 non-null    uint8  
 5   day_Sun           136 non-null    uint8  
 6   time_Lunch        136 non-null    uint8  
 7   time_Dinner       136 non-null    uint8  
 8   size_1            136 non-null    uint8  
 9   size_2            136 non-null    uint8  
 10  size_3            136 non-null    uint8  
 11  size_4            136 non-null    uint8  
 12  size_5            136 non-null    uint8  
 13  size_6            136 non-null    uint8  
 14  total_bill        136 non-null    float64
 15  price_per_person  136 non-null    float64
dtypes: float64(2), uint8(14)
memory usage: 5.0 

In [19]:
# parameters: f_regression stats test, give me 2 features
f_selector = SelectKBest(f_regression, k=2)

# find the top 2 X's correlated with y
X_two = f_selector.fit_transform(X_train, y_train)

# get list of top K features. 
feature_mask = f_selector.get_support()

X_train.columns[feature_mask]

Index(['size_4', 'total_bill'], dtype='object')

    d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [20]:
#Build the model
lm = LinearRegression()
#Create the RFE object
rfe = RFE(lm, n_features_to_select=2)
#Fit the object and transform the DataFrame
rfe.fit_transform(X_train, y_train)
#Create the rankings table
var_ranks = rfe.ranking_
var_names = X_train.columns.tolist()
feature_ranks = pd.DataFrame({'Var':var_names, 'Rank': var_ranks})

In [21]:
feature_ranks.sort_values('Rank').head(2)

Unnamed: 0,Var,Rank
14,total_bill,1
15,price_per_person,1


    e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

One goes through every option while the other samples randomly.

2. Write a function named `select_kbest` that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the `SelectKBest` class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [22]:
def select_kbest(X_train_scaled, y_train, k=2):
    """
    Takes in the scaled x_train, y_train data frames and the number(k) of features that you want and returns the top k features.
    """
    
    f_selector = SelectKBest(f_regression, k=2)
    f_selector.fit(X_train_scaled, y_train)
    X_reduced = f_selector.transform(X_train_scaled)

    print(X_train_scaled.shape)
    print(X_reduced.shape)

    f_support = f_selector.get_support()

    print(f_support) 
    f_feature = X_train_scaled.loc[:,f_support].columns.tolist()

    print(str(len(f_feature)), 'selected features')
    print(f_feature)


In [29]:
select_kbest(X_train, y_train, k=4)

(136, 16)
(136, 2)
[False False False False False False False False False False False  True
 False False  True False]
2 selected features
['size_4', 'total_bill']


3. Write a function named `rfe` that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the `tips` dataset. You should see the same results as when you did the process manually.

In [23]:
def rfe(X_train_scaled , y_train, k=2):
    lm = LinearRegression()
    rfe = RFE(lm, n_features_to_select=2)

    # Transforming data using RFE
    X_rfe = rfe.fit_transform(X_train_scaled,y_train)  

    #Fitting the data to model
    lm.fit(X_rfe,y_train)

    mask = rfe.support_

    rfe_features = X_train_scaled.loc[:,mask].columns.tolist()

    print(str(len(rfe_features)), 'selected features')
    print(rfe_features)


In [30]:
rfe(X_train, y_train, k=4)

2 selected features
['total_bill', 'price_per_person']


4. Load the `swiss` dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [24]:
#Load in the swiss dataset
swiss = data('swiss')

swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [25]:
#Create X and y dataframes
X = swiss.drop(columns=['Fertility'])
y = swiss['Fertility']

In [26]:
#Create a list of columns
X_columns = list(X.columns)

In [27]:
#Create a scaler and fit/transform the columns
mms2 = sklearn.preprocessing.MinMaxScaler()

X[X_columns] = mms2.fit_transform(X[X_columns])

X.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,0.178531,0.352941,0.211538,0.079816,0.721519
Delemont,0.496045,0.088235,0.153846,0.845069,0.721519
Franches-Mnt,0.435028,0.058824,0.076923,0.93255,0.594937
Moutier,0.39887,0.264706,0.115385,0.323148,0.601266
Neuveville,0.477966,0.411765,0.269231,0.030761,0.620253


In [28]:
#%whos