In [71]:
import pandas as pd
import numpy as np
from scipy import stats
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns

import wrangle

import sklearn.preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression 
from sklearn.feature_selection import SelectKBest, f_regression, RFE

import warnings
warnings.filterwarnings('ignore')

In [55]:
# Load the `tips` dataset into a DataFrame.
# NOTE(review): `data` is not defined by the import cell above — presumably
# `from pydataset import data` was run in a cell that no longer exists.
# TODO: add the import so the notebook survives Restart & Run All.
df = data('tips')

In [56]:
# Peek at the first rows to get familiar with the column names and values
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [57]:
# Rename `size` to `party`; presumably so the column can be reached with
# attribute access (df.party) without colliding with DataFrame.size.
df = df.rename({'size': 'party'}, axis='columns')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,party
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [59]:
# Engineer two ratio columns to help with modeling:
#   tip_percentage   - tip as a fraction of the bill (e.g. 0.16, not 16)
#   price_per_person - bill divided evenly across the party
df['tip_percentage'] = df['tip'] / df['total_bill']
df['price_per_person'] = df['total_bill'] / df['party']

In [60]:
# Confirm the two engineered columns were added
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,party,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


##### - I would think that important features for predicting tip amount would be total bill and party size.

In [61]:
# Keep only the numeric columns; the four categorical columns are discarded
categorical_columns = ['sex', 'smoker', 'day', 'time']
df = df.drop(columns=categorical_columns)

In [66]:
# Split the data into three parts for exploration and modeling.
# NOTE(review): the third part is bound to the name `split`; presumably it is
# the held-out test set — confirm against wrangle.split_data.
train, validate, split = wrangle.split_data(df)

In [67]:
# Row/column counts of each part, to check the relative sizes of the splits
train.shape, validate.shape, split.shape

((136, 5), (59, 5), (49, 5))

In [68]:
# Further split the data: separate the features (X) from the target (y = tip)
# for each partition. Every X/y pair must be built from its own partition;
# deriving validate/test from `train` would make any later evaluation a
# re-score of the training data.
# NOTE(review): tip_percentage is computed from tip (the target), so leaving
# it in X leaks the target into the features — confirm this is intended.
X_train = train.drop(columns=['tip'])
y_train = train['tip']

X_validate = validate.drop(columns=['tip'])
y_validate = validate['tip']

# `split` is the third partition returned by wrangle.split_data (the test set)
X_test = split.drop(columns=['tip'])
y_test = split['tip']

In [69]:
# Inspect the training features; `tip` should no longer be among the columns
X_train.head()

Unnamed: 0,total_bill,party,tip_percentage,price_per_person
19,16.97,3,0.206246,5.656667
173,7.25,2,0.710345,3.625
119,12.43,2,0.144811,6.215
29,21.7,2,0.198157,10.85
238,32.83,2,0.035638,16.415


In [83]:
# Create a min-max scaler and learn each feature's range from the
# training features only
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(X_train)

# Apply the train-fitted scaler to train, validate and test; each result is
# wrapped in a new DataFrame (column names are not carried over here)
X_train_scaled, X_validate_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(features))
    for features in (X_train, X_validate, X_test)
)

In [86]:
# Capture the feature names of the unscaled training frame as a plain list
cols = list(X_train.columns)
cols

['total_bill', 'party', 'tip_percentage', 'price_per_person']

In [89]:
# Give all three scaled frames the column names of the original features
X_train_scaled.columns = X_validate_scaled.columns = X_test_scaled.columns = cols

X_train_scaled.head()

Unnamed: 0,total_bill,party,tip_percentage,price_per_person
0,0.307114,0.4,0.252863,0.150344
1,0.092355,0.2,1.0,0.032258
2,0.206805,0.2,0.161808,0.182796
3,0.411622,0.2,0.240873,0.452194
4,0.657534,0.2,0.0,0.775647


In [91]:
# Use SelectKBest to pick the top two features for predicting tip

# create the selector (f_regression scoring, keep 2) and fit it on the
# unscaled training features
kbest = SelectKBest(f_regression, k=2).fit(X_train, y_train)

# get_support() is a boolean mask over the columns; keep the selected names
kbest_features = list(X_train.columns[kbest.get_support()])

print(f'Kbest_features for predicting tip are: {kbest_features}')

Kbest_features for predicting tip are: ['total_bill', 'party']


In [92]:
# Use recursive feature elimination (RFE) to pick the top two features

# create a linear regression estimator and an RFE selector around it, then
# fit the selector on the scaled training features
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=2).fit(X_train_scaled, y_train)

# translate the boolean support mask into the names of the kept columns
rfe_columns = list(X_train_scaled.columns[rfe.get_support()])
print(f'rfe_features for predicting tip with scaled columns are: {rfe_columns}')

rfe_features for predicting tip with scaled columns are: ['total_bill', 'tip_percentage']


#### There are two different approaches when it comes to finding which features are best at predicting.
SelectKBest scores each feature individually with a statistical test against the target variable (here `f_regression`) and uses those scores to decide which features to keep. 
RFE (recursive feature elimination) is a more in-depth process.  
It fits a model on all of the features, removes the weakest feature, and repeats until you are left with the desired number of features.