In [63]:
import pandas as pd

from pydataset import data



from sklearn.model_selection import train_test_split


# module used to test multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [64]:
# Load the "tips" sample dataset that ships with pydataset.
tip_df = data("tips")

# Summarise the column dtypes and non-null counts before any preprocessing.
tip_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


In [65]:
# Preview the first rows to see how the raw values (including the text columns) look.

tip_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


**Takeaways**
- Some categorical variables are stored as non-numeric (object) types and can be converted to numeric.
- We might have to scale the data so that each variable contributes equally to the model.


In [66]:
# One-hot encode the two-level categoricals and append the indicators to the raw frame.
# drop_first=True keeps a single indicator per variable (sex_Male, smoker_Yes, time_Lunch).
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 otherwise emits booleans,
# which display as True/False and make mixed-dtype `.values` arrays object-typed.
tips = pd.concat(
    [
        tip_df,
        pd.get_dummies(tip_df[['sex', 'smoker', 'time']], drop_first=True, dtype=int),
    ],
    axis=1,
)

In [67]:
# The text columns are now represented by their indicator columns, so remove them.
tips = tips.drop(['sex', 'smoker', 'time'], axis='columns')

In [68]:
# Display the frame after sex/smoker/time were replaced by their indicator columns.
tips

Unnamed: 0,total_bill,tip,day,size,sex_Male,smoker_Yes,time_Lunch
1,16.99,1.01,Sun,2,0,0,0
2,10.34,1.66,Sun,3,1,0,0
3,21.01,3.50,Sun,3,1,0,0
4,23.68,3.31,Sun,2,1,0,0
5,24.59,3.61,Sun,4,0,0,0
...,...,...,...,...,...,...,...
240,29.03,5.92,Sat,3,1,0,0
241,27.18,2.00,Sat,2,0,1,0
242,22.67,2.00,Sat,2,1,1,0
243,17.82,1.75,Sat,2,1,0,0


- Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [69]:
# Tip as a share of the bill; stored as a fraction (tip / total_bill), not multiplied by 100.
tips["tip_percentage"] = tips["tip"] / tips["total_bill"]


- Create a column named price_per_person. This should be the total bill divided by the party size.

In [70]:
# Bill amount per diner: total bill divided by party size.
# Bracket access is required for "size" because `tips.size` is a DataFrame attribute.
tips["price_per_person"] = tips["total_bill"] / tips["size"]


In [71]:
# Encode the day of week as one indicator column per day and append them to the frame.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 otherwise emits booleans,
# which display as True/False and make mixed-dtype `.values` arrays object-typed.
# NOTE(review): unlike the sex/smoker/time encoding, no level is dropped here, so the
# day indicators always sum to 1 per row; the VIF step further down drops 'Thur'.
tips = pd.concat([tips, pd.get_dummies(tips['day'], dtype=int)], axis=1)

In [72]:
# The text "day" column is now represented by its indicator columns, so remove it.
tips = tips.drop('day', axis='columns')

In [73]:
# Display the fully numeric frame, including the two engineered columns and the day indicators.
tips

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,time_Lunch,tip_percentage,price_per_person,Fri,Sat,Sun,Thur
1,16.99,1.01,2,0,0,0,0.059447,8.495000,0,0,1,0
2,10.34,1.66,3,1,0,0,0.160542,3.446667,0,0,1,0
3,21.01,3.50,3,1,0,0,0.166587,7.003333,0,0,1,0
4,23.68,3.31,2,1,0,0,0.139780,11.840000,0,0,1,0
5,24.59,3.61,4,0,0,0,0.146808,6.147500,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
240,29.03,5.92,3,1,0,0,0.203927,9.676667,0,1,0,0
241,27.18,2.00,2,0,1,0,0.073584,13.590000,0,1,0,0
242,22.67,2.00,2,1,1,0,0.088222,11.335000,0,1,0,0
243,17.82,1.75,2,1,0,0,0.098204,8.910000,0,1,0,0


In [74]:
# Two-stage split: hold out 15% of all rows as the test set, then hold out 15% of
# the remainder for validation. The fixed random_state keeps both splits reproducible.
train_validate, test = train_test_split(tips, test_size=0.15, random_state=123)
train, validate = train_test_split(train_validate, test_size=0.15, random_state=123)

# Report the (rows, columns) shape of each partition.
tuple(part.shape for part in (train, test, validate))

((175, 12), (37, 12), (32, 12))

# Checking for multicollinearity between the variables

In [75]:
# creating a function that outputs a dataframe with a column of variable names and column with their 
# variance inflation factor


# The variance inflation factor (VIF) measures how well a variable can be explained by the
# other variables, i.e. how strongly it is correlated with them.



def vif(x):
    """Tabulate the variance inflation factor (VIF) of every column in ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Candidate predictor columns. Every column must be convertible to
        float (numeric or boolean indicator columns).

    Returns
    -------
    pandas.DataFrame
        One row per column of ``x``: ``Variables`` holds the column name and
        ``VIF`` the value ``variance_inflation_factor`` returns for it.
    """
    # Cast once to a float matrix. A frame mixing boolean indicators with
    # numeric columns would otherwise produce an object-dtype array from
    # ``.values``, which the numeric fit behind the VIF typically rejects.
    design = x.astype(float).to_numpy()

    # NOTE(review): the design is passed through as given, with no intercept
    # column added here -- confirm whether a constant should be included.
    scores = [variance_inflation_factor(design, col) for col in range(design.shape[1])]

    # Named ``report`` so the local no longer shadows the function name.
    report = pd.DataFrame({'Variables': x.columns, 'VIF': scores})
    return report


In [76]:
# Separate the predictors from the target: every column except "tip" is a predictor,
# and the target is kept as a one-column DataFrame.
# NOTE(review): tip_percentage is computed from tip, so leaving it in x_train leaks the
# target if these features are later used to predict tip -- confirm this is intended.
x_train = train.drop('tip', axis='columns')
y_train = train.loc[:, ['tip']]

In [77]:
# Check multicollinearity among the independent variables: one VIF score per predictor.

vif(x_train)

Unnamed: 0,Variables,VIF
0,total_bill,18.565164
1,size,11.825821
2,sex_Male,1.109787
3,smoker_Yes,1.211814
4,time_Lunch,8.872043
5,tip_percentage,1.239107
6,price_per_person,11.926037
7,Fri,9.001133
8,Sat,36.364606
9,Sun,35.127854


**Variables with a VIF value greater than 5 are considered to exhibit multicollinearity**

In [78]:
# Reduce the VIF scores by dropping redundant predictors: 'Thur' and 'Sat' are day
# indicators, and total_bill / size are the two inputs price_per_person was derived from.
df1 = x_train.drop(['Thur', 'Sat', 'total_bill', 'size'], axis='columns')

In [79]:
# Recompute the VIF scores on the reduced feature set to compare against the threshold of 5.
vif(df1)

Unnamed: 0,Variables,VIF
0,sex_Male,2.685086
1,smoker_Yes,2.041642
2,time_Lunch,1.61972
3,tip_percentage,4.514289
4,price_per_person,4.641482
5,Fri,1.199355
6,Sun,1.937633
