In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
tips=pd.read_csv('tipsDATA.csv')

In [3]:
tips.head()

Unnamed: 0,total_bill,tip,gender,smoker,day,time,size
0,2125.5,360.79,Male,No,Thur,Lunch,1
1,2727.18,259.42,Female,No,Sun,Dinner,5
2,1066.02,274.68,Female,Yes,Thur,Dinner,4
3,3493.45,337.9,Female,No,Sun,Dinner,1
4,3470.56,567.89,Male,Yes,Sun,Lunch,6


In [4]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  744 non-null    float64
 1   tip         744 non-null    float64
 2   gender      744 non-null    object 
 3   smoker      744 non-null    object 
 4   day         744 non-null    object 
 5   time        744 non-null    object 
 6   size        744 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 40.8+ KB


In [6]:
tips.describe

<bound method NDFrame.describe of      total_bill     tip  gender smoker   day    time  size
0       2125.50  360.79    Male     No  Thur   Lunch     1
1       2727.18  259.42  Female     No   Sun  Dinner     5
2       1066.02  274.68  Female    Yes  Thur  Dinner     4
3       3493.45  337.90  Female     No   Sun  Dinner     1
4       3470.56  567.89    Male    Yes   Sun   Lunch     6
..          ...     ...     ...    ...   ...     ...   ...
739     3164.27  645.28    Male     No   Sat  Dinner     3
740     2962.62  218.00  Female    Yes   Sat  Dinner     2
741     2471.03  218.00    Male    Yes   Sat  Dinner     2
742     1942.38  190.75    Male     No   Sat  Dinner     2
743     2047.02  327.00  Female     No  Thur  Dinner     2

[744 rows x 7 columns]>

In [7]:
tips.shape

(744, 7)

In [8]:
tips.isnull().sum()

total_bill    0
tip           0
gender        0
smoker        0
day           0
time          0
size          0
dtype: int64

In [11]:
tips.columns

Index(['total_bill', 'tip', 'gender', 'smoker', 'day', 'time', 'size'], dtype='object')

# PREPROCESSING

In [16]:
cat_features = tips.select_dtypes(include=['object']).columns

In [17]:
cat_features

Index(['gender', 'smoker', 'day', 'time'], dtype='object')

In [18]:
tips['gender']

0        Male
1      Female
2      Female
3      Female
4        Male
        ...  
739      Male
740    Female
741      Male
742      Male
743    Female
Name: gender, Length: 744, dtype: object

In [19]:
for cat_feature in cat_features:
    print(tips[cat_feature].unique())

['Male' 'Female']
['No' 'Yes']
['Thur' 'Sun' 'Mon' 'Sat' 'Wed' 'Tues' 'Fri']
['Lunch' 'Dinner']


In [21]:
#Encode features
tips['gender'].astype('category').cat.codes

0      1
1      0
2      0
3      0
4      1
      ..
739    1
740    0
741    1
742    1
743    0
Length: 744, dtype: int8

In [23]:
for cat_feature in cat_features:
    tips[cat_feature]=tips[cat_feature].astype('category').cat.codes

In [24]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  744 non-null    float64
 1   tip         744 non-null    float64
 2   gender      744 non-null    int8   
 3   smoker      744 non-null    int8   
 4   day         744 non-null    int8   
 5   time        744 non-null    int8   
 6   size        744 non-null    int64  
dtypes: float64(2), int64(1), int8(4)
memory usage: 20.5 KB


# Model Building

In [25]:
tips.head

<bound method NDFrame.head of      total_bill     tip  gender  smoker  day  time  size
0       2125.50  360.79       1       0    4     1     1
1       2727.18  259.42       0       0    3     0     5
2       1066.02  274.68       0       1    4     0     4
3       3493.45  337.90       0       0    3     0     1
4       3470.56  567.89       1       1    3     1     6
..          ...     ...     ...     ...  ...   ...   ...
739     3164.27  645.28       1       0    2     0     3
740     2962.62  218.00       0       1    2     0     2
741     2471.03  218.00       1       1    2     0     2
742     1942.38  190.75       1       0    2     0     2
743     2047.02  327.00       0       0    4     0     2

[744 rows x 7 columns]>

In [32]:
y=['tips']
X=tips.drop('tip', axis=1)

In [37]:
#data segmentation
y=tips['tip']
X=tips.drop('tip',axis=1)

#spliting the data

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.25,random_state=45)

In [38]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((558, 6), (186, 6), (558,), (186,))

In [40]:
#Training Linear Regression

from sklearn.linear_model import LinearRegression
#instantiate the model
lin_model=LinearRegression()

In [41]:
lin_model.fit(X_train, y_train)

LinearRegression()

In [44]:
#predict
lin_preds=lin_model.predict(X_test)

In [45]:
lin_preds[:10]

array([342.13895449, 318.73568409, 284.22016272, 304.05401373,
       326.66082377, 450.09405066, 344.13958473, 332.74136051,
       296.3177147 , 326.12841455])

In [42]:
lin_preds=lin_model.predict(X_test)

In [43]:
#Mean Absolute Error
mean_absolute_error(y_test,lin_preds)

103.68817769867371

In [46]:
mean_squared_error(lin_preds, y_test)

19799.502158192518

In [48]:
r2_score(y_test, lin_preds)

0.03923201023030676

In [49]:
from sklearn.tree import DecisionTreeRegressor

In [50]:
tree=DecisionTreeRegressor(random_state=45)

In [55]:
tree_model=LinearRegression()

In [59]:
tree.fit(X_train, y_train)

DecisionTreeRegressor(random_state=45)

In [62]:
tree_preds=tree.predict(X_test)

In [63]:
mean_absolute_error(y_test, tree_preds)

143.87413978494624

In [64]:
mean_squared_error(tree_preds, y_test)

36382.32396935483

In [65]:
r2_score(y_test, tree_preds)

-0.7654470291175131

In [66]:
r2_score(tree_preds, y_test)

-0.14247558225913415

In [67]:
pd.get_dummies(tips,columns=['day'])

Unnamed: 0,total_bill,tip,gender,smoker,time,size,day_0,day_1,day_2,day_3,day_4,day_5,day_6
0,2125.50,360.79,1,0,1,1,0,0,0,0,1,0,0
1,2727.18,259.42,0,0,0,5,0,0,0,1,0,0,0
2,1066.02,274.68,0,1,0,4,0,0,0,0,1,0,0
3,3493.45,337.90,0,0,0,1,0,0,0,1,0,0,0
4,3470.56,567.89,1,1,1,6,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,3164.27,645.28,1,0,0,3,0,0,1,0,0,0,0
740,2962.62,218.00,0,1,0,2,0,0,1,0,0,0,0
741,2471.03,218.00,1,1,0,2,0,0,1,0,0,0,0
742,1942.38,190.75,1,0,0,2,0,0,1,0,0,0,0
