# Support Vector Regression on Tips Dataset

In [14]:
import seaborn as sns
df = sns.load_dataset('tips')

In [15]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [17]:
df['sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [18]:
df['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [19]:
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [20]:
df['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [21]:
## Feature Encoding (Label Encoding and One Hot Encoding)
## Independent and Dependent Features
X = df.drop('tip', axis=1)
y = df['tip']


In [22]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)


In [23]:
X_train

Unnamed: 0,total_bill,sex,smoker,day,time,size
58,11.24,Male,Yes,Sat,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
68,20.23,Male,No,Sat,Dinner,2
184,40.55,Male,Yes,Sun,Dinner,2
...,...,...,...,...,...,...
64,17.59,Male,No,Sat,Dinner,3
15,21.58,Male,No,Sun,Dinner,2
228,13.28,Male,No,Sat,Dinner,2
125,29.80,Female,No,Thur,Lunch,6


In [24]:
from sklearn.preprocessing import LabelEncoder
le1 = LabelEncoder()
le2 = LabelEncoder()
le3 = LabelEncoder()
X_train['sex'] = le1.fit_transform(X_train['sex'])
X_train['smoker'] = le2.fit_transform(X_train['smoker'])
X_train['time'] = le3.fit_transform(X_train['time'])

In [25]:
X_train

Unnamed: 0,total_bill,sex,smoker,day,time,size
58,11.24,1,1,Sat,0,2
1,10.34,1,0,Sun,0,3
2,21.01,1,0,Sun,0,3
68,20.23,1,0,Sat,0,2
184,40.55,1,1,Sun,0,2
...,...,...,...,...,...,...
64,17.59,1,0,Sat,0,3
15,21.58,1,0,Sun,0,2
228,13.28,1,0,Sat,0,2
125,29.80,0,0,Thur,1,6


In [26]:
X_test['sex'] = le1.transform(X_test['sex'])
X_test['smoker'] = le2.transform(X_test['smoker'])
X_test['time'] = le3.transform(X_test['time'])
X_test

Unnamed: 0,total_bill,sex,smoker,day,time,size
162,16.21,0,0,Sun,0,3
60,20.29,1,1,Sat,0,2
61,13.81,1,1,Sat,0,2
63,18.29,1,1,Sat,0,4
69,15.01,1,1,Sat,0,2
...,...,...,...,...,...,...
201,12.74,0,1,Thur,1,2
149,7.51,1,0,Thur,1,2
175,32.90,1,1,Sun,0,2
226,10.09,0,1,Fri,1,2


In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [30]:
ct = ColumnTransformer(transformers=[('onehot', OneHotEncoder(drop='first'), [3])],
                        remainder='passthrough')

In [31]:
ct.fit_transform(X_train)

array([[1., 0., 0., ..., 1., 0., 2.],
       [0., 1., 0., ..., 0., 0., 3.],
       [0., 1., 0., ..., 0., 0., 3.],
       ...,
       [1., 0., 0., ..., 0., 0., 2.],
       [0., 0., 1., ..., 0., 1., 6.],
       [0., 1., 0., ..., 0., 0., 2.]], shape=(183, 8))

In [37]:
import numpy  as np
import sys
np.set_printoptions(threshold=sys.maxsize)
X_train = ct.fit_transform(X_train)

In [33]:
X_test = ct.transform(X_test)
X_test

array([[ 0.  ,  1.  ,  0.  , 16.21,  0.  ,  0.  ,  0.  ,  3.  ],
       [ 1.  ,  0.  ,  0.  , 20.29,  1.  ,  1.  ,  0.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  , 13.81,  1.  ,  1.  ,  0.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  , 18.29,  1.  ,  1.  ,  0.  ,  4.  ],
       [ 1.  ,  0.  ,  0.  , 15.01,  1.  ,  1.  ,  0.  ,  2.  ],
       [ 0.  ,  0.  ,  1.  , 32.68,  1.  ,  1.  ,  1.  ,  2.  ],
       [ 0.  ,  1.  ,  0.  , 19.49,  1.  ,  0.  ,  0.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  , 25.89,  1.  ,  1.  ,  0.  ,  4.  ],
       [ 0.  ,  1.  ,  0.  , 22.23,  1.  ,  0.  ,  0.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  , 24.06,  1.  ,  0.  ,  0.  ,  3.  ],
       [ 0.  ,  1.  ,  0.  , 23.33,  1.  ,  1.  ,  0.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  , 44.3 ,  0.  ,  1.  ,  0.  ,  3.  ],
       [ 0.  ,  0.  ,  1.  ,  7.56,  1.  ,  0.  ,  1.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  , 30.14,  0.  ,  1.  ,  0.  ,  4.  ],
       [ 0.  ,  1.  ,  0.  , 16.49,  1.  ,  0.  ,  0.  ,  4.  ],
       [ 0.  ,  0.  ,  1.

In [36]:
print(X_test, y_test)
print(X_train, y_train)

[[ 0.    1.    0.   16.21  0.    0.    0.    3.  ]
 [ 1.    0.    0.   20.29  1.    1.    0.    2.  ]
 [ 1.    0.    0.   13.81  1.    1.    0.    2.  ]
 [ 1.    0.    0.   18.29  1.    1.    0.    4.  ]
 [ 1.    0.    0.   15.01  1.    1.    0.    2.  ]
 [ 0.    0.    1.   32.68  1.    1.    1.    2.  ]
 [ 0.    1.    0.   19.49  1.    0.    0.    2.  ]
 [ 1.    0.    0.   25.89  1.    1.    0.    4.  ]
 [ 0.    1.    0.   22.23  1.    0.    0.    2.  ]
 [ 1.    0.    0.   24.06  1.    0.    0.    3.  ]
 [ 0.    1.    0.   23.33  1.    1.    0.    2.  ]
 [ 1.    0.    0.   44.3   0.    1.    0.    3.  ]
 [ 0.    0.    1.    7.56  1.    0.    1.    2.  ]
 [ 1.    0.    0.   30.14  0.    1.    0.    4.  ]
 [ 0.    1.    0.   16.49  1.    0.    0.    4.  ]
 [ 0.    0.    1.   18.64  0.    0.    1.    3.  ]
 [ 0.    0.    1.   14.52  0.    0.    1.    2.  ]
 [ 0.    0.    1.   13.42  0.    0.    1.    2.  ]
 [ 0.    1.    0.   20.9   0.    1.    0.    3.  ]
 [ 1.    0.    0.   15.81  1.  

In [38]:
from sklearn.svm import SVR
svr = SVR()
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)


In [39]:
print(y_pred)

[2.64024205 2.96409085 2.16921082 2.82939701 2.32245173 4.09540938
 2.9455181  3.65784443 3.24844354 3.43965137 3.33922705 4.88722627
 1.47092538 4.06674454 2.66206126 2.93622407 2.35341125 2.20762245
 3.21059039 2.42435685 1.78316338 2.1573245  2.10900518 2.00612787
 3.25230364 1.81571464 2.68721939 3.61798094 1.92932064 5.04802312
 4.0754218  3.83653595 1.61615267 2.07780844 2.92178066 2.11009095
 3.07237994 1.99679259 2.98632675 4.58402636 2.9131762  4.90184667
 1.91801328 3.92631374 2.1573588  1.93784329 3.25296257 2.226638
 1.70093106 3.25191872 2.69297237 4.25985546 2.65060541 3.68149848
 1.4545632  3.08058649 2.12381557 1.46630072 4.0960782  1.79921963
 2.77078057]


In [40]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("R2 Score:", r2)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)


R2 Score: 0.4906520205317081
Mean Absolute Error: 0.7032136104409814
Mean Squared Error: 0.9053245437514419
Root Mean Squared Error: 0.9514854406408129


In [41]:
## Hyperparameter Tuning using GridSearchCV
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}
grid = GridSearchCV(SVR(), param_grid, refit=True, verbose=3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.043 total time=   0.0s
[CV 2/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.003 total time=   0.0s
[CV 3/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.145 total time=   0.0s
[CV 4/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.064 total time=   0.0s
[CV 5/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.073 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.154 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.393 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.517 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.291 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.341 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.190 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf

0,1,2
,estimator,SVR()
,param_grid,"{'C': [0.1, 1, ...], 'gamma': [1, 0.1, ...], 'kernel': ['rbf', 'linear']}"
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,kernel,'rbf'
,degree,3
,gamma,0.01
,coef0,0.0
,tol,0.001
,C,10
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [42]:
grid.best_params_

{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}

In [43]:
grid.predict(X_test)
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
r2 = r2_score(y_test, grid.predict(X_test))
mae = mean_absolute_error(y_test, grid.predict(X_test))
mse = mean_squared_error(y_test, grid.predict(X_test))
rmse = np.sqrt(mse)
print("R2 Score:", r2)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)


R2 Score: 0.31575739864936225
Mean Absolute Error: 0.792356118540305
Mean Squared Error: 1.216185487826459
Root Mean Squared Error: 1.1028080013431436
