# SVM (Regression) or SVR Implementation

In [84]:
## Load seaborn's built-in "tips" dataset into a DataFrame
import seaborn as sns
df = sns.load_dataset('tips')

## Preview the first five rows
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [85]:
## Column dtypes and non-null counts, to spot categorical columns and missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


This shows that `sex`, `smoker`, `day`, and `time` are categorical columns, so they must be encoded numerically before they can be used by the model.

In [86]:
## Category frequencies of the `sex` column
df['sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [87]:
## Category frequencies of the `smoker` column
df['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [88]:
## Category frequencies of the `day` column (four distinct values)
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [89]:
## NOTE(review): this repeats the previous cell exactly; the remaining
## categorical column `time` is never inspected, so `df['time']` was probably intended.
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [90]:
## List the column names before choosing features and target
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

Next we encode the categorical features: label encoding for the binary columns (`sex`, `smoker`, `time`) and one-hot encoding for the multi-category column (`day`).

In [91]:
## Features: every column except `total_bill`; target: `total_bill`
X = df[['tip', 'sex', 'smoker', 'day', 'time', 'size']]
y = df['total_bill']

In [92]:
## performing train test split
from sklearn.model_selection import train_test_split

## Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [93]:
## Preview the training features (categorical columns are still raw labels here)
X_train.head()

Unnamed: 0,tip,sex,smoker,day,time,size
115,3.5,Female,No,Sun,Dinner,2
181,5.65,Male,Yes,Sun,Dinner,2
225,2.5,Female,Yes,Fri,Lunch,2
68,2.01,Male,No,Sat,Dinner,2
104,4.08,Female,No,Sat,Dinner,2


In [94]:
from sklearn.preprocessing import LabelEncoder

In [95]:
## One independent encoder per binary column (sex, smoker, time), so each
## keeps its own fitted label-to-integer mapping
le1, le2, le3 = (LabelEncoder() for _ in range(3))

In [96]:
## Label-encode the binary categorical columns, fitting each encoder on the
## training split only. `assign` returns a new frame instead of writing into
## the frame handed back by train_test_split (which pandas may treat as a
## copy of a slice), so no SettingWithCopyWarning is raised.
## NOTE(review): scikit-learn documents LabelEncoder for target labels;
## OrdinalEncoder is the documented equivalent for feature columns.
X_train = X_train.assign(
    sex=le1.fit_transform(X_train['sex']),
    smoker=le2.fit_transform(X_train['smoker']),
    time=le3.fit_transform(X_train['time']),
)

In [97]:
## Confirm `sex` now holds integer codes instead of string labels
X_train['sex'].value_counts()

sex
1    116
0     67
Name: count, dtype: int64

In [98]:
## Apply the encoders fitted on the training split to the test split
## (transform only, never refit, so both splits share the same mapping).
## `assign` returns a new frame instead of writing into the frame handed back
## by train_test_split, so no SettingWithCopyWarning is raised.
X_test = X_test.assign(
    sex=le1.transform(X_test['sex']),
    smoker=le2.transform(X_test['smoker']),
    time=le3.transform(X_test['time']),
)

In [99]:
## Preview the test features after label encoding (`day` is still a raw label)
X_test.head()

Unnamed: 0,tip,sex,smoker,day,time,size
24,3.18,1,0,Sat,0,2
6,2.0,1,0,Sun,0,2
153,2.0,1,0,Sun,0,4
211,5.16,1,1,Sat,0,4
198,2.0,0,1,Thur,1,2


The `day` column is still not encoded. Because it has more than two categories, we will one-hot encode it.

In [100]:
## One-hot encoding using ColumnTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [101]:
## One-hot encode `day`, selected by name rather than by position so the step
## does not depend on column order and fails loudly if it is ever given
## something other than the feature DataFrame. drop='first' omits one
## indicator column; remainder='passthrough' keeps every other column unchanged.
ct = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(drop='first'), ['day'])],
    remainder='passthrough',
)

In [102]:
## Make NumPy print arrays in full instead of summarising large ones.
## NOTE(review): this is a global setting and makes the next cell dump every row;
## slicing the array (e.g. the first few rows) would keep the output readable.
import sys
import numpy as np

np.set_printoptions(threshold=sys.maxsize)
## Fit the column transformer on the training features and encode them.
## NOTE(review): this rebinds X_train to the transformed output, so re-running
## this cell feeds already-encoded data back into `ct`.
X_train=ct.fit_transform(X_train)

In [103]:
## Display the encoded training matrix (one-hot `day` columns first, then the passthrough columns)
X_train

array([[ 0.  ,  1.  ,  0.  ,  3.5 ,  0.  ,  0.  ,  0.  ,  2.  ],
       [ 0.  ,  1.  ,  0.  ,  5.65,  1.  ,  1.  ,  0.  ,  2.  ],
       [ 0.  ,  0.  ,  0.  ,  2.5 ,  0.  ,  1.  ,  1.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  ,  2.01,  1.  ,  0.  ,  0.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  ,  4.08,  0.  ,  0.  ,  0.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  ,  2.09,  1.  ,  1.  ,  0.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  1.  ],
       [ 1.  ,  0.  ,  0.  ,  3.41,  1.  ,  1.  ,  0.  ,  3.  ],
       [ 0.  ,  1.  ,  0.  ,  3.48,  1.  ,  0.  ,  0.  ,  3.  ],
       [ 0.  ,  1.  ,  0.  ,  2.  ,  1.  ,  0.  ,  0.  ,  4.  ],
       [ 1.  ,  0.  ,  0.  ,  3.  ,  1.  ,  1.  ,  0.  ,  5.  ],
       [ 0.  ,  0.  ,  1.  ,  2.71,  1.  ,  0.  ,  1.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  ,  3.  ,  0.  ,  0.  ,  0.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  ,  3.  ,  1.  ,  1.  ,  0.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  ,  3.  ,  1.  ,  0.  ,  0.  ,  4.  ],
       [ 0.  ,  1.  ,  0.

In [104]:
X_test=ct.transform(X_test) ## transform only: `ct` was already fitted on the training features

The encoding is now complete, so we can move on to implementing Support Vector Regression.

In [105]:
## Import SVR and create a baseline model with scikit-learn's default hyperparameters.
## NOTE(review): the features are not scaled anywhere above; scikit-learn's SVM
## guide recommends scaling because SVMs are not scale invariant — consider a StandardScaler step.
from sklearn.svm import SVR
svr = SVR()

In [107]:
## Fit the baseline SVR on the encoded training features and the `total_bill` target
svr.fit(X_train, y_train)

In [108]:
## Predict `total_bill` for the held-out test features with the baseline model
y_pred=svr.predict(X_test)

In [109]:
## Performance metrics of the baseline model on the test split
from sklearn.metrics import r2_score, mean_absolute_error
print("R2 Score:", r2_score(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))

R2 Score: 0.49798620106004743
Mean Absolute Error: 4.463296539661224


### Hyperparameter Tuning using Grid Search CV

In [None]:
from sklearn.model_selection import GridSearchCV
## defining the parameter range
## Candidate hyperparameter values: 2 x 2 x 2 = 8 combinations.
## NOTE(review): `gamma` is not used by the 'linear' kernel, so the linear
## candidates are fitted once per gamma value with the same result.
param_grid = dict(
    C=[1, 10],
    gamma=[0.1, 0.01],
    kernel=['rbf', 'linear'],
)

In [116]:
## 5-fold cross-validated search over `param_grid` on all available cores;
## refit=True retrains the best configuration on the full training data
grid = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=3,
)

In [118]:
## Run the cross-validated grid search on the training data
grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [119]:
## Predict on the test set with the fitted grid search.
## Note: this overwrites the `y_pred` produced earlier by the baseline `svr`.
y_pred = grid.predict(X_test)