In [1]:
import seaborn as sns

df = sns.load_dataset('tips')

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold
from sklearn.svm import SVR
from sklearn.metrics import r2_score,mean_absolute_error

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [5]:
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [6]:
for col in df.columns:
    print(df[col].value_counts())

total_bill
13.42    3
21.01    2
20.69    2
10.33    2
10.34    2
        ..
29.03    1
27.18    1
22.67    1
17.82    1
18.78    1
Name: count, Length: 229, dtype: int64
tip
2.00    33
3.00    23
4.00    12
2.50    10
5.00    10
        ..
1.47     1
1.17     1
4.67     1
5.92     1
1.75     1
Name: count, Length: 123, dtype: int64
sex
Male      157
Female     87
Name: count, dtype: int64
smoker
No     151
Yes     93
Name: count, dtype: int64
day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64
time
Dinner    176
Lunch      68
Name: count, dtype: int64
size
2    156
3     38
4     37
5      5
1      4
6      4
Name: count, dtype: int64


### Feature Encoding
 * label encodeing - smoker,time,sex
 * one hot encoding - day

In [7]:
#Label-Encoding
df['sex'] = LabelEncoder().fit_transform(df['sex'])
df['smoker'] = LabelEncoder().fit_transform(df['smoker'])
df['time'] = LabelEncoder().fit_transform(df['time'])

In [8]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,Sun,0,2
1,10.34,1.66,1,0,Sun,0,3
2,21.01,3.50,1,0,Sun,0,3
3,23.68,3.31,1,0,Sun,0,2
4,24.59,3.61,0,0,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,1,0,Sat,0,3
240,27.18,2.00,0,1,Sat,0,2
241,22.67,2.00,1,1,Sat,0,2
242,17.82,1.75,1,0,Sat,0,2


In [9]:
df['sex'].value_counts()

sex
1    157
0     87
Name: count, dtype: int64

In [10]:
df['time'].value_counts()

time
0    176
1     68
Name: count, dtype: int64

In [11]:
df['smoker'].value_counts()

smoker
0    151
1     93
Name: count, dtype: int64

### One-Hot Encoding

In [12]:
encoder = OneHotEncoder()
day_onehot_encode = encoder.fit_transform(df[['day']]).toarray()
day_onehot_encode

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],


In [13]:
df_day = pd.DataFrame(day_onehot_encode,columns=encoder.get_feature_names_out(['day']))
df_day

Unnamed: 0,day_Fri,day_Sat,day_Sun,day_Thur
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
239,0.0,1.0,0.0,0.0
240,0.0,1.0,0.0,0.0
241,0.0,1.0,0.0,0.0
242,0.0,1.0,0.0,0.0


In [14]:
df = pd.concat([df,df_day],axis=1)
df = df.drop('day',axis=1)
df

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_Fri,day_Sat,day_Sun,day_Thur
0,16.99,1.01,0,0,0,2,0.0,0.0,1.0,0.0
1,10.34,1.66,1,0,0,3,0.0,0.0,1.0,0.0
2,21.01,3.50,1,0,0,3,0.0,0.0,1.0,0.0
3,23.68,3.31,1,0,0,2,0.0,0.0,1.0,0.0
4,24.59,3.61,0,0,0,4,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,1,0,0,3,0.0,1.0,0.0,0.0
240,27.18,2.00,0,1,0,2,0.0,1.0,0.0,0.0
241,22.67,2.00,1,1,0,2,0.0,1.0,0.0,0.0
242,17.82,1.75,1,0,0,2,0.0,1.0,0.0,0.0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    int64  
 3   smoker      244 non-null    int64  
 4   time        244 non-null    int64  
 5   size        244 non-null    int64  
 6   day_Fri     244 non-null    float64
 7   day_Sat     244 non-null    float64
 8   day_Sun     244 non-null    float64
 9   day_Thur    244 non-null    float64
dtypes: float64(6), int64(4)
memory usage: 19.2 KB


In [16]:
X = df.drop('tip',axis=1)
Y = df['tip']

In [17]:
print(X.columns)

Index(['total_bill', 'sex', 'smoker', 'time', 'size', 'day_Fri', 'day_Sat',
       'day_Sun', 'day_Thur'],
      dtype='object')


In [18]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(195, 9) (49, 9) (195,) (49,)


In [19]:
svr = SVR()
svr.fit(x_train,y_train)
y_pred = svr.predict(x_test)
print("mean Absolute error : ",mean_absolute_error(y_test,y_pred))
print("R2 score : ",r2_score(y_test,y_pred))

mean Absolute error :  0.5515386357157324
R2 score :  0.39442396856456485


In [20]:
params = {'C':[0.001,0.01,0.1,1.0,10,20,3500,1000],
          'gamma':[0.001,0.01,0.1,1.0,10,2],
          'kernel':['rbf','poly','linear'],
          'degree':[2,3,4]}

In [21]:
grid_svr = GridSearchCV(estimator=svr,param_grid=params,cv=5,verbose=3,refit=True,)
grid_svr

In [22]:
grid_svr.fit(x_train,y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[CV 1/5] END C=0.001, degree=2, gamma=0.001, kernel=rbf;, score=-0.054 total time=   0.0s
[CV 2/5] END C=0.001, degree=2, gamma=0.001, kernel=rbf;, score=-0.036 total time=   0.0s
[CV 3/5] END C=0.001, degree=2, gamma=0.001, kernel=rbf;, score=-0.034 total time=   0.0s
[CV 4/5] END C=0.001, degree=2, gamma=0.001, kernel=rbf;, score=0.009 total time=   0.0s
[CV 5/5] END C=0.001, degree=2, gamma=0.001, kernel=rbf;, score=0.006 total time=   0.0s
[CV 1/5] END C=0.001, degree=2, gamma=0.001, kernel=poly;, score=-0.046 total time=   0.0s
[CV 2/5] END C=0.001, degree=2, gamma=0.001, kernel=poly;, score=-0.030 total time=   0.0s
[CV 3/5] END C=0.001, degree=2, gamma=0.001, kernel=poly;, score=-0.033 total time=   0.0s
[CV 4/5] END C=0.001, degree=2, gamma=0.001, kernel=poly;, score=0.013 total time=   0.0s
[CV 5/5] END C=0.001, degree=2, gamma=0.001, kernel=poly;, score=0.008 total time=   0.0s
[CV 1/5] END C=0.001, degree=2, gam

KeyboardInterrupt: 

In [None]:
y_train.dtypes

dtype('float64')