In [75]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler

In [76]:
# Load seaborn's "tips" example dataset into a DataFrame and display it.
tips= sns.load_dataset("tips")
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [77]:
# Summarise column dtypes and non-null counts before building features.
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [78]:
# Regression target: the tip amount.
y = tips["tip"]

# Feature matrix: every other column, with the categorical columns expanded
# into indicator columns (the first level of each is dropped).
categorical_columns = ["sex", "smoker", "day", "time"]
feature_frame = tips.drop(columns="tip")
x = pd.get_dummies(feature_frame, columns=categorical_columns, drop_first=True)

In [79]:
# Inspect the encoded feature matrix.
x

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,True,True,False,False,True,True
1,10.34,3,False,True,False,False,True,True
2,21.01,3,False,True,False,False,True,True
3,23.68,2,False,True,False,False,True,True
4,24.59,4,True,True,False,False,True,True
...,...,...,...,...,...,...,...,...
239,29.03,3,False,True,False,True,False,True
240,27.18,2,True,False,False,True,False,True
241,22.67,2,False,False,False,True,False,True
242,17.82,2,False,True,False,True,False,True


In [80]:
# Inspect the target series.
y

0      1.01
1      1.66
2      3.50
3      3.31
4      3.61
       ... 
239    5.92
240    2.00
241    2.00
242    1.75
243    3.00
Name: tip, Length: 244, dtype: float64

In [81]:
# Hold out 30% of the rows for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [82]:
# Print the shape of each split, one per line, as "<name>.shape= <shape>".
named_splits = (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
)
for split_name, split in named_splits:
    print(f"{split_name}.shape= {split.shape!r}")

X_train.shape= (170, 8)
y_train.shape= (170,)
X_test.shape= (74, 8)
y_test.shape= (74,)


In [83]:
# Standardise the features. The scaler is fit on the training split only and
# the same fitted scaler is then applied to the test split.
# (StandardScaler is imported in the notebook's import cell.)
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Mean and standard deviation taken over all entries of each scaled array
# (not per column).
scaled_X_train.mean(), scaled_X_train.std(), scaled_X_test.mean(), scaled_X_test.std()

(-1.6898247507162308e-17, 1.0, -0.023739715688131217, 0.9759307323187313)

In [84]:
# Degree-2 polynomial expansion of the scaled features, without a bias column.
# (PolynomialFeatures is imported in the notebook's import cell.)
model_polynomial = PolynomialFeatures(2, include_bias=False)

# fit_transform() in one step ...
X_train_direct = model_polynomial.fit_transform(scaled_X_train)

# ... versus transform() with the already-fitted expander.
X_train_poly = model_polynomial.transform(scaled_X_train)
X_test_poly = model_polynomial.transform(scaled_X_test)

# Sanity check that both routes give the same training matrix. The bare
# np.allclose(...) expression used before was not the cell's last expression,
# so its result was silently discarded; asserting makes the check effective.
assert np.allclose(X_train_poly, X_train_direct), (
    "transform() after fit_transform() produced a different training matrix"
)
X_train_poly.shape

(170, 44)

In [92]:
# Re-check shapes and overall mean/std of the scaled arrays (statistics are
# taken over all entries, not per column).
scaled_X_train.shape, scaled_X_train.mean(), scaled_X_test.mean(), scaled_X_test.shape, scaled_X_test.std()

((170, 8),
 -1.6898247507162308e-17,
 -0.023739715688131217,
 (74, 8),
 0.9759307323187313)

In [100]:
# Sweep polynomial degrees 1-4: expand the scaled features, fit a linear
# regression, and record the train/test RMSE for each degree.
# (LinearRegression is already imported in the notebook's import cell.)
model_linear_regression = LinearRegression()

# RMSE per degree, in loop order (index 0 -> degree 1). The "RAMSE" spelling
# is kept so any other cell that refers to these names keeps working.
test_RAMSE = []
train_RAMSE = []
for degree in range(1, 5):
    poly = PolynomialFeatures(degree=degree, include_bias=False)

    # Fit the expansion on the training data only, then reuse it for the test data.
    poly_X_train = poly.fit_transform(scaled_X_train)
    poly_X_test = poly.transform(scaled_X_test)

    # Show how many columns this degree produces.
    print(poly_X_test.shape)

    # The same estimator object is refit on every iteration, so after the loop
    # it (and y_train_pred / y_test_pred) correspond to the last degree.
    model_linear_regression.fit(poly_X_train, y_train)

    y_train_pred = model_linear_regression.predict(poly_X_train)
    y_test_pred = model_linear_regression.predict(poly_X_test)

    train_RAMSE.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
    test_RAMSE.append(np.sqrt(mean_squared_error(y_test, y_test_pred)))

# NOTE(review): in the recorded run the test RMSE is ~1e10 for degrees 3 and 4
# while the train RMSE keeps falling. Those expansions have 164 and 494 columns
# against 170 training rows, so the least-squares fit is presumably
# ill-conditioned / over-parameterised. Consider a regularised model before
# drawing conclusions from those degrees. TODO confirm.
test_RAMSE, train_RAMSE

(74, 8)
(74, 44)
(74, 164)
(74, 494)


([0.9653146231105687,
  1.043670419960901,
  17328643781.897804,
  79658267261.11716],
 [1.0410512768283988,
  0.8993034960395306,
  0.6751493642301042,
  0.440620947920175])

In [95]:
# Inspect the held-out target values for comparison with the predictions.
y_test

24     3.18
6      2.00
153    2.00
211    5.16
198    2.00
       ... 
165    3.48
154    2.00
216    3.00
79     2.71
29     3.00
Name: tip, Length: 74, dtype: float64

In [96]:
# Inspect the test-set predictions. y_test_pred is assigned inside the
# degree-sweep loop, so on a top-to-bottom run it holds the predictions from
# the last loop iteration.
# NOTE(review): this cell's execution count is lower than the loop cell's, so
# the recorded output presumably comes from an earlier run of that loop.
y_test_pred

array([ 4.04290771e+00,  9.67803955e-01,  4.12097168e+00,  1.95924377e+00,
        1.92887878e+00,  4.94355774e+00,  3.21017456e+00,  1.89843750e+00,
        2.41310120e+00, -2.42214971e+08,  2.59808350e+00,  1.94151306e+00,
        1.66925049e+00, -5.15747070e-01, -1.79854448e+11,  6.83096964e+10,
        2.68554665e+10,  3.18315125e+00,  2.62893677e+00, -4.53716684e+10,
       -1.27089091e+10,  3.41975403e+00,  1.96656799e+00,  1.72583008e+00,
        2.81010437e+00,  2.25605774e+00,  1.60176086e+00,  3.02885437e+00,
        1.82884216e+00,  3.62661047e+11,  1.29725189e+01,  9.80606079e-01,
        2.79547119e+00,  3.75022888e+00,  2.72189331e+00,  1.17591356e+11,
        5.62092853e+09, -2.06941067e+11,  1.98989868e+00,  3.98416138e+00,
        1.89152527e+00,  2.18542480e+00,  4.28340149e+00, -7.47810364e+00,
        9.49249268e-02,  2.89371689e+11,  2.11524963e+00,  2.79174805e+00,
        1.60386658e+00,  2.50137329e+00,  3.97438049e+00, -1.46445320e+09,
        5.62356567e+00,  