In [69]:
import pandas as pd 
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [70]:
# Load the "tips" example dataset through seaborn and display the resulting DataFrame.
tips= sns.load_dataset("tips")
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [71]:
# Summarize column dtypes and non-null counts to check for missing values.
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [72]:
# Target: the tip amount.
y = tips["tip"]

# Features: every remaining column, with the four categorical columns
# one-hot encoded. drop_first=True omits the first level of each category.
x = (
    tips
    .drop(columns="tip")
    .pipe(
        pd.get_dummies,
        columns=["sex", "smoker", "day", "time"],
        drop_first=True,
    )
)
x

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,True,True,False,False,True,True
1,10.34,3,False,True,False,False,True,True
2,21.01,3,False,True,False,False,True,True
3,23.68,2,False,True,False,False,True,True
4,24.59,4,True,True,False,False,True,True
...,...,...,...,...,...,...,...,...
239,29.03,3,False,True,False,True,False,True
240,27.18,2,True,False,False,True,False,True
241,22.67,2,False,False,False,True,False,True
242,17.82,2,False,True,False,True,False,True


In [73]:
# Keep only the three predictors used by the models below.
x = x.loc[:, ["total_bill", "size", "sex_Female"]]

In [74]:
# Display the target Series (the "tip" column selected from `tips`).
y

0      1.01
1      1.66
2      3.50
3      3.31
4      3.61
       ... 
239    5.92
240    2.00
241    2.00
242    1.75
243    3.00
Name: tip, Length: 244, dtype: float64

In [75]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [76]:
# Report the shape of each split, one per line.
print(
    f"{X_train.shape= }",
    f"{y_train.shape= }",
    f"{X_test.shape= }",
    f"{y_test.shape= }",
    sep="\n",
)

X_train.shape= (170, 3)
y_train.shape= (170,)
X_test.shape= (74, 3)
y_test.shape= (74,)


In [77]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Sanity check: overall (all-column) mean and std of each scaled array.
(
    scaled_X_train.mean(),
    scaled_X_train.std(),
    scaled_X_test.mean(),
    scaled_X_test.std(),
)

(-4.179663151530001e-17, 1.0, -0.09755846199564487, 0.9927004178768081)

In [78]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion without the constant (bias) column.
model_polynomial = PolynomialFeatures(degree=2, include_bias=False)

# Route 1: fit and transform the training data in a single call.
X_train_direct = model_polynomial.fit_transform(scaled_X_train)

# Route 2: reuse the already-fitted expander on both splits.
X_train_poly = model_polynomial.transform(scaled_X_train)
X_test_poly = model_polynomial.transform(scaled_X_test)

# Check that both routes give the same training matrix.
np.allclose(X_train_poly, X_train_direct)

True

In [79]:
# Re-check the scaled arrays: shapes, overall means, and the test-split std.
# The test mean is not exactly 0 because the scaler was fitted on the
# training split only and merely applied to the test split.
scaled_X_train.shape, scaled_X_train.mean(), scaled_X_test.mean(), scaled_X_test.shape, scaled_X_test.std()

((170, 3),
 -4.179663151530001e-17,
 -0.09755846199564487,
 (74, 3),
 0.9927004178768081)

In [80]:
from sklearn.linear_model import LinearRegression

model_linear_regression = LinearRegression()

# Root-mean-squared error per polynomial degree, in the order the degrees are
# tried ("RAMSE" in these names is the square root of the MSE).
test_RAMSE = []
train_RAMSE = []

for degree in range(1, 5):
    # Expand the scaled features; the expansion is fitted on the training split.
    poly = PolynomialFeatures(include_bias=False, degree=degree)
    poly_X_train = poly.fit_transform(scaled_X_train)
    poly_X_test = poly.transform(scaled_X_test)

    # Show how the number of feature columns grows with the degree.
    print(poly_X_test.shape)

    # The same estimator object is refit on every pass.
    model_linear_regression.fit(poly_X_train, y_train)

    # Training error for this degree.
    y_train_pred = model_linear_regression.predict(poly_X_train)
    train_RAMSE.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))

    # Test error for this degree.
    y_test_pred = model_linear_regression.predict(poly_X_test)
    test_RAMSE.append(np.sqrt(mean_squared_error(y_test, y_test_pred)))

test_RAMSE, train_RAMSE

(74, 3)
(74, 9)
(74, 19)
(74, 34)


([0.9213797244087817,
  0.9744434971568211,
  1.0004249372962883,
  1.1700068250294997],
 [1.0515135378066789,
  1.0369898861458895,
  0.9414087709958213,
  0.8595899716312863])

In [81]:
# Display the held-out target values produced by the train/test split.
y_test

24     3.18
6      2.00
153    2.00
211    5.16
198    2.00
       ... 
165    3.48
154    2.00
216    3.00
79     2.71
29     3.00
Name: tip, Length: 74, dtype: float64

In [82]:
# Display the test-set predictions left over from the last loop iteration
# (degree 4, the final value of range(1, 5)).
y_test_pred

array([ 3.38305325,  1.49440327,  3.89431402,  4.055189  ,  2.25800843,
        3.04941106,  3.67598888,  2.17035022,  2.40675463,  2.65688771,
        3.12442769,  1.6663007 ,  1.89132666,  2.3431185 , -0.92138803,
        3.67853405,  3.19437813,  3.61246168,  2.8156256 ,  3.90243005,
        1.29250512,  3.50608859,  2.48160713,  1.53710895,  4.40868205,
        1.80340191,  1.60517582,  2.69402053,  2.8239521 ,  2.51580389,
        3.0122045 ,  1.49693455,  2.94431651,  3.45205936,  3.05873132,
        2.8291206 ,  1.86085115,  5.08744933,  2.28528974,  3.71223684,
        1.63320884,  2.51689281,  3.78522468,  0.67007907,  1.61309358,
       -0.36351319,  1.88341577,  3.19096811,  1.50517204,  2.09546626,
        3.7959107 ,  3.75397054,  4.40992265,  2.66214139,  3.33148672,
        2.04611422,  1.52529904,  3.03601495,  2.94351661,  2.67969081,
        1.12293196,  2.92569753,  3.75590802,  2.79043876,  3.44354967,
        3.13872628,  2.45573952, -1.0386095 ,  3.98903961,  3.92