In [176]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [177]:
data = pd.read_csv('https://raw.githubusercontent.com/amankharwal/Website-data/master/tips.csv')
print(data.head())

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [178]:
figure = px.scatter(data_frame = data, x = 'total_bill', y = 'tip', color = 'day', size = 'size')
figure.show()

In [179]:
figure = px.scatter(data_frame = data, x = 'total_bill', y = 'tip', size='size', color = 'time')
figure.show()

In [180]:
figure = px.pie(data, values='tip', names='day')
figure.show()

In [181]:
figure = px.pie(data, names='sex')
figure.show(0)

In [182]:
figure=px.pie(data, names='smoker')
figure.show()

In [183]:
figure=px.pie(data, names='time')
figure.show()

In [184]:
data["sex"] = data["sex"].map({"Female": 0, "Male": 1})
data["smoker"] = data["smoker"].map({"No": 0, "Yes": 1})
data["day"] = data["day"].map({"Thur": 0, "Fri": 1, "Sat": 2, "Sun": 3})
data['time']=data['time'].map({'Lunch':0, 'Dinner':1})
print(data.head())

   total_bill   tip  sex  smoker  day  time  size
0       16.99  1.01    0       0    3     1     2
1       10.34  1.66    1       0    3     1     3
2       21.01  3.50    1       0    3     1     3
3       23.68  3.31    1       0    3     1     2
4       24.59  3.61    0       0    3     1     4


In [185]:
X=np.array(data[["total_bill", "sex", "smoker", "day", "time", "size"]])
Y=np.array(data[['tip']])
Xtrain, Xtest, Ytrain, Ytest=train_test_split(X, Y, test_size=0.2, random_state=42)

In [186]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(Xtrain, Ytrain)
predictions = model.predict(Xtest)


In [187]:
print(r2_score(Ytest, predictions))
print(mean_squared_error(Ytest, predictions))
print(mean_absolute_error(Ytest, predictions))

0.4429399687489898
0.6963090766605349
0.6685728160722872


In [190]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=5)
result = cross_val_score(model, Xtrain, Ytrain, cv=kfold, scoring='r2')
print(result.mean())

0.31812596918902464


In [191]:
from sklearn.model_selection import GridSearchCV
param_grid = {'fit_intercept':[True, False], 'normalize':[True, False]}
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring = 'r2', cv=kfold)
grid.fit(Xtrain, Ytrain)


'normalize' was deprecated in version 1.0 and will be removed in 1.2.
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




'normalize' was deprecated in version 1.0 and will be removed in 1.2.
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwa

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LinearRegression(),
             param_grid={'fit_intercept': [True, False],
                         'normalize': [True, False]},
             scoring='r2')

In [192]:
results=pd.DataFrame.from_dict(grid.cv_results_)
results



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_fit_intercept,param_normalize,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002411,0.00048,0.0004,0.00049,True,True,"{'fit_intercept': True, 'normalize': True}",-0.118341,0.465241,0.570983,0.551783,0.120964,0.318126,0.271865,1
1,0.001806,0.000403,0.000799,0.000399,True,False,"{'fit_intercept': True, 'normalize': False}",-0.118341,0.465241,0.570983,0.551783,0.120964,0.318126,0.271865,1
2,0.001797,0.000749,0.000605,0.000494,False,True,"{'fit_intercept': False, 'normalize': True}",-0.2155,0.498407,0.594571,0.541215,0.071577,0.298054,0.316929,3
3,0.001397,0.000492,0.000604,0.000493,False,False,"{'fit_intercept': False, 'normalize': False}",-0.2155,0.498407,0.594571,0.541215,0.071577,0.298054,0.316929,3


In [193]:
print('Best: %f using %s' % (grid.best_score_, grid.best_params_))



Best: 0.318126 using {'fit_intercept': True, 'normalize': True}


In [194]:
pred=grid.predict(Xtest)
print(mean_absolute_error(Ytest, pred))
print(r2_score(Ytest, pred))

0.6685728160722874
0.44293996874898955
