**Importing all the required libraries**

In [188]:
import pandas as pd
import numpy as np
import plotly.express as px
import sklearn

**Creating a DataFrame**

In [145]:
#Path to the tips dataset; raw string so the backslashes are taken literally
#NOTE(review): absolute, machine-specific location - adjust when running elsewhere
tips_csv_path = r"D:\Sql_Practice\Python_Material\Machine_learning_project_data\tips.csv"
df = pd.read_csv(tips_csv_path)

In [147]:
#Previewing the first 5 rows of the data
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,customer_per_table
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [146]:
#Renaming the column 'size' to 'customer_per_table'
#NOTE: the saved df.head() output above already shows the new name, i.e. this
#cell was executed before that preview; run it first on a fresh kernel
df = df.rename(columns={'size': 'customer_per_table'})

**Performing basic EDA (exploratory data analysis)**

In [148]:
#Summarising column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   total_bill          244 non-null    float64
 1   tip                 244 non-null    float64
 2   sex                 244 non-null    object 
 3   smoker              244 non-null    object 
 4   day                 244 non-null    object 
 5   time                244 non-null    object 
 6   customer_per_table  244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [149]:
#Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,total_bill,tip,customer_per_table
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [150]:
#Counting missing values per column
df.isna().sum()

total_bill            0
tip                   0
sex                   0
smoker                0
day                   0
time                  0
customer_per_table    0
dtype: int64

**Using visualizations to understand the dataset better**

In [151]:
#Tip against total bill, coloured by time of the meal, with an OLS trendline
fig = px.scatter(
    df,
    x="total_bill",
    y="tip",
    color="time",
    trendline="ols",
    title="tip v/s time",
)
fig.show()

In [152]:
#Tip against total bill, coloured by day of the week, with an OLS trendline
fig = px.scatter(
    df,
    x="total_bill",
    y="tip",
    color="day",
    trendline="ols",
    title="tip v/s day",
)
fig.show()

In [154]:
#Share of the total tip amount contributed by each day (donut chart)
fig = px.pie(
    df,
    names="day",
    values="tip",
    hole=0.6,
    title="tips during days",
)
fig.show()

In [137]:
#Share of the total tip amount by the value of the 'sex' column (donut chart)
fig = px.pie(
    df,
    names="sex",
    values="tip",
    hole=0.6,
    title="tips by gender",
)
fig.show()

In [155]:
#Share of the total tip amount by time of the meal (donut chart)
fig = px.pie(
    df,
    names="time",
    values="tip",
    hole=0.6,
    title="tips by time of meal",
)
fig.show()

**Converting categorical values to numerical values for model**

In [156]:
#Listing the distinct labels found in each categorical column
col_lst = ['sex', 'smoker', 'day', 'time']
for column_name in col_lst:
    distinct_labels = df[column_name].unique()
    print(distinct_labels)

['Female' 'Male']
['No' 'Yes']
['Sun' 'Sat' 'Thur' 'Fri']
['Dinner' 'Lunch']


In [159]:
#Converting the categorical labels into integer codes
#(sex, smoker and time become 0/1; day becomes 0-3)
category_codes = {
    'sex': {'Female': 0, 'Male': 1},
    'smoker': {'No': 0, 'Yes': 1},
    'day': {'Sun': 3, 'Sat': 2, 'Thur': 0, 'Fri': 1},
    'time': {'Lunch': 0, 'Dinner': 1},
}
for column, codes in category_codes.items():
    #Skip columns that are already numeric so re-running this cell is harmless
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    encoded = df[column].map(codes)
    #map() yields NaN for labels missing from the mapping; fail loudly instead
    #of passing unencoded values on to the model
    if encoded.isna().any():
        unknown = df.loc[encoded.isna(), column].unique().tolist()
        raise ValueError(f"Unmapped values in column '{column}': {unknown}")
    #Assign back to the frame: df[column].replace(..., inplace=True) acts on a
    #temporary Series and does not update df under pandas copy-on-write
    df[column] = encoded.astype('int64')

In [160]:
#Previewing the first rows after the categorical columns were encoded
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,customer_per_table
0,16.99,1.01,0,0,3,1,2
1,10.34,1.66,1,0,3,1,3
2,21.01,3.5,1,0,3,1,3
3,23.68,3.31,1,0,3,1,2
4,24.59,3.61,0,0,3,1,4


**Splitting the data into train and test sets, then fitting and evaluating the model**

In [183]:
#Independent variables: every column except the target, in a fixed order
feature_columns = [
    'total_bill',
    'sex',
    'smoker',
    'day',
    'time',
    'customer_per_table',
]
x_ind = np.array(df[feature_columns])

#Dependent variable: the tip amount
y_dep = np.array(df['tip'])

In [249]:
#Holding out 20% of the rows for testing; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_ind,
    y_dep,
    test_size=0.2,
    random_state=42,
)

In [250]:
#Importing LinearRegression
from sklearn.linear_model import LinearRegression
#Creating the estimator with its default settings; it is fitted in a later cell
model = LinearRegression()

In [251]:
#Fitting the model on the training split
model.fit(X=x_train, y=y_train)

LinearRegression()

In [252]:
#Predicting tips for the held-out rows and displaying them
y_pred = model.predict(X=x_test)
y_pred

array([2.95915033, 1.9793852 , 3.93355525, 3.81512843, 2.17478172,
       2.65950784, 3.65751199, 2.30684737, 2.5469726 , 2.27028847,
       2.87845875, 2.10446382, 2.13402126, 2.33746574, 1.84729628,
       3.12055033, 2.97055543, 3.18916799, 2.61631688, 5.73865153,
       3.48064358, 3.26948544, 2.23303951, 1.98924641, 3.15115427,
       2.25680834, 2.12104595, 3.26905178, 3.17236483, 6.62813881,
       4.9784413 , 1.6371344 , 3.23322391, 2.82236266, 2.96117781,
       3.86625614, 1.97848176, 5.48228412, 2.37106691, 3.04713105,
       2.07990929, 2.49950599, 3.41299197, 2.3340929 , 1.94813896,
       0.93957882, 1.88172086, 3.08438206, 1.86676908])

In [263]:
#Finding the coefficient and intercept for the model
intercept = model.intercept_
coefficient = model.coef_
print(f"Intercept of the equation is: {intercept}\n")
print(f"Coefficients of the equation are: {coefficient}\n")

#Building the equation text from the fitted values instead of hard-coding the
#numbers, so it always matches the model trained above.
#X1, X2, ... follow the column order used to build x_ind.
equation_terms = " ".join(
    f"{'-' if coef < 0 else '+'} {abs(coef):.8f} X{position}"
    for position, coef in enumerate(coefficient, start=1)
)
print(f"The regression equation is Y = {intercept} {equation_terms}")

Intercept of the equation is: 0.6643309656240284

Coefficients of the equation are: [ 0.0944405   0.02456897 -0.18117471  0.06380238 -0.19306402  0.23194949]

The regression equation is Y = 0.6643309656240284 + 0.0944405 X1 + 0.02456897 X2 + -0.18117471 X3 + 0.06380238 X4 +
                              -0.19306402 X5 + 0.23194949 X6


In [254]:
#Scoring the predictions against the held-out targets
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Error metrics; RMSE is the square root of MSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
# Coefficient of determination
r2 = r2_score(y_test, y_pred)

# Report every metric as "<label>: <value>"
for label, score in (("MSE", mse), ("RMSE", rmse), ("MAE", mae), ("R2 Score", r2)):
    print(f"{label}:", score)

MSE: 0.6963090766605344
RMSE: 0.8344513626692298
MAE: 0.6685728160722872
R2 Score: 0.4429399687489901
