In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [None]:
# List the names of the example datasets that seaborn can load.
# NOTE: seaborn looks these up in its online data repository, so this needs network access.
sns.get_dataset_names()

## Load data

In [None]:
# Load seaborn's "tips" example dataset into a DataFrame; every later cell works on `data_tips`.
data_tips = sns.load_dataset('tips')
# Bare last expression -> rich display of the frame
data_tips

## Explore data

In [None]:
# Peek at the first ten rows
data_tips.iloc[:10]

In [None]:
# Peek at the last ten rows
data_tips.iloc[-10:]

In [None]:
# Add a new column with the tip as a percentage of the total bill (0-100 scale).
# The column name and description say "percent", so the ratio is scaled by 100
# instead of being stored as a 0-1 fraction.
data_tips["tip_perc"] = data_tips["tip"] / data_tips["total_bill"] * 100
data_tips

In [None]:
# Overview of the frame: column names, dtypes, non-null counts and memory usage
data_tips.info()

In [None]:
# Show every row that has an exact duplicate (keep=False marks all copies, not just the later ones).
data_tips.loc[data_tips.duplicated(keep=False)]
# There are two completely identical rows.
# The data does not contain enough information to tell whether this is an error
# that should be deduplicated or just a coincidence.
# However, duplicate rows reduce the variance in the data and may lead to more
# overfitting of the linear regression model.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the tips data
data_tips.describe()

In [None]:
# Tip amount against total bill
sns.scatterplot(
    x="total_bill",
    y="tip",
    data=data_tips,
)

In [None]:
# Same scatter, coloured by the `time` column
sns.scatterplot(
    x="total_bill",
    y="tip",
    hue="time",
    data=data_tips,
)

In [None]:
# Larger canvas; colour encodes `time`, marker area encodes the `size` column
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    x="total_bill",
    y="tip",
    hue="time",
    size="size",
    sizes=(15, 150),
    data=data_tips,
    ax=ax,
)

In [None]:
# Row(s) with the highest relative tip.
# Series.max() is vectorised and skips NaN, whereas the builtin max() iterates
# element by element in Python and gives order-dependent results when NaN is present.
data_tips.loc[data_tips["tip_perc"] == data_tips["tip_perc"].max()]

In [None]:
# Split the data into one subset per value of `sex`
data_tips_female = data_tips.loc[data_tips["sex"].eq("Female")]
data_tips_male = data_tips.loc[data_tips["sex"].eq("Male")]

In [None]:
# Sanity check: each subset should contain exactly one distinct `sex` value
tuple(subset["sex"].unique() for subset in (data_tips_female, data_tips_male))

In [None]:
# Summary statistics of the numeric columns for the female subset
data_tips_female.describe()

In [None]:
# Summary statistics of the numeric columns for the male subset (compare with the female summary)
data_tips_male.describe()

In [None]:
# Relative tip (`tip_perc`) against total bill; colour encodes `time`, marker area encodes `size`
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    x="total_bill",
    y="tip_perc",
    hue="time",
    size="size",
    sizes=(15, 150),
    data=data_tips,
    ax=ax,
)

## Modelling

In [None]:
# Feature matrix and target for the regression: predict `tip` from `total_bill`.
# Columns are selected by name rather than by position, so the model does not
# silently change if the column order of the frame changes.
# Double brackets keep both objects two-dimensional (n_samples, 1), as before, and
# keeping X as a DataFrame lets scikit-learn record the feature name
# (`feature_names_in_`), which a bare NumPy array would discard.
X = data_tips[["total_bill"]]
Y = data_tips[["tip"]]

In [None]:
# Ordinary least-squares fit of Y on X.
# fit() returns the estimator itself, so `lr` and `linear_regressor` are the same fitted object.
linear_regressor = LinearRegression()
linear_regressor.fit(X, Y)
lr = linear_regressor

In [None]:
# In-sample predictions: the fitted model evaluated on the same X it was trained on
Y_pred = lr.predict(X)

In [None]:
# Coefficient of determination (R^2) on the training data; this is an in-sample score,
# not an estimate of performance on unseen data
lr.score(X, Y)

In [None]:
# Fitted intercept: the predicted target value when the feature is zero
lr.intercept_

In [None]:
# Fitted slope: change in the predicted target per unit increase of the feature
lr.coef_

In [None]:
# `feature_names_in_` is only set when the model was fitted on input that carries
# string column names (e.g. a pandas DataFrame). After fitting on a bare NumPy array
# the attribute does not exist and plain access raises AttributeError, so fall back
# to None instead of breaking "Run All".
getattr(lr, "feature_names_in_", None)

In [None]:
# Observed data with the fitted regression line on top.
# Inputs are flattened to 1-D arrays so the cell works whether X / Y are NumPy
# arrays or single-column DataFrames.
x_values = np.asarray(X).ravel()
fig, ax = plt.subplots()
ax.scatter(x_values, np.asarray(Y).ravel(), label="observed")
ax.plot(x_values, np.asarray(Y_pred).ravel(), color='red', label="linear fit")
# Label the figure so it can be read on its own
ax.set(xlabel="total_bill", ylabel="tip", title="Linear regression of tip on total_bill")
ax.legend()
plt.show()