# Intermediate Machine Learning with sklearn

## Simple linear regression

- Simple means we have only one feature/input column and one label/output column
- Linear means the prediction is based on a straight line
- Regression means the output is a numeric (continuous) variable

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

# load the data
df = sns.load_dataset('tips')

# Select the feature column from df (double brackets keep X a 2-D DataFrame)
X = df[['total_bill']]
# Select the target/label column from df
Y = df['tip']

# # Create "regression line" plot
# sns.lmplot(data=df, x="total_bill", y="tip") ## Line in graph is best fit line

# load the linear regression model
model = LinearRegression()

# Fit the model --> means; train the model
model.fit(X, Y)

# lets predict the tip for an unknown total_bill
# The model was fitted on a DataFrame, so the new sample is passed as a DataFrame
# with the same column names; a bare list makes sklearn warn that
# "X does not have valid feature names" and skips the column-name check.
new_sample = pd.DataFrame([[50]], columns=X.columns)  # let total_bill = 50. find what will be tip?
model.predict(new_sample)

## Multiple linear regression or Multilinear regression

### 2 Variables / Features / Inputs

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

# load the data
df = sns.load_dataset('tips')

# Select the feature columns from df
X = df[['total_bill', 'size']]
# Select the target/label column from df
Y = df['tip']

# load the linear regression model
model = LinearRegression()

# Fit the model --> means; train the model
model.fit(X, Y)

# lets predict the tip for an unknown total_bill and size
# Build the sample as a DataFrame with the training column names so sklearn can
# match each value to its feature (a bare list triggers the
# "X does not have valid feature names" warning).
new_sample = pd.DataFrame([[35.527, 2]], columns=X.columns)  # inputs are total_bill and size
model.predict(new_sample)

In [None]:
# One figure holding two side-by-side axes, one per feature
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (feature column, regression-line colour, panel title), listed left to right
panels = [
    ("total_bill", "red", "Total Bill vs Tip"),
    ("size", "green", "Size vs Tip"),
]

for ax, (feature, line_color, title) in zip(axes, panels):
    # regplot draws the scatter plus the fitted line on the given axes (same as lmplot)
    sns.regplot(data=df, x=feature, y="tip", ax=ax, line_kws={"color": line_color})
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Create a single figure with one set of axes shared by both regressions
fig, ax = plt.subplots(figsize=(10, 6))

# (feature column, legend label, colour) for each regression drawn on the axes
overlays = [
    ("total_bill", "Total Bill vs Tip", "blue"),
    ("size", "Size vs Tip", "orange"),
]

for feature, legend_label, colour in overlays:
    sns.regplot(data=df, x=feature, y="tip", scatter=True,
                label=legend_label, color=colour, ax=ax)

# Add labels and legend; the x-axis carries two different quantities on purpose
ax.set_title("Regression Plots: Tip vs Total Bill and Size")
ax.set_xlabel("Total Bill / Size (mixed x-axis)")
ax.set_ylabel("Tip")
ax.legend()
ax.grid(True)
plt.show()

### 3 Variables / Features / Inputs

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

# load the data
df = sns.load_dataset('tips')
# df.head()

# encode smoker column using manual encoder
# --> converted smoker from categorical to numeric because the model can only be trained on numeric data.
# NOTE: mapping a categorical column can return another categorical column, so the
# result is cast to int to store the feature as a plain numeric 0/1 column.
df['smoker_num'] = df['smoker'].map({'Yes': 1, 'No': 0}).astype(int)
# print(df.head())

# Select the feature columns from df
X = df[['total_bill', 'size', 'smoker_num']]
# Select the target/label column from df
Y = df['tip']

# load the linear regression model
model = LinearRegression()

# Fit the model --> means; train the model
model.fit(X, Y)

# lets predict the tip for an unknown total_bill, size and smoker
# Samples are built as a DataFrame with the training column names so sklearn can
# validate the features (bare lists trigger the "valid feature names" warning).
# inputs are total_bill, size and smoker: same bill and party size, non-smoker then smoker
non_smoker = pd.DataFrame([[35.527, 2, 0]], columns=X.columns)
smoker = pd.DataFrame([[35.527, 2, 1]], columns=X.columns)
print(model.predict(non_smoker))
print(model.predict(smoker))

## Classification
- Classification means the output is a categorical variable

### Logistic Regression

#### Binary Classification
        The output has only two categories, e.g. yes or no, male or female, etc.

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression

# load the data
df = sns.load_dataset('tips')
# df.head()

# Select the feature columns from df
X = df[['total_bill', 'size', 'tip']]
# Select the target/label column from df (two classes: 'Yes' / 'No')
Y = df['smoker']

# load the logistic regression model
model = LogisticRegression()

# Fit the model --> means: train the model
model.fit(X, Y)

# Predict
# The sample is a DataFrame with the training column names (total_bill, size, tip);
# a bare list would make sklearn warn that "X does not have valid feature names".
new_sample = pd.DataFrame([[31.51, 2, 6.322]], columns=X.columns)
model.predict(new_sample)

### Gaussian Naive Bayes

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB

# load the data
df = sns.load_dataset('tips')
# df.head()

# Select the feature columns from df
X = df[['total_bill', 'size', 'tip']]
# Select the target/label column from df (two classes: 'Yes' / 'No')
Y = df['smoker']

# load the Gaussian Naive Bayes model
model = GaussianNB()

# Fit the model --> means: train the model
model.fit(X, Y)

# Predict
# The sample is a DataFrame with the training column names (total_bill, size, tip);
# a bare list would make sklearn warn that "X does not have valid feature names".
new_sample = pd.DataFrame([[31.51, 2, 6.322]], columns=X.columns)
model.predict(new_sample)

### Decision Tree Classification

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier

# load the data
df = sns.load_dataset('tips')
# df.head()

# Select the feature columns from df
X = df[['total_bill', 'size', 'tip']]
# Select the target/label column from df (two classes: 'Yes' / 'No')
Y = df['smoker']

# load the decision tree model
# A fixed random_state makes the fitted tree (and so the prediction below)
# reproducible between runs; without it, ties between equally good splits are
# broken randomly.
model = DecisionTreeClassifier(random_state=42)

# Fit the model --> means: train the model
model.fit(X, Y)

# Predict
# The sample is a DataFrame with the training column names (total_bill, size, tip);
# a bare list would make sklearn warn that "X does not have valid feature names".
new_sample = pd.DataFrame([[31.51, 2, 6.322]], columns=X.columns)
model.predict(new_sample)

### Decision Tree Regression

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor

# load the data
df = sns.load_dataset('tips')
# df.head()

# Select the feature columns from df
X = df[['total_bill', 'size']]
# Select the target/label column from df
Y = df['tip']

# load the decision tree model
# A fixed random_state makes the fitted tree (and so the prediction below)
# reproducible between runs; without it, ties between equally good splits are
# broken randomly.
model = DecisionTreeRegressor(random_state=42)

# Fit the model --> means: train the model
model.fit(X, Y)

# Predict
# The sample is a DataFrame with the training column names (total_bill, size);
# a bare list would make sklearn warn that "X does not have valid feature names".
new_sample = pd.DataFrame([[31.51, 2]], columns=X.columns)
model.predict(new_sample)

# Important Metrics

**Link**: https://scikit-learn.org/stable/api/sklearn.metrics.html#module-sklearn.metrics

**Slides Path in current repo**: 0_resources/regression_&_classification_metrics

## Important Classification Metrics
- accuracy_score
- precision_score
- recall_score
- confusion_matrix
- f1_score (Harmonic mean of precision and recall)
- classification_report

## Important Regression Metrics
- mean_absolute_error
- mean_absolute_percentage_error
- mean_squared_error
- root_mean_squared_error
- r2_score (Coefficient of Determination)


# Regression Metrics

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
# load tips dataset
df = sns.load_dataset('tips')
# print(df.head())

# Features (bill amount and party size) and the target (tip)
feature_cols = ['total_bill', 'size']
X = df[feature_cols]
Y = df['tip']

# train test split: hold out 20% of the rows for testing, train on the other 80%;
# the fixed random_state keeps the split the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Create the model and train it on the training split only
model = LinearRegression().fit(X_train, Y_train)

# Predicted tips for the held-out rows
y_pred = model.predict(X_test)

# Metrics comparing the actual values (Y_test) with the model's predictions (y_pred)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, expressed in the same units as the tip
r2 = r2_score(Y_test, y_pred)
mae = mean_absolute_error(Y_test, y_pred)
mape = mean_absolute_percentage_error(Y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R2 Score: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Mean Absolute Percentage Error: {mape}")


