In [11]:
# Project Title: Linear Regression Model Pipeline for Tips Dataset

# Description:
# This project implements a machine learning pipeline using scikit-learn to predict tips based 
# on various features from the 'tips' dataset. The pipeline includes data preprocessing steps 
# such as imputation for missing values and scaling for numerical features, as well as 
# encoding for categorical features. Finally, it employs GridSearchCV to optimize model parameters.

# Steps:
# 1. Import necessary libraries
# 2. Load the dataset and define features and target variable
# 3. Identify numerical and categorical columns
# 4. Create preprocessing pipelines for numerical and categorical data
# 5. Construct the complete model pipeline
# 6. Split the data into training and testing sets
# 7. Fit the model, make predictions, and score it on the test set
# 8. Tune hyperparameters with GridSearchCV and report the best parameters and scores
# 9. Save the trained model pipeline

In [12]:
# Step 1: Import necessary libraries
import numpy as np 
import pandas as pd 
import plotly.express as px 
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib

In [13]:
# Step 2: Load the dataset, then separate the predictors from the target
row_data = pd.read_csv("tips.csv")  # Read relative to the current working directory

# Predictor columns, selected by label so the column order is explicit
x = row_data.loc[:, ['total_bill', 'sex', 'smoker', 'day', 'time', 'size']]

# The 'tip' column is the value the model learns to predict
y = row_data.loc[:, 'tip']

In [14]:
# Step 3: Identify numerical and categorical columns
# Columns are detected by dtype family rather than by exact dtype name. Matching
# only 'int64' / 'float64' / 'object' misses int32/float32, nullable integer and
# dedicated string or category dtypes, and any column left out of both lists is
# discarded by the ColumnTransformer (remainder='drop') without raising an error.
num_col = x.select_dtypes(include='number').columns.tolist()

# Text-like or categorical columns with fewer than 10 distinct values are treated
# as categorical features.
cat_col = [
    col for col in x.columns
    if col not in num_col
    and (
        pd.api.types.is_object_dtype(x[col].dtype)
        or pd.api.types.is_string_dtype(x[col].dtype)
        or isinstance(x[col].dtype, pd.CategoricalDtype)
    )
    and x[col].nunique() < 10
]

# Make it visible when a feature ends up in neither list (for example a
# high-cardinality text column), since such a column never reaches the model.
unassigned_col = [col for col in x.columns if col not in num_col and col not in cat_col]
if unassigned_col:
    print("Warning: columns not used by the preprocessing pipelines:", unassigned_col)

In [15]:
# Step 4: Create preprocessing pipelines

# Numerical branch: fill gaps with the column mean, then rescale with MinMaxScaler
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('MinMaxScaler', MinMaxScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories that were
# not present when the encoder was fitted.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its branch; columns in neither list are dropped
col_transformer = ColumnTransformer(
    [
        ('num_pipe', num_pipe, num_col),
        ('cat_pipe', cat_pipe, cat_col),
    ],
    remainder='drop',
    n_jobs=-1,  # Run the branches in parallel on all available CPU cores
)

In [None]:
# Step 5: Construct the complete model pipeline
# Preprocessing and the regressor are chained so that a single fit/predict call
# runs both stages on the same columns.
model_pipeline = Pipeline([
    ('col_transformer', col_transformer),
    ('model', LinearRegression()),
])

display(model_pipeline)  # Show the assembled pipeline in the cell output

In [17]:
# Step 6: Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Step 7: Fit the model and make predictions
# Train on the training split only, so the test split stays unseen
model_pipeline.fit(X=x_train, y=y_train)

# Score the fitted pipeline on the held-out rows, then keep its predictions
score = model_pipeline.score(x_test, y_test)
preds = model_pipeline.predict(x_test)

In [19]:
# Step 8: Perform hyperparameter tuning with GridSearchCV
# Grid keys use the '<step name>__<parameter>' form, so this entry targets the
# fit_intercept option of the pipeline's 'model' step.
grid_params = {'model__fit_intercept': [True, False]}

# 5-fold cross-validation over the training split
grid_search = GridSearchCV(estimator=model_pipeline, param_grid=grid_params, cv=5)
grid_search.fit(x_train, y_train)

# Report the winning setting, its cross-validation score, and the test-set score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)
print("Test score:", grid_search.score(x_test, y_test))

Best parameters: {'model__fit_intercept': False}
Best cross-validation score: 0.31051558185592787
Test score: 0.4373018194348234


In [20]:
# Step 9: Save the tuned model pipeline
# GridSearchCV fits clones of model_pipeline, so model_pipeline itself still holds
# the settings it had before tuning. best_estimator_ is the pipeline refit on the
# training data with the best parameters found, so that is the object persisted.
joblib.dump(grid_search.best_estimator_, "LR_model_pipeline.joblib")

['LR_model_pipeline.joblib']