![dvd_image](dvd_image.jpg)

A DVD rental company needs your help! They want to figure out how many days a customer will rent a DVD for based on some features and has approached you for help. They want you to try out some regression models which will help predict the number of days a customer will rent a DVD for. The company wants a model which yeilds a MSE of 3 or less on a test set. The model you make will help the company become more efficient inventory planning.

The data they provided is in the csv file `rental_info.csv`. It has the following features:
- `"rental_date"`: The date (and time) the customer rents the DVD.
- `"return_date"`: The date (and time) the customer returns the DVD.
- `"amount"`: The amount paid by the customer for renting the DVD.
- `"amount_2"`: The square of `"amount"`.
- `"rental_rate"`: The rate at which the DVD is rented for.
- `"rental_rate_2"`: The square of `"rental_rate"`.
- `"release_year"`: The year the movie being rented was released.
- `"length"`: Lenght of the movie being rented, in minuites.
- `"length_2"`: The square of `"length"`.
- `"replacement_cost"`: The amount it will cost the company to replace the DVD.
- `"special_features"`: Any special features, for example trailers/deleted scenes that the DVD also has.
- `"NC-17"`, `"PG"`, `"PG-13"`, `"R"`: These columns are dummy variables of the rating of the movie. It takes the value 1 if the move is rated as the column name and 0 otherwise. For your convinience, the reference dummy has already been dropped.

In [17]:
# Import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import time
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [18]:
# write a wrangle function
def wrangle(file_path):
    
    # read file
    df = pd.read_csv(file_path)

    # drop columns 
    columns_to_drop = ['amount_2', 'length_2', 'rental_rate_2']
    df = df.drop(columns=columns_to_drop)

    # create return_duration column
    df['rental_duration'] = (pd.to_datetime(df.return_date) - pd.to_datetime(df.rental_date)).dt.days
    
    # drop unused column
    df = df.drop(columns=['rental_date', 'return_date'])

    # create dummary variable for special_faetures
    df['clean'] = (df['special_features']
               .str.strip('{}')
               .str.replace('"', '', regex=False))
    dummies = df['clean'].str.get_dummies(sep=',')
    df = df.join(dummies)
    df = df.drop(columns=['clean', 'special_features'])

    return df

rental = wrangle('rental_info.csv')

rental.head()

Unnamed: 0,amount,release_year,rental_rate,length,replacement_cost,NC-17,PG,PG-13,R,rental_duration,Behind the Scenes,Commentaries,Deleted Scenes,Trailers
0,2.99,2005.0,2.99,126.0,16.99,0,0,0,1,3,1,0,0,1
1,2.99,2005.0,2.99,126.0,16.99,0,0,0,1,2,1,0,0,1
2,2.99,2005.0,2.99,126.0,16.99,0,0,0,1,7,1,0,0,1
3,2.99,2005.0,2.99,126.0,16.99,0,0,0,1,2,1,0,0,1
4,2.99,2005.0,2.99,126.0,16.99,0,0,0,1,4,1,0,0,1


In [19]:
target = 'rental_duration'
X = rental.drop(columns=target)
y = rental[target]

In [20]:
# split the data 
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# define the pipeline
pipelines = {
    'Linear' : Pipeline(
        [
            ('Scaler', StandardScaler()),
            ('Model', LinearRegression())
        ]
    ),
    'Ridge' : Pipeline(
        [
            ('Scaler', StandardScaler()),
            ('Model', Ridge())
        ]
    ),
    'Lasso' : Pipeline(
        [
            ('Scaler', StandardScaler()),
            ('Model', Lasso())
        ]
    ),
    'Random Forest': Pipeline([
        ('Scaler', StandardScaler()),  # Optional for tree-based models
        ('Model', RandomForestRegressor())
    ]),
    'Gradient Boosting': Pipeline([
        ('Scaler', StandardScaler()),
        ('Model', GradientBoostingRegressor())
    ]),
    'XGBoost': Pipeline([
        ('Scaler', StandardScaler()),
        ('Model', XGBRegressor(eval_metric='rmse', use_label_encoder=False))
    ]),
    'LightGBM': Pipeline([
        ('Scaler', StandardScaler()),
        ('Model', LGBMRegressor())
    ])
}

# define hyperparameter
param_grids ={
    'Linear' : {},
    'Ridge'  : {'Model__alpha': [0.01, 0.1, 1.0, 10.0]},
    'Lasso'  : {'Model__alpha': [0.001, 0.01, 0.1, 1.0]},
    'Random Forest' : {
        'Model__n_estimators':[100, 200],
        'Model__max_depth':[None, 10, 20]
    },
     'Gradient Boosting': {
        'Model__n_estimators': [100, 200],
        'Model__learning_rate': [0.01, 0.1],
        'Model__max_depth': [3, 5]
    },
    'XGBoost': {
        'Model__n_estimators': [100, 200],
        'Model__learning_rate': [0.01, 0.1],
        'Model__max_depth': [3, 5]
    },
    'LightGBM': {
        'Model__n_estimators': [100, 200],
        'Model__learning_rate': [0.01, 0.1],
        'Model__max_depth': [-1, 10]
    }
}

In [21]:

# cross validation 
kf          = KFold(n_splits=5, shuffle=True, random_state=42)
neg_mse     = 'neg_mean_squared_error'
train_times = {}
cv_mses     = {}
test_mses   = {}
best_params = {}
best_models = {}

# loop through pipelines
for name, pipe in pipelines.items():
    print(f"\nRunning GridSearchCV for {name}...")

    gs = GridSearchCV(
        pipe,
        param_grids[name],
        cv=kf,
        scoring=neg_mse,
        n_jobs=-1,
        return_train_score=True
    )

    start = time.time()
    gs.fit(X_train, y_train)
    duration = time.time() - start

    # record
    train_times[name]  = duration
    cv_mses[name]      = -gs.best_score_
    best_params[name]  = gs.best_params_
    best_models[name]  = gs.best_estimator_

    # test error
    y_pred           = gs.best_estimator_.predict(X_test)
    test_mses[name]  = mean_squared_error(y_test, y_pred)

    print(
        f"{name:17s}"
        f"train time = {duration:.2f}s | "
        f"CV MSE = {cv_mses[name]:.4f} | "
        f"Test MSE = {test_mses[name]:.4f} | "
        f"Best Params = {best_params[name]}"
    )




Running GridSearchCV for Linear...
Linear           train time = 1.70s | CV MSE = 2.9228 | Test MSE = 2.9709 | Best Params = {}

Running GridSearchCV for Ridge...
Ridge            train time = 1.84s | CV MSE = 2.9228 | Test MSE = 2.9709 | Best Params = {'Model__alpha': 0.1}

Running GridSearchCV for Lasso...
Lasso            train time = 0.18s | CV MSE = 2.9229 | Test MSE = 2.9714 | Best Params = {'Model__alpha': 0.001}

Running GridSearchCV for Random Forest...
Random Forest    train time = 18.12s | CV MSE = 2.0815 | Test MSE = 1.9546 | Best Params = {'Model__max_depth': 20, 'Model__n_estimators': 200}

Running GridSearchCV for Gradient Boosting...
Gradient Boostingtrain time = 10.00s | CV MSE = 2.0142 | Test MSE = 1.9658 | Best Params = {'Model__learning_rate': 0.1, 'Model__max_depth': 5, 'Model__n_estimators': 200}

Running GridSearchCV for XGBoost...



Parameters: { "use_label_encoder" } are not used.




XGBoost          train time = 1.10s | CV MSE = 2.0218 | Test MSE = 1.9687 | Best Params = {'Model__learning_rate': 0.1, 'Model__max_depth': 5, 'Model__n_estimators': 200}

Running GridSearchCV for LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001392 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 212
[LightGBM] [Info] Number of data points in the train set: 12688, number of used features: 13
[LightGBM] [Info] Start training from score 4.525299
LightGBM         train time = 14.40s | CV MSE = 1.9044 | Test MSE = 1.8492 | Best Params = {'Model__learning_rate': 0.1, 'Model__max_depth': -1, 'Model__n_estimators': 200}



X does not have valid feature names, but LGBMRegressor was fitted with feature names



In [22]:
metrics_df = pd.DataFrame({
    'model':        list(train_times.keys()),
    'train_time_s': [train_times[m] for m in train_times],
    'cv_mse':       [cv_mses[m]     for m in train_times],
    'test_mse':     [test_mses[m]   for m in train_times],
    'best_params':  [best_params[m] for m in train_times],
})

In [23]:
fig_mse = px.bar(
    metrics_df.sort_values("test_mse"),  # sort for better comparison
    x='model',
    y='test_mse',
    title='Test MSE by Model',
    text='test_mse',
    color='test_mse',
    color_continuous_scale='Oranges',
    template='plotly_dark',
    width=1000,  # Increase chart width
    height=500
)

fig_mse.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',  # Format to 2 decimals
    textfont_size=12
)

fig_mse.update_layout(
    xaxis_title='Model',
    yaxis_title='Test MSE',
    title_font_size=20,
    xaxis_tickangle=-45,
    showlegend=False,
    plot_bgcolor='rgba(0,0,0,0)',  # Transparent plot area
    paper_bgcolor='rgba(0,0,0,0)',  # Transparent surrounding
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    margin=dict(l=60, r=60, t=80, b=80),
    coloraxis_colorbar=dict(title="Test MSE")
)

fig_mse.show()


In [24]:
fig_time = px.bar(
    metrics_df.sort_values("train_time_s"),  # sort for better comparison
    x='model',
    y='train_time_s',
    title='Training Time by Model (s)',
    text='train_time_s',
    color='train_time_s',
    color_continuous_scale='Oranges',  # Use a warm tone for time
    template='plotly_dark',
    width=1000,
    height=500
)

fig_time.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',  # format to 2 decimals
    textfont_size=12
)

fig_time.update_layout(
    xaxis_title='Model',
    yaxis_title='Training Time (s)',
    title_font_size=20,
    xaxis_tickangle=-45,
    showlegend=False,
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    margin=dict(l=60, r=60, t=80, b=80),
    coloraxis_colorbar=dict(title="Time (s)")
)

fig_time.show()
