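# Initial direct model (LightGBM)

Trains a LightGBM regressor on the podcast training data to predict `Listening_Time_minutes`, then writes predictions for the test set to a submission CSV.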

In [8]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn import preprocessing 
import sklearn as sk
import lightgbm as lgb

In [9]:
# Load the train and test CSVs (V2 "clean" files) into two DataFrames.
# read_csv is applied to each path in order: train first, then test.
Podcast_Train_df, Podcast_Test_df = map(
    pd.read_csv,
    ('Data/Podcast_train_clean.V2.csv', 'Data/Podcast_test_clean.V2.csv'),
)

In [10]:
# Cast the listed feature columns of the training frame to pandas' `category` dtype
# so they can later be declared as categorical features.
cat_cols = ["Podcast_Name", "Genre", "Publication_Day", "Publication_Time", "Episode_Sentiment"]
Podcast_Train_df = Podcast_Train_df.astype({name: 'category' for name in cat_cols})

In [11]:
# Encode the test-set categoricals using the *training* categories.
# A bare astype('category') infers the category set (and therefore the integer
# codes) from the test data alone, which need not match the training frame.
# Reusing the training categories keeps both frames on a single encoding;
# test values that never occur in training become NaN.
cat_cols = ["Podcast_Name", "Genre", "Publication_Day", "Publication_Time", "Episode_Sentiment"]
for col in cat_cols:
    # .cat fails fast if the training column has not been cast to category yet
    train_dtype = pd.CategoricalDtype(categories=Podcast_Train_df[col].cat.categories)
    Podcast_Test_df[col] = Podcast_Test_df[col].astype(train_dtype)

In [12]:
# Train/Val split
# Features: every column except the target and the `id` column.
X = Podcast_Train_df.drop(columns=["Listening_Time_minutes", "id"])
y = Podcast_Train_df["Listening_Time_minutes"]
# Hold out 10% of the rows for validation. random_state pins the shuffle so the
# split, and every metric computed on it, is the same after Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [13]:
# Print the training frame's column dtypes and non-null counts
# (the columns in cat_cols should report `category`).
Podcast_Train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype   
---  ------                       --------------   -----   
 0   id                           750000 non-null  int64   
 1   Podcast_Name                 750000 non-null  category
 2   Episode_Length_minutes       750000 non-null  float64 
 3   Genre                        750000 non-null  category
 4   Host_Popularity_percentage   750000 non-null  float64 
 5   Publication_Day              750000 non-null  category
 6   Publication_Time             750000 non-null  category
 7   Guest_Popularity_percentage  750000 non-null  float64 
 8   Number_of_Ads                750000 non-null  float64 
 9   Episode_Sentiment            750000 non-null  category
 10  Listening_Time_minutes       750000 non-null  float64 
 11  Episode_Number               750000 non-null  float64 
dtypes: category(5), float64(6), int64(1)
memory 

In [14]:
# LightGBM Dataset
# Wrap the train/validation splits in LightGBM Dataset objects, declaring which
# columns are categorical.
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols)
# reference=train_data ties the validation set to the training set explicitly,
# so it is built against the training Dataset rather than on its own.
val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_cols, reference=train_data)

In [15]:
# LightGBM Parameters
params = dict(
    # task and evaluation
    objective="regression",
    metric="rmse",
    boosting_type="gbdt",
    # step size and tree shape
    learning_rate=0.02,
    num_leaves=64,
    max_depth=8,
    # column / row subsampling
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    # reproducibility and logging
    seed=42,
    verbosity=-1,
)

In [16]:
# Callbacks for early stopping and periodic logging (callback API, LightGBM v4)
callbacks = [
    lgb.early_stopping(stopping_rounds=100),  # stop once validation scores stall for 100 rounds
    lgb.log_evaluation(period=150),           # log the metrics every 150 rounds
]

# Train Model
# num_boost_round is only an upper bound; early stopping may end training sooner.
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=5000,
    callbacks=callbacks,
)

# Predict
# Select the test features by the training column names instead of relying on
# the test CSV's column order: a differently ordered file would otherwise feed
# features into the wrong positions without raising. `id` is excluded because
# it is not among the training columns.
test_features = Podcast_Test_df[list(X_train.columns)]
preds = model.predict(test_features, num_iteration=model.best_iteration)

# Clip predictions to [0, 200]
# NOTE(review): the upper bound 200 is hard-coded; confirm it matches the
# valid range of Listening_Time_minutes.
preds = np.clip(preds, 0, 200)

# Prepare submission: one row per test id with its predicted listening time
sub = pd.DataFrame({
    "id": Podcast_Test_df["id"],
    "Listening_Time_minutes": preds
})
sub.to_csv("Data/submission.V2.lgb.csv", index=False)
print("✅ Submission file created!")

Training until validation scores don't improve for 100 rounds
[150]	training's rmse: 13.2833	valid_1's rmse: 13.2836
[300]	training's rmse: 13.0234	valid_1's rmse: 13.0672
[450]	training's rmse: 12.9617	valid_1's rmse: 13.0419
[600]	training's rmse: 12.9148	valid_1's rmse: 13.0294
[750]	training's rmse: 12.8736	valid_1's rmse: 13.0203
[900]	training's rmse: 12.8356	valid_1's rmse: 13.0119
[1050]	training's rmse: 12.7993	valid_1's rmse: 13.0057
[1200]	training's rmse: 12.7623	valid_1's rmse: 12.9977
[1350]	training's rmse: 12.7266	valid_1's rmse: 12.9912
[1500]	training's rmse: 12.6937	valid_1's rmse: 12.9848
[1650]	training's rmse: 12.6613	valid_1's rmse: 12.9794
[1800]	training's rmse: 12.6277	valid_1's rmse: 12.9733
[1950]	training's rmse: 12.5967	valid_1's rmse: 12.9683
[2100]	training's rmse: 12.565	valid_1's rmse: 12.9632
[2250]	training's rmse: 12.5345	valid_1's rmse: 12.9586
[2400]	training's rmse: 12.5048	valid_1's rmse: 12.9558
[2550]	training's rmse: 12.4757	valid_1's rmse: 1