# Initial direct model (lgb)

In [2]:
Podcast_Train_df = pd.read_csv('Data/Podcast_train_clean.csv')
Podcast_Test_df = pd.read_csv('Data/Podcast_test_clean.csv')

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn import preprocessing 
import sklearn as sk
import lightgbm as lgb

In [4]:
# Convert the categorical feature columns of the training frame to pandas
# `category` dtype; the same column list is later passed to LightGBM as
# `categorical_feature`.
cat_cols = [
    "Podcast_Name",
    "Genre",
    "Publication_Day",
    "Publication_Time",
    "Episode_Sentiment",
]
Podcast_Train_df[cat_cols] = Podcast_Train_df[cat_cols].astype('category')

In [6]:
# Convert the same categorical feature columns of the test frame to pandas
# `category` dtype so its column dtypes match the training frame.
cat_cols = [
    "Podcast_Name",
    "Genre",
    "Publication_Day",
    "Publication_Time",
    "Episode_Sentiment",
]
Podcast_Test_df[cat_cols] = Podcast_Test_df[cat_cols].astype('category')

In [7]:
# Train/Val split
# Features are every column except the target and the row identifier.
X = Podcast_Train_df.drop(columns=["Listening_Time_minutes", "id"])
y = Podcast_Train_df["Listening_Time_minutes"]

# Hold out 10% of the rows for validation. `random_state` is fixed (same value
# as the LightGBM `seed`) so the split, and therefore the early-stopping point
# and the resulting submission, are reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [8]:
# LightGBM Dataset
# Wrap the training and validation splits in lgb.Dataset objects, declaring
# the same categorical columns for both.
train_data, val_data = (
    lgb.Dataset(features, label=target, categorical_feature=cat_cols)
    for features, target in ((X_train, y_train), (X_val, y_val))
)

In [9]:
# LightGBM Parameters
# Training configuration passed to lgb.train: RMSE-evaluated regression with
# GBDT boosting, row/feature subsampling and a fixed seed.
params = dict(
    objective="regression",
    metric="rmse",
    boosting_type="gbdt",
    learning_rate=0.02,
    num_leaves=64,
    max_depth=8,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    seed=42,
    verbosity=-1,
)

In [10]:
# Callback for early stopping (new way in LightGBM v4)
callbacks = [
    lgb.early_stopping(stopping_rounds=100),  # stop after 100 rounds without validation improvement
    lgb.log_evaluation(period=150),           # log the metrics every 150 rounds
]

# Train Model
# Both the training and validation sets are evaluated so the log shows the
# train/validation RMSE side by side.
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=5000,
    callbacks=callbacks,
)

# Predict
# Select the test features by the training column names (and in the training
# column order) instead of "everything except id": a missing column raises a
# KeyError here rather than the model silently scoring misaligned features.
test_features = Podcast_Test_df[list(X_train.columns)]
preds = model.predict(test_features, num_iteration=model.best_iteration)

# Clip predictions
# NOTE(review): the bounds are hard-coded; confirm the upper bound against the
# actual range of Listening_Time_minutes.
PRED_MIN, PRED_MAX = 0, 200
preds = np.clip(preds, PRED_MIN, PRED_MAX)

# Prepare submission
sub = pd.DataFrame({
    "id": Podcast_Test_df["id"],
    "Listening_Time_minutes": preds
})
sub.to_csv("submission.csv", index=False)
print("✅ Submission file created!")

Training until validation scores don't improve for 100 rounds
[150]	training's rmse: 13.2498	valid_1's rmse: 13.3455
[300]	training's rmse: 12.98	valid_1's rmse: 13.1151
[450]	training's rmse: 12.9172	valid_1's rmse: 13.0877
[600]	training's rmse: 12.8701	valid_1's rmse: 13.0756
[750]	training's rmse: 12.8268	valid_1's rmse: 13.0645
[900]	training's rmse: 12.7875	valid_1's rmse: 13.0532
[1050]	training's rmse: 12.751	valid_1's rmse: 13.0455
[1200]	training's rmse: 12.7146	valid_1's rmse: 13.0382
[1350]	training's rmse: 12.6787	valid_1's rmse: 13.0295
[1500]	training's rmse: 12.6446	valid_1's rmse: 13.0228
[1650]	training's rmse: 12.6119	valid_1's rmse: 13.0167
[1800]	training's rmse: 12.5787	valid_1's rmse: 13.011
[1950]	training's rmse: 12.5468	valid_1's rmse: 13.0056
[2100]	training's rmse: 12.5153	valid_1's rmse: 12.9994
[2250]	training's rmse: 12.4858	valid_1's rmse: 12.9943
[2400]	training's rmse: 12.4564	valid_1's rmse: 12.9896
[2550]	training's rmse: 12.4257	valid_1's rmse: 12.9