**Structure**
--
1. Data Collection
2. Data Cleaning, Data Binning and Label Encoding
3. Model Choosing
4. Summary

**1. Data Collection**
--

**Import Data and Required Packages**

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import statistics
from sklearn.ensemble import VotingRegressor, StackingRegressor, RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Ridge
from scipy.stats import randint, uniform, loguniform

**Import the CSV Data as Pandas DataFrame**

In [47]:
start = time.time()
test = pd.read_csv("data/test.csv")
train = pd.read_csv("data/train.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")
print("Files loaded in", time.time()-start, "seconds")

Files loaded in 1.9657115936279297 seconds


**2. Data Cleaning, Data Binning and Label Encoding**
--

In [48]:
X_train = train.drop(['id', 'Listening_Time_minutes'], axis=1)
Y_train = train['Listening_Time_minutes']
X_test = test.drop('id', axis=1)

In [49]:
X_train['Number_of_Ads']=X_train['Number_of_Ads'].fillna(X_train['Number_of_Ads'].median())
X_train['Number_of_Ads'].isna().sum()

np.int64(0)

**Replacing nullable data in `Guest_Popularity_percentage` with 0**

In [50]:
X_train['Guest_Popularity_percentage']=X_train['Guest_Popularity_percentage'].fillna(0)
X_train['Guest_Popularity_percentage'].isna().sum()

np.int64(0)

In [51]:
X_test['Guest_Popularity_percentage']=X_test['Guest_Popularity_percentage'].fillna(0)
X_test['Guest_Popularity_percentage'].isna().sum()

np.int64(0)

**Using Simple Encoder**

In [52]:
imputer = SimpleImputer(strategy='median')
num_without_id = ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']
X_train[num_without_id] = imputer.fit_transform(train[num_without_id])
print(f"Training data has been transformed")
X_test[num_without_id] = imputer.transform(test[num_without_id])
print(f"Testing data has been transformed")

Training data has been transformed
Testing data has been transformed


**Handling outliers**

In [53]:
def find_anomalies(data):
    anomalies=[]
    data_std = statistics.stdev(data)
    data_mean = statistics.mean(data)
    anomaly_cut_off = data_std * 3
    lower_limit = data_mean - anomaly_cut_off
    upper_limit = data_mean +anomaly_cut_off
    for outlier in data:
        if outlier > upper_limit or outlier < lower_limit:
            anomalies.append(outlier)
            print(f"The value of the outlier: {outlier}")
    return anomalies

def list_outliers(data, columns):
    anomalies = []
    for col in columns:
        outliers = find_anomalies(data[col])
        print(f"In the {col} column there are {len(outliers)} outliers")
        anomalies.append((col, outliers))
    return anomalies

In [54]:
print("Train outliers:")
train_anomalies = list_outliers(X_train, num_without_id)
print("Test outliers:")
test_anomalies = list_outliers(X_test, num_without_id)

Train outliers:
The value of the outlier: 325.24
In the Episode_Length_minutes column there are 1 outliers
In the Host_Popularity_percentage column there are 0 outliers
In the Guest_Popularity_percentage column there are 0 outliers
The value of the outlier: 53.37
The value of the outlier: 103.91
The value of the outlier: 103.0
The value of the outlier: 53.42
The value of the outlier: 103.75
The value of the outlier: 12.0
The value of the outlier: 103.25
The value of the outlier: 103.25
The value of the outlier: 103.88
In the Number_of_Ads column there are 9 outliers
Test outliers:
The value of the outlier: 78486264.0
In the Episode_Length_minutes column there are 1 outliers
In the Host_Popularity_percentage column there are 0 outliers
In the Guest_Popularity_percentage column there are 0 outliers
The value of the outlier: 89.12
The value of the outlier: 2063.0
In the Number_of_Ads column there are 2 outliers


In [55]:
def replace(data, value_up, value_low, columns):
    data_copy = data.copy()
    for column in columns:
        up_mask = data_copy[column] > value_up
        if up_mask.any():
            data_copy.loc[up_mask, column] = value_up
            print(f"{column}: replaced {up_mask.sum()} values with {value_up}")
        low_mask = data_copy[column] < value_low
        if low_mask.any():
            data_copy.loc[low_mask, column] = value_low
            print(f"{column}: replaced {low_mask.sum()} values with {value_low}")
    return data_copy

In [56]:
X_train_num = X_train[num_without_id]
X_test_num = X_test[num_without_id]
percentage_cols = ['Guest_Popularity_percentage', 'Host_Popularity_percentage']
X_train[num_without_id]= replace(X_train_num, 100,0,percentage_cols)
X_test[num_without_id] = replace(X_test_num, 100,0,percentage_cols)

Guest_Popularity_percentage: replaced 19 values with 100
Host_Popularity_percentage: replaced 25 values with 100
Guest_Popularity_percentage: replaced 5 values with 100
Host_Popularity_percentage: replaced 12 values with 100


In [57]:
def replace_outliers(train_data, test_data, columns):
    data_replaced_train = train_data.copy()
    data_replaced_test = test_data.copy()
    for column in columns:
        col_data_train = train_data[column]
        data_std = statistics.stdev(col_data_train)
        data_mean = statistics.mean(col_data_train)
        anomaly_cut_off = data_std * 3
        lower_limit = round(data_mean - anomaly_cut_off, 2)
        upper_limit = round(data_mean + anomaly_cut_off, 2)
        data_replaced_train = replace(data_replaced_train, upper_limit, lower_limit, [column])
        data_replaced_test = replace(data_replaced_test, upper_limit, lower_limit, [column])
    return data_replaced_train, data_replaced_test

In [58]:
outlier_cols = ['Episode_Length_minutes', 'Number_of_Ads']
X_train_replaced, X_test_replaced = replace_outliers(X_train, X_test, outlier_cols)
X_train_replaced = X_train_replaced.round({'Number_of_Ads':0})
X_test_replaced = X_test_replaced.round({'Number_of_Ads':0})
X_test = X_test_replaced
X_train = X_train_replaced

Episode_Length_minutes: replaced 1 values with 157.42
Episode_Length_minutes: replaced 2 values with 157.42
Number_of_Ads: replaced 9 values with 4.8
Number_of_Ads: replaced 2 values with 4.8


**`Episode_Title` to numerical column**

In [59]:
X_train['Episode_Title'] = X_train['Episode_Title'].str.extract('(\d+)').astype(int)
X_train['Episode_Title'].head()

0    98
1    26
2    16
3    45
4    86
Name: Episode_Title, dtype: int64

In [60]:
X_test['Episode_Title'] = X_test['Episode_Title'].str.extract('(\d+)').astype(int)
X_test['Episode_Title'].head()

0    73
1    23
2    11
3    73
4    50
Name: Episode_Title, dtype: int64

**Feature Binning**

In [61]:
X_train['Ads_per_Popularity'] = X_train['Number_of_Ads'] / (X_train['Host_Popularity_percentage'] + 0.1)
X_test['Ads_per_Popularity'] = X_test['Number_of_Ads'] / (X_test['Host_Popularity_percentage'] + 0.1)
X_train['Length_Ads_Ratio'] = X_train['Episode_Length_minutes'] / (X_train['Number_of_Ads'] + 1)
X_test['Length_Ads_Ratio'] = X_test['Episode_Length_minutes'] / (X_test['Number_of_Ads'] + 1)
X_train['Length_Popularity_Interaction'] = X_train['Episode_Length_minutes'] * (X_train['Host_Popularity_percentage']+X_train['Guest_Popularity_percentage'])
X_test['Length_Popularity_Interaction'] = X_test['Episode_Length_minutes'] * (X_test['Host_Popularity_percentage']+X_test['Guest_Popularity_percentage'])

**Label Encoding**

In [62]:
ordinal_columns = ['Episode_Sentiment', 'Publication_Day', 'Publication_Time', 'Podcast_Name', 'Genre']

label_encoders = {}
for col in ordinal_columns:
    le = LabelEncoder()
    X_train[f'{col}_Encoded'] = le.fit_transform(X_train[col])
    X_test[f'{col}_Encoded'] = le.transform(X_test[col])
    label_encoders[col] = le
X_train = X_train.drop(ordinal_columns, axis=1)
X_test = X_test.drop(ordinal_columns, axis=1)

**3. Model Choosing**
--

In [None]:
np.random.seed(42)
def create_and_validate_ensemble(X_train, y_train, test_size=0.2):
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train, y_train, test_size=test_size, random_state=42
    )
    param_grid_lgd = {
        'n_estimators': randint(100, 300),
        'learning_rate': [0.005, 0.01, 0.05],
        'max_depth': [5, 7, 9],
        'num_leaves': [63, 127, 255],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9],
        'reg_alpha': [0, 0.1, 0.5],
        'reg_lambda': [0.1, 1, 5]
    }

    param_grid_xgb = {
        'n_estimators': randint(100, 300),
        'max_depth': [5, 7, 9],
        'learning_rate': [0.005, 0.01, 0.05],
        'colsample_bytree': [0.7, 0.8, 0.9],
        'subsample': [0.7, 0.8, 0.9],
        'reg_alpha': [0, 0.1, 0.5],
        'reg_lambda': [1, 5, 10],
        'gamma': [0, 0.1, 0.5]
    }

    param_grid_cat = {
        'iterations': randint(200, 500),
        'learning_rate': [0.005, 0.01, 0.05],
        'depth': [6, 8, 10],
        'l2_leaf_reg': [1, 3, 5, 10],
        'random_strength': [0.5, 1, 2],
        'bagging_temperature': [0, 0.5, 1],
        'leaf_estimation_iterations': [5, 10]
    }
    lgb = LGBMRegressor(random_state=42, verbose=-1, n_jobs=1)
    random_search_lgd = RandomizedSearchCV(
        estimator=lgb,
        param_distributions=param_grid_lgd,
        n_iter=8,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=1,
        random_state=42,
        return_train_score=True
    )

    random_search_lgd.fit(X_tr, y_tr)
    best_params_lgd = random_search_lgd.best_params_
    print("Best parameters for LGBMRegressor:", best_params_lgd)

    xgb = XGBRegressor(random_state=42, n_jobs=1)

    random_search_xgb = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=param_grid_xgb,
        n_iter=8,
        scoring='neg_root_mean_squared_error', 
        cv=3,
        n_jobs=-1,
        random_state=42,
        verbose=1,
        return_train_score=True
    )
    random_search_xgb.fit(X_tr, y_tr)
    best_params_xgb = random_search_xgb.best_params_
    print("Best parameters for XGBRegressor:", best_params_xgb)


    catboost = CatBoostRegressor(
        random_seed=42,
        verbose=False,
        thread_count=-1
    )

    random_search_cat = RandomizedSearchCV(
        estimator=catboost,
        param_distributions=param_grid_cat,
        n_iter=8,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_jobs=1,
        random_state=42,
        verbose=1,
        return_train_score=True
    )
    random_search_cat.fit(X_tr, y_tr)
    best_params_cat = random_search_cat.best_params_
    print("Best parameters for CatBoostRegressor:", best_params_cat)

    param_dist = {
        'n_estimators': randint(50, 200),
        'max_depth': [10, 15, None],
        'min_samples_split': randint(2, 10),
        'max_features': ['sqrt', 'log2']
    }

    rf = RandomForestRegressor(random_state=42)
    random_search = RandomizedSearchCV(
        rf, param_dist, n_iter=8, cv=3,
        n_jobs=-1, verbose=1,scoring='neg_root_mean_squared_error', random_state=42
    )
    random_search.fit(X_tr, y_tr)
    best_rf_params = random_search.best_params_
    print(f'Best parameters for RandomForest: {best_rf_params}')

    
    voting_ensemble = VotingRegressor([
        ('lgb', LGBMRegressor(**best_params_lgd,
            random_state=42
        )),
        ('xgb', XGBRegressor(**best_params_xgb,
            random_state=42
        )),
        ('cat', CatBoostRegressor(**best_params_cat,
            random_seed=42
        )), 
        ('rf', RandomForestRegressor(**best_rf_params,
            random_state=42
        ))
    ])
    stacking_ensemble = StackingRegressor([
        ('lgb', LGBMRegressor(**best_params_lgd,
            random_state=42
        )),
        ('xgb', XGBRegressor(**best_params_xgb,
            random_state=42
        )),
        ('cat', CatBoostRegressor(**best_params_cat,
            random_seed=42
        )), 
        ('rf', RandomForestRegressor(**best_rf_params,
            random_state=42
        ))
    ], final_estimator=Ridge(alpha=1.0), cv=3)
    
    
    models = {
        'LightGBM': LGBMRegressor(**best_params_lgd, random_state=42),
        'XGBoost': XGBRegressor(**best_params_xgb, random_state=42),
        'CatBoost': CatBoostRegressor(**best_params_cat, random_state=42),
        'RandomForest': RandomForestRegressor(**best_rf_params, random_state=42),
        'Voting_Ensemble': voting_ensemble,
        'Stacking_Ensemble': stacking_ensemble,
    }
    return models, X_tr, X_val, y_tr, y_val

models, X_tr, X_val, y_tr, y_val = create_and_validate_ensemble(X_train, Y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters for LGBMRegressor: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 180, 'num_leaves': 127, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 0.8}
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters for XGBRegressor: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 171, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.9}
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters for CatBoostRegressor: {'bagging_temperature': 0, 'depth': 10, 'iterations': 334, 'l2_leaf_reg': 1, 'leaf_estimation_iterations': 5, 'learning_rate': 0.05, 'random_strength': 1}
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters for RandomForest: {'max_depth': 20, 'max_features': 0.3, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 207}


In [76]:
def evaluate_models(models, X_tr, X_val, y_tr, y_val):
    results = {}
    
    print("=== TRAINING AND MODEL EVALUATION ===")
    
    for name, model in models.items():
        print(f"\n--- MODEL TRAINING {name} ---")
        
        model.fit(X_tr, y_tr)
        
        y_pred = model.predict(X_val)
        
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        
        results[name] = {
            'model': model,
            'rmse': rmse,
            'predictions': y_pred
        }
        
        print(f"RMSE: {rmse:.4f}")
    
    return results

results = evaluate_models(models, X_tr, X_val, y_tr, y_val)

=== TRAINING AND MODEL EVALUATION ===

--- MODEL TRAINING LightGBM ---
RMSE: 12.9951

--- MODEL TRAINING XGBoost ---
RMSE: 12.9512

--- MODEL TRAINING CatBoost ---
0:	learn: 26.1359168	total: 186ms	remaining: 1m 1s
1:	learn: 25.1915761	total: 366ms	remaining: 1m
2:	learn: 24.3191000	total: 565ms	remaining: 1m 2s
3:	learn: 23.5027033	total: 745ms	remaining: 1m 1s
4:	learn: 22.7232281	total: 938ms	remaining: 1m 1s
5:	learn: 21.9999716	total: 1.11s	remaining: 1m
6:	learn: 21.3231512	total: 1.27s	remaining: 59.6s
7:	learn: 20.6914874	total: 1.45s	remaining: 59.2s
8:	learn: 20.1028890	total: 1.64s	remaining: 59.1s
9:	learn: 19.5554012	total: 1.86s	remaining: 1m
10:	learn: 19.0444330	total: 2.05s	remaining: 1m
11:	learn: 18.5764583	total: 2.26s	remaining: 1m
12:	learn: 18.1401108	total: 2.55s	remaining: 1m 3s
13:	learn: 17.7368229	total: 2.78s	remaining: 1m 3s
14:	learn: 17.3589889	total: 3.03s	remaining: 1m 4s
15:	learn: 17.0126061	total: 3.28s	remaining: 1m 5s
16:	learn: 16.6939723	total: 

In [39]:
best_model_name = min(results, key=lambda x: results[x]['rmse'])
best_result = results[best_model_name]
best_model = best_result['model']
y_pred = best_model.predict(X_test)

In [42]:
results = pd.DataFrame({
    'id': range(750000, 750000 + len(y_pred)),
    'Listening_Time_minutes': y_pred
})

results.to_csv('data/sample_submission.csv', index=False, float_format='%.3f')
print("Predictions were saved in predictions.csv")

Predictions were saved in predictions.csv
