In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Import models
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, HistGradientBoostingRegressor
from sklearn.svm import SVR

# Import XGBoost and CatBoost
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Import LightGBM
from lightgbm import LGBMRegressor

# Import metrics for evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Import numpy and pandas (for preprocessing and evaluation)
import numpy as np
import pandas as pd

In [4]:
df=pd.read_csv("data/train.csv")
test_df=pd.read_csv("data/test.csv")
submission_df=pd.read_csv("data/sample_submission.csv")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 68.7+ MB


# Missing Values

In [6]:

## Here we will check the percentage of nan values present in each feature
## 1 -step make the list of features which has missing values
features_with_na=[features for features in df.columns if df[features].isnull().sum()>0]
## 2- step print the feature name and the percentage of missing values

for feature in features_with_na:
    print(feature, np.round(df[feature].isnull().mean(), 4),  ' % missing values')

Episode_Length_minutes 0.1161  % missing values
Guest_Popularity_percentage 0.1947  % missing values
Number_of_Ads 0.0  % missing values


there are some missing value but since i don't want to remove and loose any information i will fill them with median value

In [7]:
# Fill missing values with median for specified columns in both dataframes
columns_to_fill = ['Number_of_Ads', 'Episode_Length_minutes', 'Guest_Popularity_percentage']
# Calculate medians from TRAINING DATA only
train_medians = df[columns_to_fill].median()

# Apply training medians to BOTH datasets
df[columns_to_fill] = df[columns_to_fill].fillna(train_medians)
test_df[columns_to_fill] = test_df[columns_to_fill].fillna(train_medians)  # Use train's median

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       750000 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  750000 non-null  float64
 9   Number_of_Ads                750000 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 68.7+ MB


In [9]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           250000 non-null  int64  
 1   Podcast_Name                 250000 non-null  object 
 2   Episode_Title                250000 non-null  object 
 3   Episode_Length_minutes       250000 non-null  float64
 4   Genre                        250000 non-null  object 
 5   Host_Popularity_percentage   250000 non-null  float64
 6   Publication_Day              250000 non-null  object 
 7   Publication_Time             250000 non-null  object 
 8   Guest_Popularity_percentage  250000 non-null  float64
 9   Number_of_Ads                250000 non-null  float64
 10  Episode_Sentiment            250000 non-null  object 
dtypes: float64(4), int64(1), object(6)
memory usage: 21.0+ MB


In [10]:
df.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,63.84,True Crime,74.81,Thursday,Night,53.58,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [11]:
df.nunique()

id                             750000
Podcast_Name                       48
Episode_Title                     100
Episode_Length_minutes          12268
Genre                              10
Host_Popularity_percentage       8038
Publication_Day                     7
Publication_Time                    4
Guest_Popularity_percentage     10019
Number_of_Ads                      12
Episode_Sentiment                   3
Listening_Time_minutes          42807
dtype: int64

In [12]:
df.duplicated().sum()

0

In [13]:
df.describe()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,374999.5,64.427546,59.859901,52.498047,1.348854,45.437406
std,216506.495284,30.996996,22.873098,25.537152,1.15113,27.138306
min,0.0,0.0,1.3,0.0,0.0,0.0
25%,187499.75,39.42,39.41,34.55,0.0,23.17835
50%,374999.5,63.84,60.05,53.58,1.0,43.37946
75%,562499.25,90.31,79.53,71.04,2.0,64.81158
max,749999.0,325.24,119.46,119.91,103.91,119.97


In [14]:
numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']

# print columns
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 6 numerical features : ['id', 'Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Listening_Time_minutes']

We have 6 categorical features : ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']


# Feature Engineering

In [15]:
day_to_num = {
    'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 
    'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6
}

df['Publication_Day'] = df['Publication_Day'].str.capitalize()
df['Day_num'] = df['Publication_Day'].map(day_to_num)
if df['Day_num'].isna().any():
    print("Warning: Missing or invalid Publication_Day values")
    df['Day_num'].fillna(0, inplace=True)  # Default to Monday

In [16]:
df.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Day_num
0,0,Mystery Matters,Episode 98,63.84,True Crime,74.81,Thursday,Night,53.58,0.0,Positive,31.41998,3
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241,5
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531,1
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824,0
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031,0


# Model Training

In [17]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from textblob import TextBlob
import optuna
import warnings

# Suppress warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Global flag for target transformation
TARGET_TRANSFORMED = False

In [18]:
def load_data():
    try:
        train_df = pd.read_csv('data/train.csv')
        test_df = pd.read_csv('data/test.csv')
        print("Data loaded successfully.")
        return train_df, test_df
    except FileNotFoundError:
        print("Error: train.csv or test.csv not found. Make sure they are in a 'data' subdirectory.")
        exit()

# Load data
train_df, test_df = load_data()

# Keep test IDs
test_ids = test_df['id']

# Prepare data
X = train_df.drop(columns=['Listening_Time_minutes', 'id'])
y = train_df['Listening_Time_minutes']
X_submission_test_features = test_df.drop(columns=['id'])

# Split data
X_train_df, X_val_df, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Training set shape: {X_train_df.shape}, Validation set shape: {X_val_df.shape}")

# Check target skewness
print("Original y_train skewness:", y_train.skew())

Data loaded successfully.
Training set shape: (600000, 10), Validation set shape: (150000, 10)
Original y_train skewness: 0.3502503762244965


In [19]:
def engineer_features(df, is_train=True, target_series=None, vectorizer=None, mean_listen_time=None, mean_genre=None, scaler=None):
    df = df.copy()

    # Cyclical encoding for Publication_Day
    day_to_num = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
    df['Publication_Day'] = df['Publication_Day'].str.capitalize().fillna('Unknown')
    df['Day_num'] = df['Publication_Day'].map(day_to_num).fillna(0)
    df['Day_sin'] = np.sin(2 * np.pi * df['Day_num'] / 7)
    df['Day_cos'] = np.cos(2 * np.pi * df['Day_num'] / 7)

    # Cyclical encoding for Publication_Time
    time_to_hour = {'Night': 20, 'Morning': 8, 'Afternoon': 14, 'Evening': 18}
    df['Publication_Time'] = df['Publication_Time'].fillna('Unknown')
    df['Hour'] = df['Publication_Time'].map(time_to_hour).fillna(12)
    df['Hour_sin'] = np.sin(2 * np.pi * df['Hour'] / 24)
    df['Hour_cos'] = np.cos(2 * np.pi * df['Hour'] / 24)

    # Title features
    df['Episode_Title'] = df['Episode_Title'].fillna('')
    df['Title_Length'] = df['Episode_Title'].apply(len)
    df['Title_Word_Count'] = df['Episode_Title'].apply(lambda x: len(x.split()))
    df['Title_Sentiment'] = df['Episode_Title'].apply(lambda x: TextBlob(x).sentiment.polarity if x else 0)

    # New features
    df['Podcast_Frequency'] = df['Podcast_Name'].map(df['Podcast_Name'].value_counts())
    df['Genre_Length'] = df['Genre'].astype(str) + '_' + pd.cut(df['Episode_Length_minutes'], bins=3, labels=['short', 'medium', 'long']).astype(str)
    df['Genre_Length'] = df['Genre_Length'].astype('category').cat.codes

    # Target encoding
    if is_train:
        if target_series is None:
            raise ValueError("target_series must be provided when is_train=True.")
        temp_target = target_series.copy()
        global_mean = temp_target.mean()
        mean_listen_time = temp_target.groupby(df['Podcast_Name'].fillna('Unknown')).mean().to_dict()
        mean_genre = temp_target.groupby(df['Genre'].fillna('Unknown')).mean().to_dict()
        mean_listen_time['global'] = global_mean
        mean_genre['global'] = global_mean
    else:
        if mean_listen_time is None or mean_genre is None:
            raise ValueError("mean_listen_time and mean_genre must be provided when is_train=False.")
        global_mean = mean_listen_time.get('global', 0)

    df['Podcast_Mean_Listen'] = df['Podcast_Name'].fillna('Unknown').map(mean_listen_time).fillna(global_mean)
    df['Genre_Mean_Listen'] = df['Genre'].fillna('Unknown').map(mean_genre).fillna(global_mean)

    # Interaction features
    num_cols_for_interactions = ['Host_Popularity_percentage', 'Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads']
    for col in num_cols_for_interactions:
        if col in df.columns:
            df[col] = df[col].fillna(0)
    df['Host_Pop_x_Length'] = df['Host_Popularity_percentage'] * df['Episode_Length_minutes']
    df['Guest_Pop_x_Length'] = df['Guest_Popularity_percentage'] * df['Episode_Length_minutes']
    df['Ads_x_Length'] = df['Number_of_Ads'] * df['Episode_Length_minutes']
    df['Host_Guest_Pop'] = df['Host_Popularity_percentage'] * df['Guest_Popularity_percentage']

    # Normalize interactions
    interaction_cols = ['Host_Pop_x_Length', 'Guest_Pop_x_Length', 'Ads_x_Length', 'Host_Guest_Pop']
    df[interaction_cols] = df[interaction_cols].fillna(0)
    if is_train:
        scaler = StandardScaler()
        df[interaction_cols] = scaler.fit_transform(df[interaction_cols])
    else:
        if scaler is None:
            raise ValueError("scaler must be provided when is_train=False.")
        df[interaction_cols] = scaler.transform(df[interaction_cols])

    # Episode sentiment
    df['Episode_Sentiment_Score'] = df['Episode_Sentiment'].map({'Negative': -1, 'Neutral': 0, 'Positive': 1}).fillna(0)

    # TF-IDF vectorization
    if is_train:
        vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1,3), min_df=5, stop_words='english')
        X_text = vectorizer.fit_transform(df['Episode_Title'])
    else:
        if vectorizer is None:
            raise ValueError("vectorizer must be provided when is_train=False.")
        X_text = vectorizer.transform(df['Episode_Title'])

    # Combine features
    numerical_cols = [
        'Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage',
        'Number_of_Ads', 'Day_sin', 'Day_cos', 'Hour_sin', 'Hour_cos', 'Title_Length',
        'Title_Word_Count', 'Title_Sentiment', 'Podcast_Mean_Listen', 'Genre_Mean_Listen',
        'Host_Pop_x_Length', 'Guest_Pop_x_Length', 'Ads_x_Length', 'Host_Guest_Pop',
        'Episode_Sentiment_Score', 'Podcast_Frequency', 'Genre_Length'
    ]
    existing_numerical_cols = [col for col in numerical_cols if col in df.columns]
    X_numerical = df[existing_numerical_cols].fillna(0).values
    X = hstack([X_text, X_numerical]).tocsr()

    if is_train:
        return X, vectorizer, mean_listen_time, mean_genre, scaler
    return X, None, None, None, None

# Apply feature engineering
print("Engineering features for training data...")

Engineering features for training data...


In [20]:
train=pd.read_csv("data/train.csv")
test=pd.read_csv("data/test.csv")

In [25]:
from scipy.sparse import hstack
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Feature engineering function
def engineer_features(df, is_train=True, target_series=None, vectorizer=None, mean_listen_time=None, mean_genre=None, scaler=None):
    df = df.copy()

    # Cyclical encoding for Publication_Day
    day_to_num = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
    df['Publication_Day'] = df['Publication_Day'].str.capitalize().fillna('Unknown')
    df['Day_num'] = df['Publication_Day'].map(day_to_num).fillna(0)
    df['Day_sin'] = np.sin(2 * np.pi * df['Day_num'] / 7)
    df['Day_cos'] = np.cos(2 * np.pi * df['Day_num'] / 7)

    # Cyclical encoding for Publication_Time
    time_to_hour = {'Night': 20, 'Morning': 8, 'Afternoon': 14, 'Evening': 18}
    df['Publication_Time'] = df['Publication_Time'].fillna('Unknown')
    df['Hour'] = df['Publication_Time'].map(time_to_hour).fillna(12)
    df['Hour_sin'] = np.sin(2 * np.pi * df['Hour'] / 24)
    df['Hour_cos'] = np.cos(2 * np.pi * df['Hour'] / 24)

    # Title features
    df['Episode_Title'] = df['Episode_Title'].fillna('')
    df['Title_Length'] = df['Episode_Title'].apply(len)
    df['Title_Word_Count'] = df['Episode_Title'].apply(lambda x: len(x.split()))
    df['Title_Sentiment'] = df['Episode_Title'].apply(lambda x: TextBlob(x).sentiment.polarity if x else 0)

    # New features
    df['Podcast_Frequency'] = df['Podcast_Name'].map(df['Podcast_Name'].value_counts())
    df['Genre_Length'] = df['Genre'].astype(str) + '_' + pd.cut(df['Episode_Length_minutes'], bins=3, labels=['short', 'medium', 'long']).astype(str)
    df['Genre_Length'] = df['Genre_Length'].astype('category').cat.codes

    # Target encoding
    if is_train:
        if target_series is None:
            raise ValueError("target_series must be provided when is_train=True.")
        temp_target = target_series.copy()
        global_mean = temp_target.mean()
        mean_listen_time = temp_target.groupby(df['Podcast_Name'].fillna('Unknown')).mean().to_dict()
        mean_genre = temp_target.groupby(df['Genre'].fillna('Unknown')).mean().to_dict()
        mean_listen_time['global'] = global_mean
        mean_genre['global'] = global_mean
    else:
        if mean_listen_time is None or mean_genre is None:
            raise ValueError("mean_listen_time and mean_genre must be provided when is_train=False.")
        global_mean = mean_listen_time.get('global', 0)

    df['Podcast_Mean_Listen'] = df['Podcast_Name'].fillna('Unknown').map(mean_listen_time).fillna(global_mean)
    df['Genre_Mean_Listen'] = df['Genre'].fillna('Unknown').map(mean_genre).fillna(global_mean)

    # Interaction features
    num_cols_for_interactions = ['Host_Popularity_percentage', 'Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads']
    for col in num_cols_for_interactions:
        if col in df.columns:
            df[col] = df[col].fillna(0)
    df['Host_Pop_x_Length'] = df['Host_Popularity_percentage'] * df['Episode_Length_minutes']
    df['Guest_Pop_x_Length'] = df['Guest_Popularity_percentage'] * df['Episode_Length_minutes']
    df['Ads_x_Length'] = df['Number_of_Ads'] * df['Episode_Length_minutes']
    df['Host_Guest_Pop'] = df['Host_Popularity_percentage'] * df['Guest_Popularity_percentage']

    # Normalize interactions
    interaction_cols = ['Host_Pop_x_Length', 'Guest_Pop_x_Length', 'Ads_x_Length', 'Host_Guest_Pop']
    df[interaction_cols] = df[interaction_cols].fillna(0)
    if is_train:
        scaler = StandardScaler()
        df[interaction_cols] = scaler.fit_transform(df[interaction_cols])
    else:
        if scaler is None:
            raise ValueError("scaler must be provided when is_train=False.")
        df[interaction_cols] = scaler.transform(df[interaction_cols])

    # Episode sentiment
    df['Episode_Sentiment_Score'] = df['Episode_Sentiment'].map({'Negative': -1, 'Neutral': 0, 'Positive': 1}).fillna(0)

    # TF-IDF vectorization
    if is_train:
        vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1,3), min_df=5, stop_words='english')
        X_text = vectorizer.fit_transform(df['Episode_Title'])
    else:
        if vectorizer is None:
            raise ValueError("vectorizer must be provided when is_train=False.")
        X_text = vectorizer.transform(df['Episode_Title'])

    # Combine features
    numerical_cols = [
        'Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage',
        'Number_of_Ads', 'Day_sin', 'Day_cos', 'Hour_sin', 'Hour_cos', 'Title_Length',
        'Title_Word_Count', 'Title_Sentiment', 'Podcast_Mean_Listen', 'Genre_Mean_Listen',
        'Host_Pop_x_Length', 'Guest_Pop_x_Length', 'Ads_x_Length', 'Host_Guest_Pop',
        'Episode_Sentiment_Score', 'Podcast_Frequency', 'Genre_Length'
    ]
    existing_numerical_cols = [col for col in numerical_cols if col in df.columns]
    X_numerical = df[existing_numerical_cols].fillna(0).values
    X = hstack([X_text, X_numerical]).tocsr()

    if is_train:
        return X, vectorizer, mean_listen_time, mean_genre, scaler
    return X, None, None, None, None

# Load data
print("Loading data...")
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Apply feature engineering
print("Engineering features for training data...")
X, vectorizer, mean_listen_time, mean_genre, scaler = engineer_features(
    df=train,
    is_train=True,
    target_series=train['Listening_Time_minutes']
)

print("Engineering features for test data...")
X_test, _, _, _, _ = engineer_features(
    df=test,
    is_train=False,
    target_series=None,
    vectorizer=vectorizer,
    mean_listen_time=mean_listen_time,
    mean_genre=mean_genre,
    scaler=scaler
)

y = train['Listening_Time_minutes']

# Define models
xgb = XGBRegressor(
    n_estimators=101,
    max_depth=6,
    learning_rate=0.01018372008142483,
    tree_method='hist',
    eval_metric='mae',
    reg_lambda=0.1,
    random_state=42,
    n_jobs=-1,
    objective='reg:squarederror'
)

lgb = LGBMRegressor(
    n_estimators=320,
    max_depth=6,
    learning_rate=0.22042348987717186,
    force_row_wise=True,
    num_leaves=31,
    min_child_samples=20,
    reg_lambda=0.1,
    random_state=42,
    n_jobs=-1
)

# Cross-validation setup
print("Starting cross-validation...")
kf = KFold(n_splits=5, shuffle=True, random_state=42)
stacking_preds = np.zeros(X.shape[0])
test_preds = np.zeros(X_test.shape[0])
rmses = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f'Fold {fold + 1}')
    X_train = X[train_idx]
    X_val = X[val_idx]
    y_train = y.iloc[train_idx]
    y_val = y.iloc[val_idx]

    # Train individual models
    xgb.fit(X_train, y_train)
    lgb.fit(X_train, y_train)

    # Stacking
    stacking = StackingRegressor(
        estimators=[
            ('xgb', XGBRegressor(
                n_estimators=101,
                max_depth=6,
                learning_rate=0.01018372008142483,
                tree_method='hist',
                eval_metric='mae',
                reg_lambda=0.1,
                random_state=42,
                n_jobs=-1,
                objective='reg:squarederror'
            )),
            ('lgb', LGBMRegressor(
                n_estimators=320,
                max_depth=6,
                learning_rate=0.22042348987717186,
                force_row_wise=True,
                num_leaves=31,
                min_child_samples=20,
                reg_lambda=0.1,
                random_state=42,
                n_jobs=-1
            ))
        ],
        final_estimator=XGBRegressor(
            n_estimators=100,
            learning_rate=0.1,
            tree_method='hist',
            random_state=42,
            n_jobs=-1
        ),
        cv=5
    )
    stacking.fit(X_train, y_train)

    # Validate
    val_preds = stacking.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f'Fold {fold + 1} RMSE: {rmse}')
    rmses.append(rmse)

    # Store predictions
    stacking_preds[val_idx] = val_preds
    test_preds += stacking.predict(X_test) / kf.n_splits

# Blend with a single LightGBM model
print("Training final LightGBM model...")
lgb_final = LGBMRegressor(
    n_estimators=320,
    max_depth=6,
    learning_rate=0.22042348987717186,
    force_row_wise=True,
    num_leaves=31,
    min_child_samples=20,
    reg_lambda=0.1,
    random_state=42,
    n_jobs=-1
)
lgb_final.fit(X, y)
lgb_test_preds = lgb_final.predict(X_test)
final_test_preds = 0.7 * test_preds + 0.3 * lgb_test_preds

# Print CV RMSE
print(f'Mean CV RMSE: {np.mean(rmses)}')

Loading data...
Engineering features for training data...
Engineering features for test data...
Starting cross-validation...
Fold 1
[LightGBM] [Info] Total Bins 2416
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 201
[LightGBM] [Info] Start training from score 45.447808
[LightGBM] [Info] Total Bins 2416
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 201
[LightGBM] [Info] Start training from score 45.447808
[LightGBM] [Info] Total Bins 2416
[LightGBM] [Info] Number of data points in the train set: 480000, number of used features: 201
[LightGBM] [Info] Start training from score 45.438867
[LightGBM] [Info] Total Bins 2416
[LightGBM] [Info] Number of data points in the train set: 480000, number of used features: 201
[LightGBM] [Info] Start training from score 45.438309
[LightGBM] [Info] Total Bins 2416
[LightGBM] [Info] Number of data points in the train set: 480000, number of used features: 201
[LightG

In [26]:
# Create submission
print("Creating submission file...")
submission = pd.read_csv('data/sample_submission.csv')
submission['Listening_Time_minutes'] = final_test_preds
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

Creating submission file...
Submission file created: submission.csv
