# Deep Neural Network

In [None]:
%%time
# Offline installs: --no-index stops pip from contacting PyPI and --find-links
# points it at package directories attached under /kaggle/input.
!pip install -q --no-index -U --find-links=/kaggle/input/tensorflow-2-15/tensorflow tensorflow==2.15.0
!pip install -q --no-index -U --find-links=/kaggle/input/deeptables-v0-2-5/deeptables-0.2.5 deeptables==0.2.5
# NOTE(review): deeptables is installed twice (0.2.5 above, then 0.2.6 from the
# "fix-deeptables" input); the later install wins. Presumably 0.2.5 is only
# there to satisfy dependencies -- confirm before removing either line.
!pip install -q --no-index -U --find-links=/kaggle/input/fix-deeptables/deeptables-0.2.6 deeptables==0.2.6

In [None]:
import os
import gc
import shap
import math
import ctypes
import random
import warnings
import matplotlib.pyplot as plt
import numpy as np, pandas as pd
from colorama import Fore, Style
from itertools import combinations
from numpy.typing import ArrayLike
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator
from sklearn.model_selection import KFold
from category_encoders import TargetEncoder
from sklearn.preprocessing import QuantileTransformer

import tensorflow as tf, deeptables as dt
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers.legacy import Adam
from deeptables.utils.shap import DeepTablesExplainer
from deeptables.models import DeepTable, ModelConfig, deepnets

# Silence library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')
# tf.test.is_gpu_available() is deprecated; TensorFlow documents
# tf.config.list_physical_devices('GPU') as its replacement. bool() keeps the
# printed value a plain True/False as before.
print('TensorFlow version:', tf.__version__ + ',',
      'GPU =', bool(tf.config.list_physical_devices('GPU')))
print('DeepTables version:', dt.__version__)

In [None]:
def seed_everything(seed):
    """Apply one seed to PYTHONHASHSEED, `random`, NumPy and TensorFlow."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same order as before: stdlib RNG, NumPy RNG, TensorFlow RNG.
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


seed_everything(seed=42)

def clean_memory():
    """Run the garbage collector, then ask glibc to hand freed heap back to the OS.

    ``malloc_trim`` is a glibc extension. When ``libc.so.6`` cannot be loaded
    or does not export the symbol, the trim step is skipped instead of
    raising, so the helper stays usable on other platforms.
    """
    gc.collect()
    try:
        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except (OSError, AttributeError):
        # OSError: library not found; AttributeError: symbol missing.
        pass


clean_memory()

def print_memory_usage(X, X_test, wording='default'):
    """Print the deep memory footprint of `X` and `X_test` in units of 1024*1024 bytes.

    `wording` is inserted into both lines to label the pipeline stage.
    """
    bytes_per_mb = 1024 * 1024

    train_mb = X.memory_usage(deep=True).sum() / bytes_per_mb
    print(f"Memory usage {wording}      X: {train_mb:.2f} MB")

    test_mb = X_test.memory_usage(deep=True).sum() / bytes_per_mb
    print(f"Memory usage {wording} X_test: {test_mb:.2f} MB\n")

In [None]:
# Competition train/test files plus the original podcast dataset.
train = pd.read_csv("/kaggle/input/playground-series-s5e4/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s5e4/test.csv")
train_orig = pd.read_csv('/kaggle/input/podcast-listening-time-prediction-dataset/podcast_dataset.csv')
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("Train original shape:", train_orig.shape, '\n')

# Stack both training sources, drop exact duplicate rows, discard rows with no
# target, and renumber the index from zero.
train = (
    pd.concat([train, train_orig], ignore_index=True)
    .drop_duplicates()
    .dropna(subset=['Listening_Time_minutes'])
    .reset_index(drop=True)
)
print("Train combied shape:", train.shape)

In [None]:
# Episode length rounded to 0, 1 and 2 decimals, added to both frames under
# the names collected in ELM.
ELM = [f'ELm_r{k}' for k in range(3)]
for k, col_name in enumerate(ELM):
    for frame in (train, test):
        frame[col_name] = frame['Episode_Length_minutes'].round(k)

In [None]:
def target_encoding(train, target, test=None, feat_to_encode=None, min_samples_leaf=1, smoothing=0.1):
    """Target-encode `feat_to_encode`: out-of-fold for train, full-fit for test.

    Args:
        train: training features. The caller's frame is no longer sorted in
            place.
        target: training target, positionally aligned with `train`.
        test: optional frame encoded with an encoder fit on all of `train`.
        feat_to_encode: columns to encode; defaults to every column of `train`.
        min_samples_leaf, smoothing: forwarded to `TargetEncoder`.

    Returns:
        `(train_encoded, test_encoded)`. `train_encoded` is the concatenation
        of the 5 out-of-fold parts sorted by index; `test_encoded` is `None`
        when `test` is `None`.
    """
    # The folds below are positional, so if the rows have to be put in index
    # order the target must be reordered with them. The previous in-place
    # sort moved only `train` and mutated the caller's frame.
    if not train.index.is_monotonic_increasing:
        order = np.argsort(train.index.to_numpy(), kind='stable')
        train = train.iloc[order]
        target = target.iloc[order]
    if feat_to_encode is None:
        feat_to_encode = train.columns.tolist()
    encoder_params = dict(cols=feat_to_encode, min_samples_leaf=min_samples_leaf, smoothing=smoothing)

    oof_parts = []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for tr_idx, val_idx in kf.split(train, target):
        # Each validation part is encoded by an encoder that never saw it.
        fold_encoder = TargetEncoder(**encoder_params)
        fold_encoder.fit(train.iloc[tr_idx], target.iloc[tr_idx])
        encoded = fold_encoder.transform(train.iloc[val_idx])
        encoded[feat_to_encode] = encoded[feat_to_encode].astype('float32')
        encoded.index = train.index[val_idx]
        oof_parts.append(encoded)

    if test is not None:
        # The full-data encoder is only needed for the test frame.
        final_encoder = TargetEncoder(**encoder_params)
        final_encoder.fit(train, target)
        test = final_encoder.transform(test)
        test[feat_to_encode] = test[feat_to_encode].astype('float32')

    train = pd.concat(oof_parts).sort_index()
    return train, test

In [None]:
%%time
# Split features/target and release the raw frames.
X = train.drop(['id', 'Listening_Time_minutes'], axis=1)
y = train.Listening_Time_minutes
X_test = test.drop(['id'], axis=1)
del train, test
print("X      shape:", X.shape)
print("X_test shape:", X_test.shape, '\n')

cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(exclude=['object']).columns.tolist()
print("init len(cat_cols):", len(cat_cols))
print("init len(num_cols):", len(num_cols), '\n')

# Only 1 missing value to fill
m_ = X['Number_of_Ads'].mode()[0]
X['Number_of_Ads'] = X['Number_of_Ads'].fillna(m_)
# Apply the same train-derived fill to the test frame so both are imputed
# consistently (a no-op when the test column has no missing values).
X_test['Number_of_Ads'] = X_test['Number_of_Ads'].fillna(m_)
# Fill missing values and create an indicator column.
# Iterate over a snapshot: the loop appends the indicator names to num_cols.
for c in list(num_cols):
    if X[c].isna().any():
        m = X[c].mean()
        X[f'NA_{c}'] = X[c].isna().astype('int8')
        X[c] = X[c].fillna(m)
        X_test[f'NA_{c}'] = X_test[c].isna().astype('int8')
        X_test[c] = X_test[c].fillna(m)
        num_cols.append(f'NA_{c}')


# Sizes of the exhaustive column combinations built from encode_cols.
pair_size = [2, 3, 4]
# Columns that take part in the exhaustive '._' combinations.
encode_cols = ['Episode_Length_minutes',
               'Number_of_Ads',
#               'Episode_Title',
               'Episode_Sentiment',
               'Publication_Day',
               'Publication_Time',
               'Podcast_Name',
#               'Genre',
               'Guest_Popularity_percentage',
               'Host_Popularity_percentage']

def eng_combos(df):
    """Add string-concatenation interaction columns to `df` and return it.

    Two families of columns are added (values joined with '_'):

    * one column per entry of the hand-picked `selected_comb` list, named
      '.' + '_'.join(columns), cast to category;
    * one column per 2-, 3- and 4-combination (`pair_size`) of the
      module-level `encode_cols`, named '._' + '_'.join(columns), stored as
      `pd.Categorical`.

    `df` is modified in place; it must contain every column referenced by
    `encode_cols` and `selected_comb`.
    """
    df_str_np = df[encode_cols].astype(str).values.astype('U')
    encoded_columns = []
    selected_comb = [
        ['Episode_Num', 'Host_Popularity_percentage'],
        ['Episode_Num', 'Guest_Popularity_percentage'],
        ['Episode_Num', 'Number_of_Ads'],
        ['ELm_r1', 'Episode_Num'],
        ['ELm_r1', 'Host_Popularity_percentage'],
        ['ELm_r1', 'Guest_Popularity_percentage'],
        ['ELm_r2', 'Episode_Num'],
        ['ELm_r2', 'Episode_Sentiment'],
        ['ELm_r2', 'Publication_Day'],
        ['ELm_r1', 'Number_of_Ads', 'Episode_Sentiment'],
        ['ELm_r2', 'Number_of_Ads', 'Podcast_Name'],
        ['Episode_Num', 'Podcast_Name'],
        ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage'],
        ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage'],
        ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads'],
        ['Episode_Length_minutes', 'Episode_Num', 'Episode_Sentiment'],
        ['Episode_Length_minutes', 'Episode_Num', 'Publication_Day'],
        ['Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage'],
        ['Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads'],
        ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment'],
        ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Day'],
        ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Time'],
        ['Episode_Num', 'Host_Popularity_percentage', 'Genre'],
        ['Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads'],
        ['Episode_Num', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
        ['Episode_Num', 'Guest_Popularity_percentage', 'Publication_Day'],
        ['Episode_Num', 'Guest_Popularity_percentage', 'Publication_Time'],
        ['Episode_Num', 'Guest_Popularity_percentage', 'Genre'],
        ['Episode_Num', 'Number_of_Ads', 'Episode_Sentiment'],
        ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage'],
        ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads'],
        ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment'],
        ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Publication_Day'],
        ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Publication_Time'],
        ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Genre'],
        ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads'],
        ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
        ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Publication_Day'],
        ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Publication_Time'],
        ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads', 'Episode_Sentiment'],
        ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads', 'Publication_Day'],
        ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads', 'Publication_Time'],
        ['Episode_Length_minutes', 'Episode_Num', 'Publication_Day', 'Publication_Time'],
        ['Episode_Length_minutes', 'Episode_Num', 'Publication_Day', 'Genre'],
        ['Episode_Length_minutes', 'Episode_Num', 'Publication_Time', 'Podcast_Name'],
        ['Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Day'],
        ['Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Time'],
        ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Publication_Day'],
        ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Publication_Time'],
        ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Genre'],
        ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time'],
        ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Time', 'Genre'],
        ['Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
        ['Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Genre'],
    ]

    for comb in selected_comb:
        name = '.' + '_'.join(comb)

        # Left-to-right join that works for any combination length. The
        # previous 2/3/4 ladder left the column unassigned for other lengths,
        # which made the category cast below raise KeyError.
        joined = df[comb[0]].astype(str)
        for col in comb[1:]:
            joined = joined + '_' + df[col].astype(str)
        df[name] = joined

        encoded_columns.append(name)

    df[encoded_columns] = df[encoded_columns].astype('category')
    for r in pair_size:
        for cols in combinations(range(len(encode_cols)), r):
            col_names = [encode_cols[i] for i in cols]
            new_col_name = '._' + '_'.join(col_names)
            # Vectorised join on the pre-stringified NumPy matrix.
            concat = df_str_np[:, cols[0]]
            for i in range(1, r):
                concat = np.char.add(np.char.add(concat, '_'), df_str_np[:, cols[i]])
            df[new_col_name] = pd.Categorical(concat)

    return df



def feat_eng(df, num_chunks=4, n_jobs=4):
    """Add hand-crafted numeric features and the combination columns from `eng_combos`.

    The first group of columns is written onto the passed frame directly.
    The frame is then split into `num_chunks` pieces, `eng_combos` is run on
    each piece with joblib (`n_jobs` workers), and the pieces are concatenated
    with a fresh 0..n-1 index.

    Returns:
        `(df, new_cat_cols, new_num_cols, new_enc_cols)` where the three lists
        hold the column names that end with '_', start with '_' and start
        with '.', respectively.
    """
    # Columns prefixed with '_' are collected as numeric features at the end.
    df['_Has_Ads'] = (df['Number_of_Ads'] > 0).astype('int8')
    df['_Is_Weekend'] = df['Publication_Day'].isin(['Saturday', 'Sunday']).astype('int8')
    df['_sqrt_Episode_Length_minutes'] = np.sqrt(df['Episode_Length_minutes']).astype('float32')
    df['_squared_Episode_Length_minutes'] = (df['Episode_Length_minutes'] ** 2).astype('float32')
    # Cyclic encodings: period 60 for episode length, period 20 for host popularity.
    df['_sin_Episode_Length_minutes'] = np.sin(2*np.pi * df['Episode_Length_minutes'] / 60).astype('float32')
    df['_cos_Episode_Length_minutes'] = np.cos(2*np.pi * df['Episode_Length_minutes'] / 60).astype('float32')
    df['_sin_Host_Popularity_percentage'] = np.sin(2*np.pi * df['Host_Popularity_percentage'] / 20).astype('float32')
    df['_cos_Host_Popularity_percentage'] = np.cos(2*np.pi * df['Host_Popularity_percentage'] / 20).astype('float32')
    # Drops the first 8 characters of the title -- presumably the "Episode "
    # prefix; the result stays a string.
    df['Episode_Num'] = df['Episode_Title'].str[8:]     
    # NOTE(review): same mask as '_Is_Weekend' above, stored under a name
    # without the '_' prefix, so it is not picked up by new_num_cols.
    df['is_weekend']   = df['Publication_Day'].isin(['Saturday', 'Sunday']).astype(int)
    
    time_dict = {'Morning': 0, 'Afternoon': 1, 'Evening': 2, 'Night': 3}
    df['Publication_Time_enc'] = df['Publication_Time'].replace(time_dict)
    # NOTE(review): with integer codes 0..3 and a divisor of 2 the sine term is
    # sin(pi*k), i.e. ~0 for every code, and the cosine only alternates +1/-1.
    # A period of 4 looks intended, but changing it alters the features the
    # saved models were trained on -- confirm before touching.
    df['_sin_Publication_Time'] = np.sin(2*np.pi * df['Publication_Time_enc'] / 2).astype('float32')
    df['_cos_Publication_Time'] = np.cos(2*np.pi * df['Publication_Time_enc'] / 2).astype('float32')
    df = df.drop(['Publication_Time_enc'], axis=1)
    
    # Row-wise chunks are processed independently and stitched back together.
    chunks = np.array_split(df, num_chunks)
    results = Parallel(n_jobs=n_jobs)(delayed(eng_combos)(chunk) for chunk in chunks)
    df = pd.concat(results, ignore_index=True)
            
    # Column families are recognised purely by naming convention.
    new_cat_cols = [col for col in df.columns if col.endswith('_')]
    new_num_cols = [col for col in df.columns if col.startswith('_')]
    new_enc_cols = [col for col in df.columns if col.startswith('.')]
    return df, new_cat_cols, new_num_cols, new_enc_cols
    
# Feature engineering on both frames. The column-name lists returned by the
# second call overwrite those from the first.
X, new_cat_cols, new_num_cols, new_enc_cols = feat_eng(X)
X_test, new_cat_cols, new_num_cols, new_enc_cols = feat_eng(X_test)
num_cols += new_num_cols; cat_cols += new_cat_cols
print("len(new_cat_cols):", len(new_cat_cols))
print("len(new_num_cols):", len(new_num_cols)+2)  # +2 NA indicator columns
print("len(new_enc_cols):", len(new_enc_cols), '\n')
print_memory_usage(X, X_test, wording='after feat eng')
clean_memory()

# Reduce memory usage: the exhaustive '._' combination columns are converted
# to int32 category codes. Train and test are stacked first so a given value
# receives the same code in both.
X_all = pd.concat([X, X_test])
for col in X_all.columns:
    if col.startswith('._'):
        X_all[col] = X_all[col].astype('category').cat.codes.astype('int32')
# The second slice uses len() of the freshly sliced X, which has the same length.
X = X_all.iloc[:len(X)]; X_test = X_all.iloc[len(X):]
print_memory_usage(X, X_test, wording='after reduce')
del X_all; clean_memory()

# Target-encode every column whose name starts with '.' (this includes the
# '._' columns coded above).
X, X_test = target_encoding(X, y, X_test, feat_to_encode=new_enc_cols)
print_memory_usage(X, X_test, wording='after encode')
clean_memory()

# Quantile-transform the numeric columns; the scaler is fit on train only.
scaler = QuantileTransformer(subsample=10**9)
X[num_cols] = scaler.fit_transform(X[num_cols]).astype(np.float32)
X_test[num_cols] = scaler.transform(X_test[num_cols]).astype(np.float32)
print_memory_usage(X, X_test, wording='after scale')
clean_memory()

# The target-encoded columns are added to num_cols only after scaling, so
# they are not passed through the quantile transformer.
num_cols += new_enc_cols
print("prep len(cat_cols):", len(cat_cols))
print("prep len(num_cols):", len(num_cols), '\n')

In [None]:
# https://www.kaggle.com/code/cdeotte/tensorflow-transformer-0-790/notebook
LR_START = 1e-7          # learning rate at epoch 0
LR_MAX = 1e-3            # peak learning rate
LR_MIN = 1e-7            # learning rate at the end of the cosine decay
LR_RAMPUP_EPOCHS = 2     # epochs of linear warm-up
LR_SUSTAIN_EPOCHS = 3    # epochs held at LR_MAX
EPOCHS = 9               # total epochs covered by the schedule

def lrfn(epoch):
    """Return the learning rate for a zero-based `epoch`.

    Linear ramp from LR_START to LR_MAX over LR_RAMPUP_EPOCHS, a plateau at
    LR_MAX for LR_SUSTAIN_EPOCHS, then a half-cosine decay that reaches
    LR_MIN at the last epoch (EPOCHS - 1).

    When the configuration leaves no room for a decay span, the decay phase
    returns LR_MIN instead of dividing by zero. Epochs past the end of the
    schedule stay at LR_MIN instead of following the cosine back up.
    """
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX

    decay_total_epochs = EPOCHS - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS - 1
    decay_epoch_index = epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS
    if decay_total_epochs <= 0:
        # Degenerate schedule: treat the decay as already finished.
        phase = math.pi
    else:
        # Clamp so the cosine never passes its minimum.
        phase = min(math.pi, math.pi * decay_epoch_index / decay_total_epochs)
    cosine_decay = 0.5 * (1 + math.cos(phase))
    return (LR_MAX - LR_MIN) * cosine_decay + LR_MIN

# Evaluate the schedule once per epoch, plot it, and wrap it in a Keras callback.
rng = list(range(EPOCHS))
lr_y = list(map(lrfn, rng))
plt.figure(figsize=(10, 4))
plt.plot(rng, lr_y, '-o')
plt.xlabel('Epoch')
plt.ylabel('LR')
print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(
    lr_y[0], max(lr_y), lr_y[-1]))
LR_Scheduler = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

In [None]:
class CFG:
    # Run settings plus the DeepTables ModelConfig, kept as class attributes.
    TRAIN = True
    FIT_VERBOSE = 2
    folds = 5
    epochs = 9            # same value as the EPOCHS constant of the LR schedule
    batch_size = 128
    LR_Scheduler = [LR_Scheduler]   # callback list built from the module-level scheduler
    optimizer = Adam(learning_rate=1e-3)

    # NOTE(review): nets lists only 'dnn_nets', yet autoint_params is passed
    # too -- presumably ignored unless an AutoInt net is enabled; confirm
    # against the DeepTables documentation.
    conf = ModelConfig(auto_imputation=False,
                       auto_discrete=True,
                       fixed_embedding_dim=True,
                       embeddings_output_dim=4,
                       embedding_dropout=0.3,
                       nets=['dnn_nets'],
                       dnn_params={
                           'hidden_units': ((1024, 0.3, True),
                                             (512, 0.3, True),
                                             (256, 0.3, True)),
                           'dnn_activation': 'relu',
                       },
                       autoint_params={
                            'num_attention': 3,
                            'num_heads': 1,
                            'dropout_rate': 0.0,
                            'use_residual': True,
                       },
                       stacking_op='concat',
                       output_use_bias=False,
                       optimizer=optimizer,
                       task='regression',
                       loss='auto',
                       metrics=['RootMeanSquaredError'],
                       earlystopping_patience=1,
                       )

In [None]:
# Create a scratch directory under /tmp/workdir that mirrors the models input path.
os.makedirs(f"/tmp/workdir/kaggle/input/predict-listening-deep-nn/models/", exist_ok=True)

In [None]:
# Copy the saved models into the scratch directory.
# NOTE(review): os.system only returns the shell exit status, so a failed copy
# does not raise. The load_model call further down reads from /kaggle/input,
# not from this copy -- confirm whether the copy is still needed.
os.system(f"cp -r /kaggle/input/predict-listening-deep-nn/models/* /tmp/workdir/kaggle/input/predict-listening-deep-nn/models/")

In [None]:
# Display the entries of the scratch models directory.
os.listdir('/tmp/workdir/kaggle/input/predict-listening-deep-nn/models/')

In [None]:
# Load models
def load_model(paths):
    """Load every saved DeepTable model found one level below `paths`.

    Each sub-directory of `paths` is scanned for files ending in '.h5', and
    each match is loaded with `DeepTable.load(directory, filename)`.
    Directories and files are visited in sorted order so the returned list is
    deterministic. Entries of `paths` that are not directories are skipped
    instead of raising.

    Returns:
        list of loaded models (empty when nothing matches).
    """
    models = []
    for fold in sorted(os.listdir(paths)):
        path = os.path.join(paths, fold)
        if not os.path.isdir(path):
            # A stray file next to the fold folders would make listdir fail.
            continue
        for file in sorted(os.listdir(path)):
            if file.endswith('.h5'):
                models.append(DeepTable.load(path, file))
    return models

# Load the saved fold models straight from the read-only input directory.
models = load_model("/kaggle/input/predict-listening-deep-nn/models")
print("\nmodels:", models)

In [None]:
# Inference
class AvgModel:
    """Ensemble wrapper that averages the predictions of several models."""

    def __init__(self, models: list[BaseEstimator]):
        self.models = models

    def predict(self, X: ArrayLike):
        """Return the element-wise mean of every member's flattened prediction.

        Each member is called as `predict(X, verbose=1, batch_size=512)`.
        """
        per_model = [
            member.predict(X, verbose=1, batch_size=512).flatten()
            for member in self.models
        ]
        return np.mean(per_model, axis=0)

# Average the loaded models' predictions on the prepared test frame.
avg_model = AvgModel(models)
test_pred = avg_model.predict(X_test)

In [None]:
# Write the DNN predictions into the sample-submission layout.
sub = pd.read_csv("/kaggle/input/playground-series-s5e4/sample_submission.csv")
# Attribute assignment replaces the column only if the CSV already has a
# column of this name -- presumably it does; verify against the file.
sub.Listening_Time_minutes = test_pred
sub.to_csv("submission_1.csv", index=False)
sub.head()

In [None]:
%reset -f

# CatBoost

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import pandas as pd
pd.options.mode.copy_on_write = True
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from cuml.preprocessing import TargetEncoder
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from itertools import combinations
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error

# from lightgbm import LGBMRegressor
import lightgbm as lgb

def process_combinations_fast(df, columns_to_encode, pair_size, max_batch_size=2000, sep=''):
    """Add one label-encoded interaction column per column combination.

    For every `r` in `pair_size` and every `r`-combination of
    `columns_to_encode`, the string forms of the columns are concatenated
    row-wise, label-encoded (codes start at 1) and stored in `df` under the
    name '+'.join(columns).

    Args:
        df: frame to extend; modified in place and returned.
        columns_to_encode: columns taking part in the combinations.
        pair_size: iterable of combination sizes.
        max_batch_size: combinations handled between progress messages.
        sep: separator placed between the joined values. The default ''
            reproduces the historical encoding. Without a separator,
            different value tuples can collapse into the same string
            ('1' + '23' == '12' + '3'); pass e.g. '_' to rule that out.

    Returns:
        `df` with the new columns added.
    """
    # Local stdlib imports keep the block self-contained. math.comb replaces
    # np.math.comb, which no longer exists in NumPy 2.x.
    from itertools import islice
    from math import comb

    # String versions of all columns, computed once.
    str_df = df[columns_to_encode].astype(str)
    le = LabelEncoder()

    for r in pair_size:
        print(f"Processing {r}-combinations")

        n_combinations = comb(len(columns_to_encode), r)
        print(f"Total {r}-combinations to process: {n_combinations}")

        combos_iter = combinations(columns_to_encode, r)
        # Counted per r so the progress message matches its denominator.
        processed = 0

        with tqdm(total=n_combinations) as pbar:
            while True:
                batch = list(islice(combos_iter, max_batch_size))
                if not batch:  # No more combinations
                    break

                for cols in batch:
                    result = str_df[cols[0]].copy()
                    for col in cols[1:]:
                        result += sep + str_df[col]

                    df['+'.join(cols)] = le.fit_transform(result) + 1
                    pbar.update(1)

                processed += len(batch)
                if len(batch) == max_batch_size:  # Only print on full batches
                    print(f"Progress: {processed}/{n_combinations} combinations processed")

        print(f"Completed all {r}-combinations. Total columns now: {len(df.columns)}")

    return df

TARGET = 'Listening_Time_minutes'
# Load data
df_train = pd.read_csv("/kaggle/input/playground-series-s5e4/train.csv")
df_train.drop(columns=['id'], inplace=True)
df_test = pd.read_csv('/kaggle/input/playground-series-s5e4/test.csv')
df_test.drop(columns=['id'], inplace=True)

original = pd.read_csv('/kaggle/input/podcast-listening-time-prediction-dataset/podcast_dataset.csv')

original_clean = original.dropna(subset=[TARGET]).drop_duplicates()
df_train = pd.concat([df_train, original_clean], axis=0, ignore_index=True)

# De-duplicate the training rows only. Dropping duplicates on the combined
# train+test frame could remove test rows; the positional split further down
# (`iloc[-len(df_test):]`) would then take training rows as test rows and the
# predictions would no longer line up with the submission file.
df_train = df_train.drop_duplicates()

df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# outlier removal
df['Episode_Length_minutes'] = np.maximum(0, np.minimum(120, df['Episode_Length_minutes']))
df['Host_Popularity_percentage'] = np.maximum(20, np.minimum(100, df['Host_Popularity_percentage']))
df['Guest_Popularity_percentage'] = np.maximum(0, np.minimum(100, df['Guest_Popularity_percentage']))
# NOTE(review): pd.cut bins are right-closed by default, so a value of exactly
# 20 (the lower clip bound) lands outside the first bin and becomes NaN.
df['Host_Popularity_bin'] = pd.cut(df['Host_Popularity_percentage'], bins=[20,40,60,80,100], labels=[1,2,3,4])
df.loc[df['Number_of_Ads'] > 3, 'Number_of_Ads'] = 0

# Encode categorical features
day_mapping = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}
df['Publication_Day'] = df['Publication_Day'].map(day_mapping)

time_mapping = {'Morning': 1, 'Afternoon': 2, 'Evening': 3, 'Night': 4}
df['Publication_Time'] = df['Publication_Time'].map(time_mapping)

sentiment_map = {'Negative': 1, 'Neutral': 2, 'Positive': 3}
df['Episode_Sentiment'] = df['Episode_Sentiment'].map(sentiment_map)

# Keep only the episode number from the title and use it as an integer feature.
df['Episode_Title'] = df['Episode_Title'].str.replace('Episode ', '', regex=True)
df['Episode_Title'] = df['Episode_Title'].astype('int')
df['Title_Episode_Length'] = df['Episode_Title'] / (df['Episode_Length_minutes'] + 1)
# Remaining object columns are label-encoded with codes starting at 1.
le = LabelEncoder()
for col in df.select_dtypes('object').columns:
    df[col] = le.fit_transform(df[col]) + 1

# Some Feature engineering
for col in ['Episode_Length_minutes']:
    df[[col + '_sqrt', col + '_squared']] = np.column_stack([
    np.sqrt(df[col]),
    df[col] ** 2
    ])

# Mean episode length per value of each listed column.
for col in tqdm(['Episode_Sentiment', 'Genre', 'Publication_Day', 'Podcast_Name', 'Episode_Title',
                 'Guest_Popularity_percentage', 'Host_Popularity_percentage', 'Number_of_Ads']):
    df[col + '_EP'] = df.groupby(col)['Episode_Length_minutes'].transform('mean')

df = process_combinations_fast(df, ['Episode_Length_minutes', 'Episode_Title', 'Publication_Time', 'Host_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 
                     'Publication_Day', 'Podcast_Name','Genre','Guest_Popularity_percentage'], [2, 3, 5, 7], 1000) # [2, 3, 5, 7]

df = df.astype('float32')

# Positional split: the test rows are the last len(df_test) rows of df.
df_train = df.iloc[:-len(df_test)]
df_test = df.iloc[-len(df_test):].reset_index(drop=True)

df_train = df_train[df_train['Listening_Time_minutes'].notnull()]

target = df_train.pop('Listening_Time_minutes')
df_test.pop('Listening_Time_minutes')

df_train.shape, df_test.shape

## Save and load model by pickle
"""
import pickle

# Suppose `model` is your trained LightGBM (or scikit-learn) estimator

# 1) Save to file
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# 2) Later—load from file
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# 3) Use it exactly like the original
y_pred = loaded_model.predict(X_new)

"""

import pickle
import gc 

import numpy as np
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor, Pool
from tqdm import tqdm
import pickle

# seed and CV: 7 shuffled folds; pred_test accumulates the per-fold test predictions.
seed = 42
cv = KFold(n_splits=7, random_state=seed, shuffle=True)
pred_test = np.zeros(df_test.shape[0])

# CatBoost params
catboost_params = {
    'iterations': 1_000_00,  # evaluates to 100000 (underscores are digit separators); early stopping ends training sooner
    'learning_rate': 0.02,    # You can experiment with decaying manually
    'depth': 8,               # CatBoost max_depth is 'depth'
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': seed,
    'verbose': 500,
    'early_stopping_rounds': 30,
    'task_type': 'GPU',       # Use 'CPU' if GPU not available
    'od_type': 'Iter',
}

# NOTE(review): all_histories is never appended to in the code visible here.
all_histories = []
# Column order of df_train decides which columns get a `_te` copy and which
# are replaced in place inside the fold loop.
features = df_train.columns.tolist()

# One CatBoost model per fold: target-encode inside the fold, train with early
# stopping on the validation part, and add the clipped test prediction to
# pred_test.
for fold, (trn_idx, val_idx) in enumerate(cv.split(df_train), 1):
    print(f"Starting Fold {fold}")
    X_trn, y_trn = df_train.iloc[trn_idx].copy(), target.iloc[trn_idx]
    X_val, y_val = df_train.iloc[val_idx].copy(), target.iloc[val_idx]
    X_sub = df_test[X_trn.columns.tolist()].copy()

    # === Target Encoding ===
    # A single encoder object is re-fit for every column in turn.
    # NOTE(review): presumably cuml's fit_transform does its own n_folds
    # out-of-fold encoding on the training part -- confirm in the cuml docs.
    encoder = TargetEncoder(n_folds=5, seed=seed, stat="mean")
    print(f"Fold {fold}: Applying Target Encoding...")

    # First 20 columns of `features`: keep the raw column and add a `<col>_te` copy.
    for col in tqdm(features[:20], desc=f"Fold {fold} TE‐add"):
        # Ensure the column exists in the dataframes before encoding
        if col in X_trn.columns:
            X_trn[f"{col}_te"] = encoder.fit_transform(X_trn[[col]], y_trn)
            X_val[f"{col}_te"] = encoder.transform(X_val[[col]])
            X_sub[f"{col}_te"] = encoder.transform(X_sub[[col]])
        else:
            print(f"Warning: Column '{col}' not found in training data for TE-add.")

    # Remaining columns: the raw values are overwritten by their encoding.
    for col in tqdm(features[20:], desc=f"Fold {fold} TE‐replace"):
        # Ensure the column exists in the dataframes before encoding
        if col in X_trn.columns:
            X_trn[col] = encoder.fit_transform(X_trn[[col]], y_trn)
            X_val[col] = encoder.transform(X_val[[col]])
            X_sub[col] = encoder.transform(X_sub[[col]])
        else:
             print(f"Warning: Column '{col}' not found in training data for TE-replace.")

    # Create CatBoost pools
    train_pool = Pool(X_trn, label=y_trn)
    valid_pool = Pool(X_val, label=y_val)

    print(f"Fold {fold}: Training CatBoost model...")
    model = CatBoostRegressor(**catboost_params)
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
    

    # Predict on test; predictions are clipped to [0, 120] before accumulation.
    test_pred = model.predict(X_sub)
    pred_test += np.clip(test_pred, 0, 120)

    print(f"Fold {fold} finished, best_iteration={model.get_best_iteration()}")
    print("-" * 60)

    # Release the per-fold objects before the next fold.
    del model
    del encoder
    del X_trn, y_trn, X_val, y_val, X_sub
    gc.collect()

# average over folds
pred_test /= cv.n_splits

print("Training complete. Test predictions averaged across folds.")


pred_test

# Write the CatBoost predictions into the sample-submission layout.
df_sub = pd.read_csv("/kaggle/input/playground-series-s5e4/sample_submission.csv")
df_sub.Listening_Time_minutes = pred_test
df_sub.to_csv('submission_2.csv', index=False)

In [None]:
%reset -f

# XGBoost

In [None]:
import os
# Print the full path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.simplefilter('ignore')
from sklearn.base import BaseEstimator, TransformerMixin

class OrderedTargetEncoder(BaseEstimator, TransformerMixin):
    """Mean-rank target encoder with optional smoothing.

    Each category is replaced by the rank (0 = lowest) of its target mean.
    `fit` builds one map per CV training fold plus a global map on all rows;
    `transform` picks the map according to its `fold` argument. Categories
    that are missing from the selected map are encoded as -1.

    Args:
        cat_cols: columns to encode; when None, the object-dtype columns of
            the frame passed to `fit` are used (and stored on the instance).
        n_splits: number of KFold splits used for the per-fold maps.
        smoothing: additive smoothing weight toward the overall target mean
            (0 disables smoothing).
    """
    def __init__(self, cat_cols=None, n_splits=5, smoothing=0):
        self.cat_cols   = cat_cols
        self.n_splits   = n_splits
        self.smoothing  = smoothing       # 0 = no smoothing
        self.maps_      = {}              # per-fold maps: {col: [map_fold0, ...]}
        self.global_map = {}              # maps fit on the full data

    def _make_fold_map(self, X_col, y):
        """Return {category: rank} with categories ordered by (smoothed) target mean."""
        means = y.groupby(X_col, dropna=False).mean()
        if self.smoothing > 0:
            counts = y.groupby(X_col, dropna=False).count()
            # Shrink each category mean toward the overall mean by `smoothing` pseudo-counts.
            means = (counts * means + self.smoothing * y.mean()) / (counts + self.smoothing)
        return {k: r for r, k in enumerate(means.sort_values().index)}

    def fit(self, X, y):
        """Build the per-fold rank maps and the global rank map; returns self."""
        # Imported locally: this notebook section starts after a namespace
        # reset and does not import KFold before the class is defined.
        from sklearn.model_selection import KFold

        X, y = X.reset_index(drop=True), y.reset_index(drop=True)
        if self.cat_cols is None:
            self.cat_cols = X.select_dtypes(include='object').columns.tolist()

        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        self.maps_ = {col: [None]*self.n_splits for col in self.cat_cols}

        # Only the training side of each split is used to build that fold's map.
        for fold, (tr_idx, _) in enumerate(kf.split(X)):
            X_tr, y_tr = X.loc[tr_idx], y.loc[tr_idx]
            for col in self.cat_cols:
                self.maps_[col][fold] = self._make_fold_map(X_tr[col], y_tr)

        for col in self.cat_cols:
            self.global_map[col] = self._make_fold_map(X[col], y)

        return self

    def transform(self, X, y=None, fold=None):
        """Return a copy of `X` with `cat_cols` replaced by integer ranks.

        With `fold` given, the map built from that fold's training rows is
        used; with `fold=None`, the global map is used. Values absent from
        the chosen map become -1.
        """
        X = X.copy()
        tgt_maps = {col: (self.global_map[col] if fold is None else self.maps_[col][fold])
                    for col in self.cat_cols}
        for col, mapping in tgt_maps.items():
            X[col] = X[col].map(mapping).fillna(-1).astype(int)
        return X

def target_encode(df_train, df_val, col, target, stats='mean', prefix='TE'):
    """Add target-statistic columns for ``col`` to a copy of ``df_val``.

    df_train : frame holding ``col`` and ``target``; the statistics are
               computed per value of ``col`` on this frame.
    df_val   : frame to encode; it is copied, never modified.
    col      : name of the grouping column.
    target   : name of the target column in ``df_train``.
    stats    : one aggregation (name or callable) or a list/tuple of them.
    prefix   : prefix of the new column names (``{prefix}_{col}_{stat}``).

    With a single statistic, values of ``col`` that have no statistic are
    filled with the mean of the per-group statistics.  With a list/tuple of
    statistics no filling is done and such rows stay NaN.

    Returns the encoded copy of ``df_val``.
    """
    df_val = df_val.copy()
    agg = df_train.groupby(col)[target].agg(stats)
    if isinstance(stats, (list, tuple)):
        for s in stats:
            # pandas names the aggregated column after the string or, for a
            # callable, after its __name__
            key = s if isinstance(s, str) else s.__name__
            colname = f"{prefix}_{col}_{key}"
            df_val[colname] = df_val[col].map(agg[key]).astype(float)
    else:
        suffix = stats if isinstance(stats, str) else stats.__name__
        colname = f"{prefix}_{col}_{suffix}"
        encoded = df_val[col].map(agg).astype(float)
        # Assign the filled Series back explicitly: an in-place fillna on
        # df_val[colname] acts on a temporary under pandas copy-on-write and
        # would silently leave the NaNs in place.
        df_val[colname] = encoded.fillna(agg.mean())
    return df_val

# Names of the interaction columns; feature_engineer() appends one entry per
# combination in selected_comb, and the prediction code reads this list to
# know which columns to target-encode.
encoded_columns = []

# Column combinations turned into categorical interaction features by
# feature_engineer(): the string values of the listed columns are joined with
# '_' and the new column is named '_'.join(<columns>).
# 'ELm_r1', 'ELm_r2' and 'Linear_Feature' are derived columns that
# feature_engineer() creates before it builds the combinations.
selected_comb = [
     # 2-interaction
     ['Episode_Length_minutes', 'Host_Popularity_percentage'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage'],
    ['Episode_Length_minutes', 'Number_of_Ads'],
    ['Episode_Num', 'Host_Popularity_percentage'],
    ['Episode_Num', 'Guest_Popularity_percentage'],
    ['Episode_Num', 'Number_of_Ads'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage'],
    ['Host_Popularity_percentage', 'Number_of_Ads'],
    ['Host_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Podcast_Name'],
    ['Episode_Num', 'Podcast_Name'],
    ['Guest_Popularity_percentage', 'Podcast_Name'],
    ['ELm_r1', 'Episode_Num'],
    ['ELm_r1', 'Host_Popularity_percentage'],
    ['ELm_r1', 'Guest_Popularity_percentage'],
    ['ELm_r2', 'Episode_Num'],
    ['ELm_r2', 'Episode_Sentiment'],
    ['ELm_r2', 'Publication_Day'],
    ['Linear_Feature', 'Number_of_Ads'],
    ['Linear_Feature', 'Genre'],
    ['Linear_Feature', 'Episode_Sentiment'],


    # 3-interaction
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage'],
    ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Episode_Num', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Episode_Num', 'Publication_Day'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Length_minutes', 'Episode_Sentiment', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Day'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Genre'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Genre'],
    ['Episode_Num', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Day'],
    ['Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Guest_Popularity_percentage', 'Number_of_Ads', 'Genre'],
    ['ELm_r1', 'Number_of_Ads', 'Episode_Sentiment'],
    ['ELm_r2', 'Number_of_Ads', 'Podcast_Name'],
    ['Linear_Feature', 'Podcast_Name', 'Episode_Num'], # added for testing
    ['Linear_Feature', 'Genre', 'Number_of_Ads'], # added for testing

    # 4-interaction
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Genre'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads', 'Publication_Time'],
    ['Episode_Length_minutes', 'Episode_Num', 'Publication_Day', 'Publication_Time'],
    ['Episode_Length_minutes', 'Episode_Num', 'Publication_Day', 'Genre'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Publication_Day', 'Genre'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Publication_Time'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Genre'],
    ['Episode_Length_minutes', 'Episode_Num', 'Publication_Time', 'Podcast_Name'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Publication_Day'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Genre'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Time', 'Genre'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Genre'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Podcast_Name'],
    ['Host_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 'Podcast_Name'],
    ['Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Day', 'Podcast_Name'],
    ['Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Time', 'Podcast_Name'],

]

def feature_engineer(data1, data2):
    """Build the shared feature set for the train (``data1``) and test (``data2``) frames.

    The two frames are stacked so that every derived column is computed
    consistently, then split apart again.  Steps:

    * ``Episode_Num``: ``Episode_Title`` with its first 8 characters removed
      (``Episode_Title`` itself is dropped);
    * ``is_weekend``: 1 when ``Publication_Day`` is Saturday or Sunday;
    * missing ``Episode_Length_minutes``, ``Guest_Popularity_percentage`` and
      ``Number_of_Ads`` are filled with the median over the stacked frame;
    * ``Linear_Feature`` = 0.72 * ``Episode_Length_minutes``;
    * ``ELm_r0`` .. ``ELm_r2``: episode length rounded to 0, 1 and 2 decimals;
    * one categorical column per entry of the module-level ``selected_comb``:
      the string values of the listed columns joined with ``'_'``.

    Side effect: the name of every interaction column is registered in the
    module-level ``encoded_columns`` list (each name at most once, so calling
    this function repeatedly does not create duplicates).

    The input frames are not modified.  Returns ``(train_dataset,
    test_dataset)``; both keep the row index of the stacked frame (train rows
    first, starting at 0).
    """
    # assign() works on copies, so the caller's frames do not gain an
    # 'is_train' column.
    combined_dataset = pd.concat(
        [data1.assign(is_train=1), data2.assign(is_train=0)],
        ignore_index=True,
    )
    combined_dataset['Episode_Num'] = combined_dataset['Episode_Title'].str[8:]
    combined_dataset['is_weekend'] = combined_dataset['Publication_Day'].isin(['Saturday', 'Sunday']).astype(int)
    combined_dataset = combined_dataset.drop(columns=['Episode_Title'])

    # fill missing values with the median of the stacked (train + test) column
    for col in ('Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads'):
        combined_dataset[col] = combined_dataset[col].fillna(combined_dataset[col].median())

    # add linear feature
    combined_dataset['Linear_Feature'] = 0.72 * combined_dataset['Episode_Length_minutes']
    for k in range(3):
        combined_dataset[f'ELm_r{k}'] = combined_dataset['Episode_Length_minutes'].round(k)

    # String form of each source column, converted once and reused by every
    # combination that mentions the column.
    text_cache = {}

    def _as_text(col):
        if col not in text_cache:
            text_cache[col] = combined_dataset[col].astype(str)
        return text_cache[col]

    for comb in selected_comb:
        name = '_'.join(comb)
        if len(comb) >= 2:
            # Works for any number of columns, not only 2, 3 or 4.
            joined = _as_text(comb[0])
            for col in comb[1:]:
                joined = joined + '_' + _as_text(col)
        else:
            # A single-column "combination" names an existing column; it is
            # only converted to categorical.
            joined = combined_dataset[name]
        # Convert immediately so the long string columns never accumulate.
        combined_dataset[name] = joined.astype('category')
        if name not in encoded_columns:
            encoded_columns.append(name)

    # divide back to train and test dataset
    is_train_row = combined_dataset['is_train'] == 1
    train_dataset = combined_dataset[is_train_row].drop(columns=['is_train'])
    test_dataset = combined_dataset[~is_train_row].drop(columns=['is_train'])
    return train_dataset, test_dataset

# Name of the regression target column.
TARGET = 'Listening_Time_minutes'
# Base categorical columns; xgBoost118Predict() encodes them with OrderedTargetEncoder.
CATS = ['Podcast_Name', 'Episode_Num', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
# Base numeric columns passed to the model as they are.
NUMS = ['Episode_Length_minutes', 'Host_Popularity_percentage', 
        'Guest_Popularity_percentage', 'Number_of_Ads', 'Linear_Feature']
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
# Target statistics computed for every interaction column (one TE_<col>_<stat> column each).
encode_stats = ['mean']
FOLDS = 7 # number of CV folds; xgBoost118Predict() loads one pickled model per fold
import gc
import pickle
def xgBoost118Predict():
    """Predict the test set with the seven pre-trained, pickled XGBoost fold models.

    The competition train set is extended with the original podcast dataset,
    both frames go through ``feature_engineer``, and the same shuffled
    ``KFold(FOLDS, random_state=42)`` split is replayed.  For each fold:

    1. every interaction column in ``encoded_columns`` is target-encoded on
       the test rows with statistics taken from that fold's training rows,
       after which the raw interaction columns are dropped;
    2. the ``CATS`` columns of the test rows are replaced by the ranks of an
       ``OrderedTargetEncoder`` fitted on the fold's training rows;
    3. ``xgb_model{fold}.pkl`` is loaded and its test prediction accumulated.

    Only the test rows are encoded.  The prediction depends on a training
    fold solely through the per-group target statistics and the ordinal maps,
    so out-of-fold encodings of the training rows and encodings of the
    validation rows have no influence on the result and are not computed.

    Returns the test predictions averaged over the folds (1-D numpy array).
    """
    train_dataset = pd.read_csv('/kaggle/input/playground-series-s5e4/train.csv')
    test_dataset = pd.read_csv('/kaggle/input/playground-series-s5e4/test.csv')
    original_dataset = pd.read_csv('/kaggle/input/podcast-listening-time-prediction-dataset/podcast_dataset.csv')
    original_dataset = original_dataset.dropna(subset=[TARGET]).drop_duplicates()

    train_dataset = pd.concat([train_dataset, original_dataset], axis=0, ignore_index=True)
    del original_dataset
    print('Starting Engineering')
    train_dataset, test_dataset = feature_engineer(train_dataset, test_dataset)
    print('Finish Engineering')
    # The test rows only carry an all-NaN target column from the stacking step.
    test_dataset = test_dataset.drop(columns=[TARGET])

    FEATURES = NUMS + CATS + encoded_columns
    test_features = test_dataset[FEATURES]

    print('Start Predict model')
    pred = np.zeros(len(test_dataset))
    outer_kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    for fold, (tr_idx, _) in enumerate(outer_kf.split(train_dataset), 1):
        print(f"-- Fold {fold}/{FOLDS} --")
        # The train rows come first in the stacked frame, so their index
        # labels are 0..n-1 and the positional fold indices are valid labels.
        X_tr = train_dataset.loc[tr_idx, FEATURES].reset_index(drop=True)
        y_tr = train_dataset.loc[tr_idx, TARGET].reset_index(drop=True)
        full_tr = pd.concat([X_tr, y_tr], axis=1)

        # Target-encode the test rows; target_encode() returns a new frame,
        # so test_features itself is never modified.
        X_ts = test_features
        for col in encoded_columns:
            for stat in encode_stats:
                X_ts = target_encode(
                    full_tr, X_ts, col, TARGET,
                    stats=stat,
                    prefix='TE'
                )
        del full_tr
        X_ts = X_ts.drop(columns=encoded_columns)

        enc = OrderedTargetEncoder(cat_cols=CATS, n_splits=FOLDS, smoothing=20)
        enc.fit(X_tr, y_tr)
        X_ts[CATS] = enc.transform(X_ts[CATS])

        # NOTE(security): pickle.load executes code stored in the file; only
        # load model files that come from a trusted source.
        with open(f'/kaggle/input/xgboostver11-8/xgb_model{fold}.pkl','rb') as f:
            model = pickle.load(f)
        pred += model.predict(X_ts)

        del X_tr, y_tr, X_ts, enc, model
        gc.collect()
        print(f'Finish fold {fold}')
    print('Finish predicting task')
    del train_dataset, test_dataset, test_features
    gc.collect()
    return pred / FOLDS

def runThis():
    """Run the XGBoost inference and write its predictions to ``submission_3.csv``.

    The predictions are placed in the ``Listening_Time_minutes`` column of the
    sample submission, which supplies the ids and the row order.
    """
    predict_from_model_1 = xgBoost118Predict()
    print(predict_from_model_1)
    # A second model (xgboost117predict) used to be blended in here; only the
    # first model is used now.
    pred_final = predict_from_model_1
    print(pred_final)
    df_sub = pd.read_csv("/kaggle/input/playground-series-s5e4/sample_submission.csv")
    # Item assignment targets the column explicitly; attribute assignment
    # would only set a plain Python attribute if the column were missing.
    df_sub['Listening_Time_minutes'] = pred_final
    df_sub.to_csv('submission_3.csv', index = False)
runThis()

In [None]:
%reset -f

# LightGBM

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import pandas as pd
pd.options.mode.copy_on_write = True
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from cuml.preprocessing import TargetEncoder
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from itertools import combinations
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error

# from lightgbm import LGBMRegressor
import lightgbm as lgb

def process_combinations_fast(df, columns_to_encode, pair_size, max_batch_size=2000):
    """Add one label-encoded interaction column per combination of columns.

    df                : frame to extend; columns are added in place and the
                        same frame is returned.
    columns_to_encode : names of the columns to combine.
    pair_size         : iterable of combination sizes ``r``; every
                        ``r``-combination of ``columns_to_encode`` is built.
    max_batch_size    : a progress line is printed after every
                        ``max_batch_size`` combinations of the current size
                        (no progress lines when it is not positive).

    Each new column is named ``'+'.join(cols)`` and holds integer codes
    starting at 1, obtained by label-encoding the string values of the
    combined columns joined with ``'|'``.
    """
    import math  # local import: np.math was removed in NumPy 2.0

    # Precompute string versions of all columns once
    str_df = df[columns_to_encode].astype(str)
    le = LabelEncoder()
    # A separator keeps distinct value tuples distinct: without one,
    # ('1', '23') and ('12', '3') would collapse into the same key '123'.
    separator = '|'

    for r in pair_size:
        print(f"Processing {r}-combinations")

        n_combinations = math.comb(len(columns_to_encode), r)
        print(f"Total {r}-combinations to process: {n_combinations}")

        processed = 0  # counted per r so it is comparable with n_combinations
        with tqdm(total=n_combinations) as pbar:
            for cols in combinations(columns_to_encode, r):
                result = str_df[cols[0]]
                for col in cols[1:]:
                    result = result + separator + str_df[col]

                df['+'.join(cols)] = le.fit_transform(result) + 1
                pbar.update(1)

                processed += 1
                if max_batch_size > 0 and processed % max_batch_size == 0:
                    print(f"Progress: {processed}/{n_combinations} combinations processed")

        print(f"Completed all {r}-combinations. Total columns now: {len(df.columns)}")

    return df

# --- Data preparation for the LightGBM model --------------------------------
# Builds `df_train`, `target` and `df_test` (all float32) from the competition
# files plus the original podcast dataset.
TARGET = 'Listening_Time_minutes'
# Load data
df_train = pd.read_csv("/kaggle/input/playground-series-s5e4/train.csv")
df_train.drop(columns=['id'], inplace=True)
df_test = pd.read_csv('/kaggle/input/playground-series-s5e4/test.csv')
df_test.drop(columns=['id'], inplace=True)

original = pd.read_csv('/kaggle/input/podcast-listening-time-prediction-dataset/podcast_dataset.csv')

original_clean = original.dropna(subset=[TARGET]).drop_duplicates()
df_train = pd.concat([df_train, original_clean], axis=0, ignore_index=True)

# Remove duplicates from the training rows only.  De-duplicating the stacked
# train+test frame could also delete test rows, after which the positional
# split below would hand training rows to the test set and misalign the
# submission.
df_train = df_train.drop_duplicates()
n_test_rows = len(df_test)

df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# outlier removal: clip the numeric columns to fixed ranges
df['Episode_Length_minutes'] = np.maximum(0, np.minimum(120, df['Episode_Length_minutes']))
df['Host_Popularity_percentage'] = np.maximum(20, np.minimum(100, df['Host_Popularity_percentage']))
df['Guest_Popularity_percentage'] = np.maximum(0, np.minimum(100, df['Guest_Popularity_percentage']))
# NOTE: the bins are right-closed, (20, 40], (40, 60], ..., so a host
# popularity of exactly 20 (the clipping floor) falls outside them and gets
# a missing bin.
df['Host_Popularity_bin'] = pd.cut(df['Host_Popularity_percentage'], bins=[20,40,60,80,100], labels=[1,2,3,4])
df.loc[df['Number_of_Ads'] > 3, 'Number_of_Ads'] = 0

# Encode categorical features
day_mapping = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}
df['Publication_Day'] = df['Publication_Day'].map(day_mapping)

time_mapping = {'Morning': 1, 'Afternoon': 2, 'Evening': 3, 'Night': 4}
df['Publication_Time'] = df['Publication_Time'].map(time_mapping)

sentiment_map = {'Negative': 1, 'Neutral': 2, 'Positive': 3}
df['Episode_Sentiment'] = df['Episode_Sentiment'].map(sentiment_map)

# 'Episode 12' -> 12 (plain substring replacement, no regex needed)
df['Episode_Title'] = df['Episode_Title'].str.replace('Episode ', '', regex=False)
df['Episode_Title'] = df['Episode_Title'].astype('int')
df['Title_Episode_Length'] = df['Episode_Title'] / (df['Episode_Length_minutes'] + 1)
# Label-encode the remaining object columns with codes starting at 1
le = LabelEncoder()
for col in df.select_dtypes('object').columns:
    df[col] = le.fit_transform(df[col]) + 1

# Some Feature engineering
for col in ['Episode_Length_minutes']:
    df[[col + '_sqrt', col + '_squared']] = np.column_stack([
    np.sqrt(df[col]),
    df[col] ** 2
    ])

# Mean episode length within each group of the listed columns
for col in tqdm(['Episode_Sentiment', 'Genre', 'Publication_Day', 'Podcast_Name', 'Episode_Title',
                 'Guest_Popularity_percentage', 'Host_Popularity_percentage', 'Number_of_Ads']):
    df[col + '_EP'] = df.groupby(col)['Episode_Length_minutes'].transform('mean')

df = process_combinations_fast(df, ['Episode_Length_minutes', 'Episode_Title', 'Publication_Time', 'Host_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 
                     'Publication_Day', 'Podcast_Name','Genre','Guest_Popularity_percentage'], [2, 3, 5, 7], 1000) # [2, 3, 5, 7]

df = df.astype('float32')

# Split back by position: the test rows are the last n_test_rows rows.
n_train_rows = len(df) - n_test_rows
df_train = df.iloc[:n_train_rows]
df_test = df.iloc[n_train_rows:].reset_index(drop=True)

df_train = df_train[df_train['Listening_Time_minutes'].notnull()]

target = df_train.pop('Listening_Time_minutes')
df_test.pop('Listening_Time_minutes')

df_train.shape, df_test.shape

import pickle

import numpy as np
from sklearn.model_selection import KFold
import lightgbm as lgb
from tqdm import tqdm
# TargetEncoder used below is the cuML one imported above (cuml.preprocessing), not category_encoders
# from category_encoders import TargetEncoder

# --- 7-fold LightGBM training with per-fold target encoding ------------------
seed = 42
cv = KFold(n_splits=7, random_state=seed, shuffle=True)
pred_test = np.zeros(df_test.shape[0])

def lr_decay(env):
    """
    Exponentially decaying learning rate schedule.
    Args:
        env: either a LightGBM callback environment (its ``iteration``
            attribute is used) or the boosting round as a plain integer.
    Returns:
        float: The learning rate for the given round, decaying from 0.02
        towards 0.005.
    """
    current_round = getattr(env, 'iteration', env)
    lr_start, lr_end, decay_speed = 0.02, 0.005, 0.01
    return lr_end + (lr_start - lr_end) * np.exp(-decay_speed * current_round)

# Passing lr_decay directly in `callbacks` only calls it and discards the
# returned value, so training has always run with the constant learning rate
# from lgbm_params.  That behavior is kept by default; set USE_LR_DECAY to
# True to really apply the schedule through lgb.reset_parameter.
USE_LR_DECAY = False

# LightGBM params
lgbm_params = {
    'objective':        'regression_l2',
    'metric':           'rmse',
    'seed':             seed,
    'max_depth':        -1,
    # Used for every round unless USE_LR_DECAY is enabled
    'learning_rate':    0.04,
    'num_leaves':       512,
    'colsample_bytree': 0.2,
    'max_bin':          255, # reduced from 512 for GPU compatibility
    'verbosity':        -1, # general LightGBM output, not the evaluation print frequency
    'device':           'gpu'  # use 'cpu' if you don't have GPU support
}

all_histories = []  # one {dataset name: {metric: [values]}} dict per fold
features = df_train.columns.tolist()

for fold, (trn_idx, val_idx) in enumerate(cv.split(df_train), 1):
    print(f"Starting Fold {fold}")
    X_trn, y_trn = df_train.iloc[trn_idx].copy(), target.iloc[trn_idx]
    X_val, y_val = df_train.iloc[val_idx].copy(), target.iloc[val_idx]
    # Same columns, same order as the training frame
    X_sub  = df_test[X_trn.columns.tolist()].copy()

    # === Target-encoding ===
    encoder = TargetEncoder(n_folds=5, seed=seed, stat="mean")
    print(f"Fold {fold}: Applying Target Encoding...")

    # the first 20 columns keep their raw values and gain an extra *_te column
    for col in tqdm(features[:20], desc=f"Fold {fold} TE‐add"):
        X_trn[f"{col}_te"] = encoder.fit_transform(X_trn[[col]], y_trn)
        X_val[f"{col}_te"] = encoder.transform(X_val[[col]])
        X_sub[f"{col}_te"] = encoder.transform(X_sub[[col]])

    # the remaining columns are replaced by their target encoding
    for col in tqdm(features[20:], desc=f"Fold {fold} TE‐replace"):
        X_trn[col] = encoder.fit_transform(X_trn[[col]], y_trn)
        X_val[col] = encoder.transform(X_val[[col]])
        X_sub[col] = encoder.transform(X_sub[[col]])

    # === Create LightGBM datasets ===
    dtrain = lgb.Dataset(X_trn, label=y_trn)
    dvalid = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    # Fresh callbacks for every fold: no early-stopping state is shared
    # between folds, and each fold records its own metric history.
    evals_result = {}
    early_stop_callback = lgb.early_stopping(stopping_rounds=30, first_metric_only=True, verbose=True)
    fold_callbacks = [
        early_stop_callback,
        lgb.log_evaluation(period=500),        # print the metrics every 500 rounds
        lgb.record_evaluation(evals_result),   # fills evals_result during training
    ]
    if USE_LR_DECAY:
        # reset_parameter calls lr_decay with the round number before each iteration
        fold_callbacks.append(lgb.reset_parameter(learning_rate=lr_decay))

    print(f"Fold {fold}: Training LightGBM model...")
    model = lgb.train(
        params              = lgbm_params,
        train_set           = dtrain,
        num_boost_round     = 1_000_000, # upper bound; early stopping ends training
        valid_sets          = [dtrain, dvalid],
        valid_names         = ['train', 'valid'],
        callbacks           = fold_callbacks,
    )

    all_histories.append(evals_result)

    # Predict on test with the best iteration and clip to the target range
    test_pred = model.predict(X_sub, num_iteration=model.best_iteration)
    pred_test += np.clip(test_pred, 0, 120)

    print(f"Fold {fold} finished, best_iteration={model.best_iteration}")
    print("-" * 60)

# average over folds
pred_test /= cv.n_splits

print("Training complete. Test predictions averaged across folds.")

pred_test

df_sub = pd.read_csv("/kaggle/input/playground-series-s5e4/sample_submission.csv")
df_sub['Listening_Time_minutes'] = pred_test
df_sub.to_csv('submission_4.csv', index=False)

In [None]:
# Load the four per-model submission files

df1 = pd.read_csv('/kaggle/working/submission_1.csv')
df2 = pd.read_csv('/kaggle/working/submission_2.csv')
df3 = pd.read_csv('/kaggle/working/submission_3.csv')
df4 = pd.read_csv('/kaggle/working/submission_4.csv')

df = pd.read_csv("/kaggle/input/playground-series-s5e4/sample_submission.csv")

# Weighted average of the four predictions, accumulated left to right
blend = None
for frame, weight in ((df1, 0.23), (df2, 0.15), (df3, 0.32), (df4, 0.3)):
    part = weight * frame['Listening_Time_minutes']
    blend = part if blend is None else blend + part
df['Listening_Time_minutes'] = blend

df.to_csv('submission.csv', index=False)