<a href="https://colab.research.google.com/github/John1495/BBC-NYOK/blob/main/Another_copy_of_podcast_prediction_pipeline_combinedtrain12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎧 Podcast Listening Time Prediction
This notebook performs data preprocessing, exploratory data analysis (EDA), feature engineering, and model training to predict podcast listening time.

In [None]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sklearn.impute import SimpleImputer
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Load data with optimized dtypes
df = dd.read_csv("/kaggle/train.csv", dtype={
    'Podcast_Name': 'category',
    'Episode_Title': 'category',
    'Genre': 'category',
    'Publication_Day': 'category',
    'Publication_Time': 'category',
    'Episode_Sentiment': 'category'
}).compute()

# Separate feature types
numerical_features = ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads']
categorical_features = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

# Impute missing values
df[numerical_features] = SimpleImputer(strategy="median").fit_transform(df[numerical_features])
df[categorical_features] = SimpleImputer(strategy="most_frequent").fit_transform(df[categorical_features])

print("✅ Missing Values After Imputation:")
print(df.isnull().sum())

✅ Missing Values After Imputation:
id                             0
Podcast_Name                   0
Episode_Title                  0
Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
Listening_Time_minutes         0
dtype: int64


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 🧠 Feature Engineering

In [None]:
# Aggregations
podcast_features = df.groupby("Podcast_Name").agg(
    avg_podcast_listening_time=('Listening_Time_minutes', 'mean'),
    median_podcast_listening_time=('Listening_Time_minutes', 'median'),
    total_podcast_listens=('Listening_Time_minutes', 'count'),
    std_podcast_listening_time=('Listening_Time_minutes', 'std'),
    avg_episode_length=('Episode_Length_minutes', 'mean'),
    avg_host_popularity=('Host_Popularity_percentage', 'mean'),
    avg_guest_popularity=('Guest_Popularity_percentage', 'mean'),
    avg_num_ads=('Number_of_Ads', 'mean')
).reset_index()

genre_features = df.groupby("Genre").agg(
    avg_genre_listening_time=('Listening_Time_minutes', 'mean'),
    avg_genre_episode_length=('Episode_Length_minutes', 'mean'),
    total_genre_listens=('Listening_Time_minutes', 'count')
).reset_index()

sentiment_features = df.groupby("Episode_Sentiment").agg(
    avg_sentiment_listening_time=('Listening_Time_minutes', 'mean'),
    total_sentiment_listens=('Listening_Time_minutes', 'count')
).reset_index()

day_features = df.groupby("Publication_Day").agg(
    avg_day_listening_time=('Listening_Time_minutes', 'mean'),
    total_day_listens=('Listening_Time_minutes', 'count')
).reset_index()

# Merge engineered features
df = df.merge(podcast_features, on="Podcast_Name", how="left")
df = df.merge(genre_features, on="Genre", how="left")
df = df.merge(sentiment_features, on="Episode_Sentiment", how="left")
df = df.merge(day_features, on="Publication_Day", how="left")
df.fillna(0, inplace=True)  # Handle any remaining NaNs


In [None]:
# Mapping weekday names to dates
def get_date_from_weekday(weekday):
    # Weekday names as they appear in df: 'Monday', 'Tuesday', etc.
    weekday_mapping = {
        'Monday': 0,
        'Tuesday': 1,
        'Wednesday': 2,
        'Thursday': 3,
        'Friday': 4,
        'Saturday': 5,
        'Sunday': 6
    }
    # Get the weekday number from the dictionary
    weekday_number = weekday_mapping[weekday]

    # Calculate the date difference from the start date
    delta_days = weekday_number - start_date.weekday()
    date_of_weekday = start_date + timedelta(days=delta_days)

    return date_of_weekday.strftime('%Y-%m-%d')  # Return date as a string

# Applying the conversion function to Publication_Day column
df['Publication_Day'] = df['Publication_Day'].apply(get_date_from_weekday)

# Now parse the date correctly
df['Publication_Day'] = pd.to_datetime(df['Publication_Day'])

# Check if the conversion is successful
print(df['Publication_Day'].head()) '

0   2025-04-03
1   2025-04-05
2   2025-04-01
3   2025-03-31
4   2025-03-31
Name: Publication_Day, dtype: datetime64[ns]


In [None]:
# --- Ordinal Time Features ---
df = df.sort_values('Publication_Day')

# 💡 Days since start
df['Days_Since_Start'] = (df['Publication_Day'] - df['Publication_Day'].min()).dt.days

# 💡 Lag features
df['Lag_1_Listening'] = df['Listening_Time_minutes'].shift(1)
df['Lag_1_Podcast'] = df.groupby('Podcast_Name')['Listening_Time_minutes'].shift(1)

# 💡 Diff features
df['Listening_Diff'] = df['Listening_Time_minutes'].diff()
df['Podcast_Diff'] = df.groupby('Podcast_Name')['Listening_Time_minutes'].diff()

# 💡 Rolling features (3-day trend)
df['Rolling_3'] = df['Listening_Time_minutes'].rolling(window=3).mean()
df['Rolling_3_Podcast'] = df.groupby('Podcast_Name')['Listening_Time_minutes'].transform(lambda x: x.rolling(window=3).mean())

# 💡 Seasonality: Weekday
df['Weekday'] = df['Publication_Day'].dt.weekday
df = pd.get_dummies(df, columns=['Weekday'], prefix='Day')

# 💡 Optional: sine/cosine transformation for day-of-year seasonality
df['Day_Sin'] = np.sin(2 * np.pi * df['Publication_Day'].dt.dayofyear / 365)
df['Day_Cos'] = np.cos(2 * np.pi * df['Publication_Day'].dt.dayofyear / 365)

# --- Final clean-up ---
df.fillna(0, inplace=True)

## 🤖 Model Training and Evaluation

In [None]:
#Prepare data
df_model = df.copy()
df_model.drop(['id', 'Episode_Title'], axis=1, inplace=True)  # Dropping non-relevant columns

# Encode categorical variables
for col in ['Podcast_Name', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']:
    df_model[col] = LabelEncoder().fit_transform(df_model[col])

# Define features and target variable
X = df_model.drop('Listening_Time_minutes', axis=1)
y = df_model['Listening_Time_minutes']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the Random Forest model
rf_model.fit(X_train, y_train)

# Make predictions
preds = rf_model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, preds))

# Output the RMSE
print(f"Random Forest RMSE: {rmse:.2f}")

Random Forest RMSE: 0.45


In [None]:
 # Save the trained model
joblib.dump(rf_model, 'random_forest_model.pkl')

['random_forest_model.pkl']