<a href="https://colab.research.google.com/github/John1495/BBC-NYOK/blob/main/Copy_of_Linear_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from datetime import datetime, timedelta

# Input locations (adjust these to your own environment)
model_path = '/content/drive/MyDrive/TESTING/rf_model.joblib'
train_csv_path = '/kaggle/train.csv'
test_csv_path = '/kaggle/test.csv'

# Trained random-forest model.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
# NOTE: the model lives under /content/drive, which must already be mounted
# when this cell runs (the notebook only calls drive.mount in a later cell).
rf_model = joblib.load(model_path)

# Raw train / test tables
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [4]:
def clean_data(df, is_train=True):
    """Impute missing values and convert ``Publication_Day`` names to dates.

    The caller's frame is never modified; a cleaned copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the numerical columns ``Episode_Length_minutes``,
        ``Guest_Popularity_percentage`` and ``Number_of_Ads`` and the
        categorical columns ``Podcast_Name``, ``Episode_Title``, ``Genre``,
        ``Publication_Day``, ``Publication_Time`` and ``Episode_Sentiment``.
    is_train : bool, default True
        When True, the per-column medians/modes are computed from ``df`` and
        stored in the module-level ``train_medians`` / ``train_modes``.
        When False, those stored values are used to fill ``df``.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df``. ``Publication_Day`` holds the date of the
        first matching weekday on or after 2023-01-01; values that are not
        an English weekday name become ``NaT``.

    Raises
    ------
    NameError
        If called with ``is_train=False`` before any ``is_train=True`` call.
    """
    global train_medians, train_modes

    numerical_features = ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads']
    categorical_features = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day',
                          'Publication_Time', 'Episode_Sentiment']

    # Work on a copy so the frame passed in by the caller stays untouched.
    df = df.copy()

    # 1. Handle missing values
    if is_train:
        # Fit the imputation values on the training data and keep them for
        # later test-data calls.
        train_medians = df[numerical_features].median()
        train_modes = df[categorical_features].mode().iloc[0]
    else:
        try:
            train_medians, train_modes
        except NameError as exc:
            raise NameError(
                "clean_data(df, is_train=True) must be run before "
                "clean_data(df, is_train=False): no training imputation "
                "values are available"
            ) from exc

    df[numerical_features] = df[numerical_features].fillna(train_medians)
    df[categorical_features] = df[categorical_features].fillna(train_modes)

    # 2. Convert Publication_Day to datetime.
    # Skip when the column is already datetime (e.g. the cell was re-run on an
    # already-cleaned frame); mapping Timestamps through the weekday-name
    # table would otherwise turn every value into NaT.
    if not pd.api.types.is_datetime64_any_dtype(df['Publication_Day']):
        start_date = datetime(2023, 1, 1)  # Same as training
        weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                         'Friday', 'Saturday', 'Sunday']
        # Build the name -> date table once instead of once per row.
        weekday_to_date = {
            name: start_date + timedelta(days=(weekday - start_date.weekday()) % 7)
            for weekday, name in enumerate(weekday_names)
        }
        # Names missing from the table map to NaN, which to_datetime turns into NaT.
        df['Publication_Day'] = pd.to_datetime(df['Publication_Day'].map(weekday_to_date))

    return df

# Training set: fits the imputation values and applies them
train_df = clean_data(df=train_df, is_train=True)

# Test set: filled with the values fitted on the training set above
test_df = clean_data(df=test_df, is_train=False)

In [5]:
def engineer_features(df, is_train=True):
    """Add aggregated, time-based and weekday features to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``Publication_Day`` column plus
        ``Podcast_Name``, ``Genre`` and ``Episode_Sentiment``. With
        ``is_train=True`` it must also contain ``Listening_Time_minutes``,
        ``Episode_Length_minutes``, ``Host_Popularity_percentage``,
        ``Guest_Popularity_percentage`` and ``Number_of_Ads``.
    is_train : bool, default True
        When True, the group-level aggregation tables and the reference start
        date are computed from ``df`` and stored in module-level globals
        (``podcast_features``, ``genre_features``, ``sentiment_features``,
        ``day_features``, ``train_min_date``); lag and rolling features are
        also added. When False, the stored tables/date are reused and no
        lag/rolling columns are created.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by ``Publication_Day`` with the extra feature
        columns; remaining missing values are filled with 0.

    Raises
    ------
    NameError
        If called with ``is_train=False`` before any ``is_train=True`` call.
    """
    global podcast_features, genre_features, sentiment_features, day_features
    global train_min_date

    # 1. Aggregated features
    if is_train:
        # Calculate aggregations from training data
        podcast_features = df.groupby("Podcast_Name").agg(
            avg_podcast_listening_time=('Listening_Time_minutes', 'mean'),
            median_podcast_listening_time=('Listening_Time_minutes', 'median'),
            total_podcast_listens=('Listening_Time_minutes', 'count'),
            std_podcast_listening_time=('Listening_Time_minutes', 'std'),
            avg_episode_length=('Episode_Length_minutes', 'mean'),
            avg_host_popularity=('Host_Popularity_percentage', 'mean'),
            avg_guest_popularity=('Guest_Popularity_percentage', 'mean'),
            avg_num_ads=('Number_of_Ads', 'mean')
        ).reset_index()

        genre_features = df.groupby("Genre").agg(
            avg_genre_listening_time=('Listening_Time_minutes', 'mean'),
            avg_genre_episode_length=('Episode_Length_minutes', 'mean'),
            total_genre_listens=('Listening_Time_minutes', 'count')
        ).reset_index()

        sentiment_features = df.groupby("Episode_Sentiment").agg(
            avg_sentiment_listening_time=('Listening_Time_minutes', 'mean'),
            total_sentiment_listens=('Listening_Time_minutes', 'count')
        ).reset_index()

        day_features = df.groupby("Publication_Day").agg(
            avg_day_listening_time=('Listening_Time_minutes', 'mean'),
            total_day_listens=('Listening_Time_minutes', 'count')
        ).reset_index()

        # Remember the earliest training date here instead of reading the
        # notebook-level `train_df`, so the function works on whatever frame
        # it was fitted with.
        train_min_date = df['Publication_Day'].min()

    # Merge features (for both train and test); groups unseen in training
    # produce NaN aggregates, which are replaced by 0.
    df = df.merge(podcast_features, on="Podcast_Name", how="left")
    df = df.merge(genre_features, on="Genre", how="left")
    df = df.merge(sentiment_features, on="Episode_Sentiment", how="left")
    df = df.merge(day_features, on="Publication_Day", how="left")
    df = df.fillna(0)

    # 2. Time-based features (rows are reordered by publication date)
    df = df.sort_values('Publication_Day')
    df['Days_Since_Start'] = (df['Publication_Day'] - train_min_date).dt.days

    # Lag and rolling features (only for training).
    # NOTE(review): these are derived from the target, and the rolling means
    # include the current row's own Listening_Time_minutes. They do not exist
    # for test data. Confirm this is intended before retraining on them.
    if is_train:
        df['Lag_1_Listening'] = df['Listening_Time_minutes'].shift(1)
        df['Lag_1_Podcast'] = df.groupby('Podcast_Name')['Listening_Time_minutes'].shift(1)
        df['Rolling_3'] = df['Listening_Time_minutes'].rolling(window=3).mean()
        df['Rolling_3_Podcast'] = df.groupby('Podcast_Name')['Listening_Time_minutes'].transform(
            lambda x: x.rolling(window=3).mean())

    # Weekday features (one indicator column per weekday present in df)
    df['Weekday'] = df['Publication_Day'].dt.weekday
    df = pd.get_dummies(df, columns=['Weekday'], prefix='Day')

    # Day of year seasonality
    df['Day_Sin'] = np.sin(2 * np.pi * df['Publication_Day'].dt.dayofyear / 365)
    df['Day_Cos'] = np.cos(2 * np.pi * df['Publication_Day'].dt.dayofyear / 365)

    # Fill any remaining NaNs (e.g. the first rows of lag/rolling features)
    df = df.fillna(0)

    return df

# Training set first (this fits the aggregation tables), then the test set reuses them
train_df = engineer_features(df=train_df, is_train=True)
test_df = engineer_features(df=test_df, is_train=False)

In [6]:
def prepare_for_modeling(df, is_train=True):
    """Build the model feature matrix from an engineered frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Engineered frame containing the categorical columns
        ``Podcast_Name``, ``Genre``, ``Publication_Day``,
        ``Publication_Time`` and ``Episode_Sentiment``.
    is_train : bool, default True
        When True, a ``LabelEncoder`` is fitted per categorical column and
        stored in the module-level ``label_encoders``; the resulting feature
        column order is stored in ``model_feature_columns``.
        When False, the stored encoders and column order are applied.

    Returns
    -------
    pandas.DataFrame
        Feature matrix without ``id``, ``Episode_Title`` and the target
        ``Listening_Time_minutes``. For test data the columns are exactly the
        training feature columns, in training order; columns absent from
        ``df`` are added with value 0.

    Raises
    ------
    NameError
        If called with ``is_train=False`` before any ``is_train=True`` call.
    """
    global label_encoders, model_feature_columns

    categorical_columns = ['Podcast_Name', 'Genre', 'Publication_Day',
                           'Publication_Time', 'Episode_Sentiment']

    # Drop non-feature columns. The target is dropped too so that the
    # training matrix never contains it and train/test columns agree.
    df_model = df.drop(['id', 'Episode_Title', 'Listening_Time_minutes'],
                       axis=1, errors='ignore')

    if is_train:
        # Fit one encoder per categorical column and keep it for test data
        label_encoders = {}
        for col in categorical_columns:
            le = LabelEncoder()
            df_model[col] = le.fit_transform(df_model[col])
            label_encoders[col] = le

        # Remember the exact feature order used for training
        model_feature_columns = list(df_model.columns)
        return df_model

    # Test data - reuse the encoders fitted on the training data
    for col in categorical_columns:
        if col in df_model.columns:
            le = label_encoders[col]
            # Values never seen during fitting cannot be transformed;
            # replace them with the encoder's first known class.
            unseen_mask = ~df_model[col].isin(le.classes_)
            if unseen_mask.any():
                df_model.loc[unseen_mask, col] = le.classes_[0]
            df_model[col] = le.transform(df_model[col])

    # Align to the training features: add missing columns as 0, drop extras,
    # and use the training column order (an ordered list, not a set).
    return df_model.reindex(columns=model_feature_columns, fill_value=0)

# Target vector for (optional) retraining
y_train = train_df['Listening_Time_minutes']

# Training features; this call also fits the label encoders
X_train = prepare_for_modeling(df=train_df, is_train=True)

# Test features, encoded with the encoders fitted above
X_test = prepare_for_modeling(df=test_df, is_train=False)

In [11]:
def prepare_for_modeling(df, is_train=True):
    """Return a feature matrix whose columns match the loaded ``rf_model``.

    Categorical columns are label-encoded (encoders are fitted and stored in
    the module-level ``label_encoders`` when ``is_train`` is True, reused
    otherwise). The result contains exactly ``rf_model.feature_names_in_``,
    in that order; any of those columns absent from ``df`` is filled with 0.
    """
    global label_encoders

    encoded_columns = ('Podcast_Name', 'Genre', 'Publication_Day',
                       'Publication_Time', 'Episode_Sentiment')

    # Identifier / free-text columns are never features
    df_model = df.drop(['id', 'Episode_Title'], axis=1, errors='ignore')

    # Placeholder diff features, created only when absent
    for placeholder in ('Listening_Diff', 'Podcast_Diff'):
        if placeholder not in df_model.columns:
            df_model[placeholder] = 0

    if is_train:
        # Fit a fresh encoder per categorical column
        label_encoders = {}
        for name in encoded_columns:
            encoder = LabelEncoder()
            df_model[name] = encoder.fit_transform(df_model[name])
            label_encoders[name] = encoder
    else:
        # Apply the encoders fitted on the training data
        for name in encoded_columns:
            if name not in df_model.columns:
                continue
            encoder = label_encoders[name]
            # Values the encoder has never seen fall back to its first class
            is_unseen = ~df_model[name].isin(encoder.classes_)
            if is_unseen.any():
                df_model.loc[is_unseen, name] = encoder.classes_[0]
            df_model[name] = encoder.transform(df_model[name])

    # Columns the trained model expects, in its training order
    expected = list(rf_model.feature_names_in_)

    # Any expected column that is still missing becomes a constant 0
    present = set(df_model.columns)
    for name in expected:
        if name not in present:
            df_model[name] = 0

    # Selecting by the expected list both drops extra columns and fixes the order
    return df_model[expected]

# Rebuild the test feature matrix with the redefined prepare_for_modeling
X_test = prepare_for_modeling(df=test_df, is_train=False)

# Score the test set and attach the predictions to the test frame
test_predictions = rf_model.predict(X_test)
test_df['Predicted_Listening_Time'] = test_predictions

# Report success and preview the first 100 predictions
print("Predictions made successfully!")
preview_columns = ['Podcast_Name', 'Episode_Title', 'Predicted_Listening_Time']
print(test_df[preview_columns].head(100))

Predictions made successfully!
             Podcast_Name Episode_Title  Predicted_Listening_Time
160512     Healthy Living   Episode 100                  0.518241
112886      Wellness Wave    Episode 81                 31.540785
240842        Funny Folks     Episode 3                 33.066673
112888        Mind & Body    Episode 69                  6.647227
208502    Fashion Forward    Episode 51                 44.435585
...                   ...           ...                       ...
27505     Mystery Matters     Episode 9                 27.897782
73357   Business Insights    Episode 67                  9.276149
197454       Learning Lab    Episode 58                 30.228786
73358   Detective Diaries    Episode 40                 38.356808
112799  Business Insights    Episode 72                 36.253707

[100 rows x 3 columns]


In [15]:
from google.colab import drive
import pandas as pd

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Destination of the exported CSV
save_path = '/content/drive/My Drive/podcast_predictions.csv'

# Columns to export: identifiers plus the prediction (without actual values)
export_columns = ['id', 'Podcast_Name', 'Episode_Title', 'Genre',
                  'Episode_Length_minutes', 'Predicted_Listening_Time']
output_df = test_df[export_columns].copy()

# Write the CSV without the DataFrame index
output_df.to_csv(save_path, index=False)

print(f"✅ Predictions saved to: {save_path}")
print("\nSample of saved predictions:")
print(output_df.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Predictions saved to: /content/drive/My Drive/podcast_predictions.csv

Sample of saved predictions:
            id     Podcast_Name Episode_Title      Genre  \
160512  910512   Healthy Living   Episode 100     Health   
112886  862886    Wellness Wave    Episode 81     Health   
240842  990842      Funny Folks     Episode 3     Comedy   
112888  862888      Mind & Body    Episode 69     Health   
208502  958502  Fashion Forward    Episode 51  Lifestyle   

        Episode_Length_minutes  Predicted_Listening_Time  
160512                   13.04                  0.518241  
112886                   53.76                 31.540785  
240842                   44.66                 33.066673  
112888                   26.94                  6.647227  
208502                  106.09                 44.435585  


In [16]:
# Check that the predictions file exists and show some basic facts about it
import os
if os.path.exists(save_path):
    print("\nVerification:")
    print(f"File size: {os.path.getsize(save_path)/1024:.1f} KB")
    # os.path.getctime is the metadata-change time on Unix, not the creation
    # time, so report the last-modification time under an accurate label.
    print(f"Last modified: {pd.to_datetime(os.path.getmtime(save_path), unit='s')}")

    # Show the first 10 lines (pure Python instead of shelling out to `head`)
    print("\nFile preview:")
    with open(save_path, encoding='utf-8') as preview_file:
        for _, line in zip(range(10), preview_file):
            print(line, end='')
else:
    print("⚠️ Error: File not saved successfully")


Verification:
File size: 15526.3 KB
Created at: 2025-04-16 15:15:43

File preview:
id,Podcast_Name,Episode_Title,Genre,Episode_Length_minutes,Predicted_Listening_Time
910512,Healthy Living,Episode 100,Health,13.04,0.5182407999999998
862886,Wellness Wave,Episode 81,Health,53.76,31.540785300000024
990842,Funny Folks,Episode 3,Comedy,44.66,33.066672700000005
862888,Mind & Body,Episode 69,Health,26.94,6.647227199999998
958502,Fashion Forward,Episode 51,Lifestyle,106.09,44.43558499999998
823277,Study Sessions,Episode 90,Education,63.84,33.6878509
796449,Style Guide,Episode 48,Lifestyle,10.27,0.3816811000000001
910039,Finance Focus,Episode 92,Business,15.96,1.1290542
823275,Game Day,Episode 32,Sports,52.99,31.845013800000014


In [17]:
# Example organized path
# This cell only rebinds `save_path`; it does not write any file itself.
save_path = '/content/drive/My Drive/DataScience/PodcastProject/predictions_2023.csv'

In [18]:
from datetime import datetime
# Current local time formatted as YYYYMMDD_HHMMSS, used to make the file name unique
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# This only rebinds `save_path`; nothing is written to this location in this cell.
save_path = f'/content/drive/My Drive/podcast_predictions_{timestamp}.csv'