<a href="https://colab.research.google.com/github/John1495/BBC-NYOK/blob/main/Linear_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install category-encoders

Collecting category-encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category-encoders
Successfully installed category-encoders-2.8.1


In [5]:
# Linear Regression with Custom Interaction Features

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
import warnings
warnings.filterwarnings('ignore')

# ===============================
# 1. Load Data
# ===============================
import dask.dataframe as dd

# Text columns that should be parsed as categoricals rather than plain strings.
category_columns = [
    'Podcast_Name',
    'Episode_Title',
    'Genre',
    'Publication_Day',
    'Publication_Time',
    'Episode_Sentiment',
]
csv_dtypes = {column: 'category' for column in category_columns}

# Describe the read lazily with dask, then materialise the full result in memory.
lazy_frame = dd.read_csv("/kaggle/train.csv", dtype=csv_dtypes)
df = lazy_frame.compute()

# ===============================
# 2. Basic Cleaning
# ===============================
numerical_features = ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads']
categorical_features = ['Podcast_Name', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

# Drop unused columns (reassign instead of mutating in place so the step is explicit).
df = df.drop(columns=['id', 'Episode_Title'])

# ===============================
# 3. Train/Test Split
# ===============================
# Split BEFORE imputing: the imputers below learn medians / modes, and fitting
# them on the full frame would let held-out rows influence the training data.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Work on independent copies so the column assignments below never write into
# (or warn about) slices of the original frame.
train_df = train_df.copy()
test_df = test_df.copy()

# Impute missing values: statistics are learned from the training rows only and
# then applied unchanged to the test rows.
numerical_imputer = SimpleImputer(strategy="median")
categorical_imputer = SimpleImputer(strategy="most_frequent")
train_df[numerical_features] = numerical_imputer.fit_transform(train_df[numerical_features])
train_df[categorical_features] = categorical_imputer.fit_transform(train_df[categorical_features])
test_df[numerical_features] = numerical_imputer.transform(test_df[numerical_features])
test_df[categorical_features] = categorical_imputer.transform(test_df[categorical_features])

# ===============================
# 4. Feature Engineering
# ===============================
# Aggregated features from training data
def create_aggregates(df, base_df):
    """Attach per-podcast and per-genre aggregate columns to ``df``.

    The aggregates are computed from ``base_df`` and left-merged onto ``df`` by
    ``Podcast_Name`` and ``Genre``. Rows of ``df`` whose podcast or genre does
    not occur in ``base_df`` get a fallback that matches each column's meaning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to augment. Must contain ``Podcast_Name`` and ``Genre``.
        It is not modified; a new frame is returned.
    base_df : pandas.DataFrame
        Frame the statistics are computed from. Must contain ``Podcast_Name``,
        ``Genre``, ``Listening_Time_minutes`` and ``Episode_Length_minutes``.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the columns ``avg_podcast_listening_time``,
        ``median_podcast_listening_time``, ``total_podcast_listens``,
        ``avg_genre_listening_time`` and ``avg_genre_episode_length``.
        The merges produce a fresh default index.

    Notes
    -----
    NOTE(review): when ``df`` and ``base_df`` are the same frame, each row's own
    target contributes to its aggregate features. Consider out-of-fold
    aggregates if that optimism matters.
    """
    target = 'Listening_Time_minutes'

    podcast_features = base_df.groupby("Podcast_Name").agg(
        avg_podcast_listening_time=(target, 'mean'),
        median_podcast_listening_time=(target, 'median'),
        total_podcast_listens=(target, 'count')
    ).reset_index()

    genre_features = base_df.groupby("Genre").agg(
        avg_genre_listening_time=(target, 'mean'),
        avg_genre_episode_length=('Episode_Length_minutes', 'mean')
    ).reset_index()

    df = df.merge(podcast_features, on="Podcast_Name", how="left")
    df = df.merge(genre_features, on="Genre", how="left")

    # One fallback per column, chosen to match what the column measures.
    # Previously every column (including the count and the episode-length mean)
    # was filled with the mean listening time.
    fallbacks = {
        'avg_podcast_listening_time': base_df[target].mean(),
        'median_podcast_listening_time': base_df[target].median(),
        'total_podcast_listens': 0,  # an unseen podcast has no base rows
        'avg_genre_listening_time': base_df[target].mean(),
        'avg_genre_episode_length': base_df['Episode_Length_minutes'].mean(),
    }
    for col, fallback in fallbacks.items():
        # Assign the result back: `df[col].fillna(..., inplace=True)` acts on a
        # temporary Series and is a no-op under pandas Copy-on-Write.
        df[col] = df[col].fillna(fallback)

    return df

# Both calls pass train_df as the base frame, so the aggregate statistics come
# from training rows only. Order matters: train_df is overwritten by the first
# call, so the second call receives the already-augmented training frame
# as its base.
train_df = create_aggregates(train_df, train_df)
test_df = create_aggregates(test_df, train_df)

# ===============================
# 5. Target Encoding
# ===============================
# The encoder is fitted on the training podcasts and training targets only; the
# fitted encoder is then applied to the test frame without passing any target.
# After these lines 'Podcast_Name' holds the encoder output in both frames
# rather than the original label.
target_encoder = TargetEncoder(cols=['Podcast_Name'])
train_df['Podcast_Name'] = target_encoder.fit_transform(train_df['Podcast_Name'], train_df['Listening_Time_minutes'])
test_df['Podcast_Name'] = target_encoder.transform(test_df['Podcast_Name'])

# ===============================
# 6. Custom Interaction Features
# ===============================
def create_interactions(df):
    """Add three pairwise-product columns to ``df`` and return it.

    The frame is modified in place; the same object is returned so the call can
    be used in an assignment. Each new column is the element-wise product of
    two existing columns:

    * ``Length_x_Ads``      = ``Episode_Length_minutes`` * ``Number_of_Ads``
    * ``Guest_x_GenreTime`` = ``Guest_Popularity_percentage`` * ``avg_genre_listening_time``
    * ``Podcast_x_AvgTime`` = ``Podcast_Name`` * ``avg_podcast_listening_time``
    """
    # (new column, left factor, right factor) - created in this order.
    products = (
        ('Length_x_Ads', 'Episode_Length_minutes', 'Number_of_Ads'),
        ('Guest_x_GenreTime', 'Guest_Popularity_percentage', 'avg_genre_listening_time'),
        ('Podcast_x_AvgTime', 'Podcast_Name', 'avg_podcast_listening_time'),
    )
    for new_col, left, right in products:
        df[new_col] = df[left] * df[right]
    return df

train_df = create_interactions(train_df)
test_df = create_interactions(test_df)

# ===============================
# 7. Final Feature List
# ===============================
# Columns that are scaled as numbers (raw inputs, the encoded podcast name,
# the aggregate columns and the interaction products).
numeric_features = [
    'Episode_Length_minutes',
    'Guest_Popularity_percentage',
    'Number_of_Ads',
    'Podcast_Name',
    'avg_podcast_listening_time',
    'median_podcast_listening_time',
    'total_podcast_listens',
    'avg_genre_listening_time',
    'avg_genre_episode_length',
    'Length_x_Ads',
    'Guest_x_GenreTime',
    'Podcast_x_AvgTime',
]
# Columns that are one-hot encoded.
one_hot_features = ['Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

feature_columns = numeric_features + one_hot_features
target_column = 'Listening_Time_minutes'

X_train, y_train = train_df[feature_columns], train_df[target_column]
X_test, y_test = test_df[feature_columns], test_df[target_column]

# ===============================
# 8. Preprocessing Pipeline
# ===============================
# Numeric columns are standardised; kept as a Pipeline so more steps can be added.
numerical_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# One-hot columns ignore categories that were not present when fitting.
column_transformers = [
    ('num', numerical_pipeline, numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), one_hot_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# ===============================
# 9. Linear Regression Pipeline
# ===============================
# Preprocessing and the regressor are chained so both are fitted together.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

model.fit(X_train, y_train)

# ===============================
# 10. Evaluation
# ===============================
# Score the fitted pipeline on the held-out rows and report the root of the MSE.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
report_line = f"📉 Final Test RMSE with Custom Interaction Features: {rmse:.2f}"
print(report_line)


📉 Final Test RMSE with Custom Interaction Features: 13.37
