In [2]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# --- 1. CONFIGURATION ---
# Seed shared by the train/test split and the random forest.
RANDOM_SEED = 42

# Input dataset, resolved relative to the notebook's working directory.
DATA_PATH = '../data/Social Media Engagement Dataset.csv'

# Output artifacts: the fitted pipeline and its metadata live side by side
# inside MODEL_DIR.
MODEL_DIR = '../models'
MODEL_PATH, METADATA_PATH = (
    os.path.join(MODEL_DIR, artifact_name)
    for artifact_name in ('reach_model.pkl', 'reach_metadata.pkl')
)

# Create the output directory up front so the save step cannot fail on a
# missing folder; a pre-existing directory is fine.
os.makedirs(MODEL_DIR, exist_ok=True)

In [3]:
# --- 2. LOAD DATA ---
print("Loading Data...")

# Dataset locations, tried in order:
#   1. REACH_DATA_PATH environment variable (optional override, so the notebook
#      can be pointed at a copy of the CSV without editing code),
#   2. the configured relative DATA_PATH,
#   3. the original machine-specific absolute path.
# NOTE(review): entry 3 only exists on the original author's machine; it is
# kept for backward compatibility and is a candidate for removal.
data_path_candidates = [
    candidate
    for candidate in (
        os.environ.get('REACH_DATA_PATH'),
        DATA_PATH,
        r'd:\projects\StudioFlowAI\backend\app\ml\data\Social Media Engagement Dataset.csv',
    )
    if candidate
]

resolved_data_path = next(
    (candidate for candidate in data_path_candidates if os.path.isfile(candidate)),
    None,
)
if resolved_data_path is None:
    # Same exception type the original fallback raised, but the message lists
    # every location that was tried instead of only the last one.
    raise FileNotFoundError(
        "Dataset not found. Tried: "
        + ", ".join(repr(candidate) for candidate in data_path_candidates)
        + ". Set the REACH_DATA_PATH environment variable or update DATA_PATH."
    )

df = pd.read_csv(resolved_data_path)

# --- 3. FEATURE ENGINEERING ---

# Fail early with a readable message if the column the time features are
# derived from is missing (the bare lookup would raise an opaque KeyError).
if 'timestamp' not in df.columns:
    raise KeyError(
        f"Expected a 'timestamp' column in {resolved_data_path!r}; "
        f"found columns: {list(df.columns)}"
    )

# Extract Time Features
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour_of_day'] = df['timestamp'].dt.hour
# Day of week is already in dataset as 'day_of_week'

print(f"Data loaded. Shape: {df.shape}")

# Check for follower_count: later cells only add it as a numerical feature
# when the dataset actually provides the column.
has_followers = 'follower_count' in df.columns
if has_followers:
    print("Using 'follower_count' feature.")
else:
    print("WARNING: 'follower_count' column not found. Training without it.")

Loading Data...
Data loaded. Shape: (12000, 29)


In [4]:
# --- 4. PREPROCESSING ---
# Model inputs. Categorical: platform, topic and posting day. Numerical: the
# posting hour, plus follower_count when the loaded dataset contains it.
categorical_features = ['platform', 'topic_category', 'day_of_week']
numerical_features = ['hour_of_day'] + (['follower_count'] if has_followers else [])

# The notebook uses impressions as its measure of reach.
target = 'impressions'

X = df[[*categorical_features, *numerical_features]]
y = df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# Numerical columns: fill missing values with the median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: label missing values as 'unknown', then one-hot encode
# into a dense array, ignoring categories that were not seen during fitting.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Regression model: 200-tree random forest, seeded, using all available cores.
model = RandomForestRegressor(n_estimators=200, random_state=RANDOM_SEED, n_jobs=-1)

# End-to-end estimator: preprocessing followed by the regressor.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', model),
    ]
)

In [5]:
# --- 5. TRAIN & EVALUATE ---
print("Training Reach Prediction Model...")

# fit() returns the fitted pipeline itself, so training and scoring the
# held-out rows can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Held-out metrics: absolute error in the target's own units, and R2
# (a value below zero means the fit is worse than predicting the mean).
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(
    f"Mean Absolute Error: {mae:.2f}",
    f"R2 Score: {r2:.4f}",
    sep="\n",
)

Training Reach Prediction Model...
Mean Absolute Error: 26963.73
R2 Score: -0.2943


In [None]:
# --- 6. SAVE ---
# Persist the fitted pipeline (preprocessing + regressor) as a single artifact
# so the same transformations are applied at inference time.
print(f"Saving model to {MODEL_PATH}...")
joblib.dump(pipeline, MODEL_PATH)

# Companion metadata describing what the pickle expects and how it was built.
metadata = {
    # Derived from the pipeline so it cannot drift from the estimator in use.
    'model_type': type(pipeline.named_steps['regressor']).__name__,
    'features': categorical_features + numerical_features,
    'target': target,
    # Pickled estimators are only reliably loadable with the scikit-learn
    # version that produced them, so record it alongside the model.
    'sklearn_version': sklearn.__version__,
}
joblib.dump(metadata, METADATA_PATH)
print("Model saved.")