In [25]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [4]:
import pandas as pd
import numpy as np

# Read the Online News Popularity CSV from the notebook's data folder.
df = pd.read_csv("../notebook/data/OnlineNewsPopularity.csv")

# Strip surrounding whitespace from every header so lookups such as 'shares' match exactly.
df.columns = df.columns.str.strip()

# Remove the 'url' and 'timedelta' columns; errors='ignore' skips any that are absent.
df = df.drop(columns=['url', 'timedelta'], errors='ignore')

# Fail fast when the target column is missing, before any transform is attempted.
if 'shares' not in df.columns:
    raise KeyError("'shares' column not found in the dataset")

# log1p(shares) == log(1 + shares); stored as the regression target 'log_shares'.
df = df.assign(log_shares=np.log1p(df['shares']))



In [10]:
# Map the literal strings 'TRUE'/'FALSE' to 1/0 everywhere in the frame.
df = df.replace(to_replace={'TRUE': 1, 'FALSE': 0})

In [11]:
# Convert each column to a numeric dtype; values that cannot be parsed become NaN
# (errors='coerce') rather than raising.
df = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [12]:
# Replace every remaining NaN (including those produced by the numeric coercion) with 0.
df = df.fillna(value=0)

In [13]:
# Feature matrix: every column except the raw target, its log transform, and the
# derived class label. 'popularity_class' is listed as well so that re-running this
# cell after the label column has been added to df cannot leak the classification
# target into X; errors='ignore' makes the drop a no-op while that column is absent.
X = df.drop(columns=['shares', 'log_shares', 'popularity_class'], errors='ignore')

# Regression target: log1p-transformed share count.
y = df['log_shares']

In [17]:
def classify_popularity(shares, low_threshold=1400, high_threshold=5000):
    """Bucket a share count into one of three popularity classes.

    Parameters
    ----------
    shares : int or float
        Number of shares of a single article.
    low_threshold : int or float, default 1400
        Counts strictly below this value are class 0 (Low).
    high_threshold : int or float, default 5000
        Counts at or above ``low_threshold`` and strictly below this value are
        class 1 (Medium); counts at or above it are class 2 (High).

    Returns
    -------
    int
        0 for Low, 1 for Medium, 2 for High.

    Raises
    ------
    ValueError
        If ``low_threshold`` is greater than ``high_threshold``.

    Notes
    -----
    A NaN input compares False against both thresholds and therefore falls
    through to class 2.
    """
    if low_threshold > high_threshold:
        raise ValueError("low_threshold must not exceed high_threshold")

    if shares < low_threshold:
        return 0  # Low
    if shares < high_threshold:
        return 1  # Medium
    return 2  # High

# Label every row with its popularity class (0/1/2) derived from the raw share count.
df['popularity_class'] = df['shares'].map(classify_popularity)

In [18]:
# Keep only the feature columns: mask out both targets and the raw share count.
# Names that are not present in df simply do not match the mask.
X = df.loc[:, ~df.columns.isin(['shares', 'log_shares', 'popularity_class'])]

In [14]:
# Learn per-column scaling statistics from X, then apply them to the same matrix.
# NOTE(review): the scaler is fit on the full feature matrix, before the train/test
# split that follows, so test rows contribute to the scaling statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [19]:
# Regression target: log-transformed share count.
y_reg = df['log_shares']

# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_scaled,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest with default hyper-parameters (fit returns the estimator itself).
reg_model = RandomForestRegressor(random_state=42).fit(X_train_r, y_train_r)

# Score the held-out rows; both metrics are computed on the log1p scale of the target.
y_pred_r = reg_model.predict(X_test_r)
mse_r = mean_squared_error(y_test_r, y_pred_r)
rmse = np.sqrt(mse_r)
r2 = r2_score(y_test_r, y_pred_r)

print(
    "\n🔢 Regression Model:",
    f"✅ RMSE: {rmse}",
    f"✅ R² Score: {r2}",
    sep="\n",
)


🔢 Regression Model:
✅ RMSE: 0.8500820301624693
✅ R² Score: 0.15698251686960707


In [26]:
# Classification target: the 0/1/2 popularity class stored in df['popularity_class'].
y_cls = df['popularity_class']

# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_scaled, y_cls, test_size=0.2, random_state=42)

# Fit a random forest classifier with default hyper-parameters.
cls_model = RandomForestClassifier(random_state=42)
cls_model.fit(X_train_c, y_train_c)

# Predict the held-out rows and compute overall accuracy.
y_pred_c = cls_model.predict(X_test_c)
accuracy = accuracy_score(y_test_c, y_pred_c)

print("\n🏷️ Classification Model:")
print(f"✅ Accuracy: {accuracy}")
print("✅ Classification Report:")

# Report on every class that occurs in either the ground truth or the predictions.
# Deriving the list from y_test_c alone and not passing it to classification_report
# lets the report infer a different label set (truth ∪ predictions), which no longer
# lines up with the names whenever a class shows up only in the predictions.
labels = sorted(set(y_test_c.unique()) | set(np.unique(y_pred_c)))
target_names = ["Low", "Medium", "High"]  # index == class id from classify_popularity
selected_names = [target_names[i] for i in labels]

# Pass labels explicitly so each name is bound to its class id.
print(classification_report(y_test_c, y_pred_c, labels=labels, target_names=selected_names))


🏷️ Classification Model:
✅ Accuracy: 0.5686719636776391
✅ Classification Report:
              precision    recall  f1-score   support

         Low       0.62      0.72      0.66      3774
      Medium       0.51      0.56      0.53      3118
        High       0.42      0.05      0.09      1037

    accuracy                           0.57      7929
   macro avg       0.51      0.44      0.43      7929
weighted avg       0.55      0.57      0.54      7929

