In [4]:
# Imports
import json
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression 
from sklearn.metrics import (
                            mean_absolute_error, mean_squared_error, r2_score, 
                            accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
                            )
from sklearn.tree import DecisionTreeClassifier
# Suppress every warning category for the rest of the session to keep cell output short.
# NOTE(review): this also hides scikit-learn convergence and deprecation warnings raised
# by the models fitted later — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
print("✅ Imports loaded")

✅ Imports loaded


In [5]:
# --- Notebook configuration -------------------------------------------------
# Seed shared by the estimators below and by numpy's legacy global RNG.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Root folder for saved arrays/metrics; figures live in a sub-folder of it.
ARTIFACTS_DIR = Path("artifacts")
FIGURES_DIR = ARTIFACTS_DIR / "figures"

In [6]:
def load_split(suffix, artifacts_dir=None):
    """Load the four arrays of one train/test split from ``.npz`` archives.

    Reads ``X_train_<suffix>.npz``, ``X_test_<suffix>.npz``,
    ``y_train_<suffix>.npz`` and ``y_test_<suffix>.npz`` and returns the array
    stored under the ``'data'`` key of each.

    Parameters
    ----------
    suffix : str
        Part of the file name after the ``X_train_`` / ``y_test_`` prefix,
        e.g. ``'reg_Views'`` or ``'cls_classification'``.
    artifacts_dir : pathlib.Path, optional
        Folder holding the archives. Defaults to ``ARTIFACTS_DIR``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    AssertionError
        If features and labels of a split have a different number of rows.
    """
    base_dir = ARTIFACTS_DIR if artifacts_dir is None else artifacts_dir
    arrays = []
    for prefix in ('X_train', 'X_test', 'y_train', 'y_test'):
        # np.load on an .npz returns an archive object that keeps the file open;
        # the context manager closes it once the array has been read into memory.
        with np.load(base_dir / f'{prefix}_{suffix}.npz') as archive:
            arrays.append(archive['data'])

    X_train, X_test, y_train, y_test = arrays
    assert X_train.shape[0] == y_train.shape[0], f"{suffix}: train X/y row counts differ"
    assert X_test.shape[0] == y_test.shape[0], f"{suffix}: test X/y row counts differ"
    return X_train, X_test, y_train, y_test


def report_split(header, X_train, X_test, y_train, y_test):
    """Print ``header`` followed by the shapes of the four arrays of one split."""
    print(header)
    print(f"   X_train : {X_train.shape}, X_test : {X_test.shape}")
    print(f"   y_train : {y_train.shape}, y_test : {y_test.shape}")


# One split per regression target, plus the classification split.
X_train_reg_views, X_test_reg_views, y_train_reg_views, y_test_reg_views = load_split('reg_Views')
X_train_reg_likes, X_test_reg_likes, y_train_reg_likes, y_test_reg_likes = load_split('reg_Likes')
X_train_reg_comments, X_test_reg_comments, y_train_reg_comments, y_test_reg_comments = load_split('reg_Comments')
X_train_reg_shares, X_test_reg_shares, y_train_reg_shares, y_test_reg_shares = load_split('reg_Shares')
X_train_cls, X_test_cls, y_train_cls, y_test_cls = load_split('cls_classification')

report_split("Regression arrays loaded for Views:",
             X_train_reg_views, X_test_reg_views, y_train_reg_views, y_test_reg_views)
report_split("Regression arrays loaded for Likes:",
             X_train_reg_likes, X_test_reg_likes, y_train_reg_likes, y_test_reg_likes)
report_split("Regression arrays loaded for Shares:",
             X_train_reg_shares, X_test_reg_shares, y_train_reg_shares, y_test_reg_shares)
report_split("Regression arrays loaded for Comments:",
             X_train_reg_comments, X_test_reg_comments, y_train_reg_comments, y_test_reg_comments)
report_split("\nClassification arrays loaded:",
             X_train_cls, X_test_cls, y_train_cls, y_test_cls)



# TODO : Implement confusion matrix visualization function

Regression arrays loaded for Views:
   X_train : (3784, 76), X_test : (946, 76)
   y_train : (3784,), y_test : (946,)
Regression arrays loaded for Likes:
   X_train : (3784, 77), X_test : (946, 77)
   y_train : (3784,), y_test : (946,)
Regression arrays loaded for Shares:
   X_train : (3784, 81), X_test : (946, 81)
   y_train : (3784,), y_test : (946,)
Regression arrays loaded for Comments:
   X_train : (3784, 82), X_test : (946, 82)
   y_train : (3784,), y_test : (946,)

Classification arrays loaded:
   X_train : (3784, 84), X_test : (946, 84)
   y_train : (3784,), y_test : (946,)


In [7]:
# Wrap the Views training features in a DataFrame and preview the first 100 rows.
X_train_df = pd.DataFrame(data=X_train_reg_views)
X_train_df.head(n=100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,-0.785723,0.468561,1.979219,-0.142781,1.791705,-1.031785,-1.294538,-1.200865,-1.405355,1.197827,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
1,0.573057,-1.786181,-0.883814,-0.142781,1.551390,0.718034,-0.471216,-1.200865,1.755303,0.035870,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,-0.589055,-1.786181,1.979219,0.298661,0.562334,0.718034,-1.294538,0.808203,-1.454774,0.035870,...,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,1.0
3,0.923092,-1.786181,0.014246,0.298661,-0.139697,0.718034,-1.684183,0.808203,0.012497,0.035870,...,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,1.0
4,-0.246234,0.735380,0.014246,-0.142781,-1.100668,-0.956804,-1.684183,-1.200865,-0.773091,0.369740,...,0.0,0.0,0.0,1.0,0.0,0.0,2.0,2.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.243388,0.470941,-0.883814,-0.142781,-0.066416,1.200298,-0.471216,-1.200865,-0.124690,-1.496268,...,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,2.0,2.0
96,0.573057,-1.786181,-0.608507,-0.142781,1.551390,0.718034,0.681186,-1.200865,1.755303,0.035870,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,2.0
97,-0.246234,0.470941,-0.131382,-1.613750,-1.100668,1.200298,1.269466,-0.368889,-0.773091,-1.496268,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
98,0.573057,0.735380,-0.883814,-0.142781,1.551390,-0.956804,-0.471216,-1.200865,1.755303,0.369740,...,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,2.0,0.0


In [8]:
# Same preview for the Views training targets (single unnamed column).
y_train_df = pd.DataFrame(data=y_train_reg_views)
y_train_df.head(n=100)

Unnamed: 0,0
0,826021
1,338623
2,4754509
3,4892331
4,3399161
...,...
95,4290567
96,459654
97,3079923
98,4171736


## Regression and Classification Baselines

In [9]:
class SupervisedBaselineModels:
    """Fit and score a fixed set of baseline models for one supervised task.

    For ``task='regression'`` the baselines are ``LinearRegression`` and
    ``Ridge`` with ``alpha`` 1, 10 and 100; metrics are MAE, RMSE and R2.
    For ``task='classification'`` the baselines are a multinomial
    ``LogisticRegression`` and a depth-6 ``DecisionTreeClassifier``; metrics
    are accuracy and weighted precision, recall and F1. Every metric is
    computed on both the train and the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed straight to ``fit`` / ``predict``.
    y_train, y_test : array-like
        Targets for the two splits.
    task : {'regression', 'classification'}
        Which family of baselines to train.
    target : str, optional
        Label for the quantity being predicted. When given, ``results()``
        writes ``metrics_baseline_<task>_<target>.json`` so runs for different
        targets do not overwrite each other. When omitted, the file name is
        ``metrics_baseline_<task>.json`` and is shared by every run of the
        same task.

    Raises
    ------
    ValueError
        If ``task`` is not one of the two supported values.
    """

    _VALID_TASKS = ('regression', 'classification')

    def __init__(self, X_train, X_test, y_train, y_test, task, target=None):
        # Fail loudly: an unrecognised task would otherwise train nothing and
        # write nothing without any indication of the mistake.
        if task not in self._VALID_TASKS:
            raise ValueError(
                f"Unknown task {task!r}; expected one of {self._VALID_TASKS}"
            )

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.task = task
        self.target = target

        # Baseline Regression Models
        self.reg_models = {
            'LinearRegression': LinearRegression(),
            'Ridge_1': Ridge(alpha=1, random_state=RANDOM_STATE),
            'Ridge_10': Ridge(alpha=10, random_state=RANDOM_STATE),
            'Ridge_100': Ridge(alpha=100, random_state=RANDOM_STATE),
        }

        # Baseline Classification Models
        self.cls_models = {
            # NOTE(review): `multi_class` is deprecated in recent scikit-learn
            # releases — confirm against the pinned version before upgrading.
            'LogisticRegression': LogisticRegression(
                multi_class='multinomial',
                max_iter=200,
                random_state=RANDOM_STATE
            ),
            'DecisionTree': DecisionTreeClassifier(
                max_depth=6,
                random_state=RANDOM_STATE
            ),
        }

        # One dict of metrics per trained model, filled by train_and_evaluate().
        self.reg_results = []
        self.cls_results = []

    # -------------------------------------------------
    # TRAIN & EVALUATE
    # -------------------------------------------------
    def train_and_evaluate(self):
        """Fit every baseline for ``self.task`` and record train/test metrics.

        The result list for the task is rebuilt on each call, so re-running
        this method (e.g. re-executing a notebook cell) does not accumulate
        duplicate rows.
        """
        if self.task == 'regression':
            self.reg_results = self._evaluate_regression()
        elif self.task == 'classification':
            self.cls_results = self._evaluate_classification()

    def _fit_and_predict(self, model):
        """Fit ``model`` on the train split; return (train, test) predictions."""
        model.fit(self.X_train, self.y_train)
        return model.predict(self.X_train), model.predict(self.X_test)

    def _evaluate_regression(self):
        """Train the regression baselines and return their metric rows."""
        rows = []
        for name, model in self.reg_models.items():
            print(f"\n🔵 Training {name} ...")
            y_pred_train, y_pred_test = self._fit_and_predict(model)

            test_mae = mean_absolute_error(self.y_test, y_pred_test)
            # RMSE is taken as the square root of the mean squared error.
            test_rmse = np.sqrt(mean_squared_error(self.y_test, y_pred_test))
            test_r2 = r2_score(self.y_test, y_pred_test)

            rows.append({
                'Model': name,
                'Train MAE': mean_absolute_error(self.y_train, y_pred_train),
                'Train RMSE': np.sqrt(mean_squared_error(self.y_train, y_pred_train)),
                'Train R2': r2_score(self.y_train, y_pred_train),
                'Test MAE': test_mae,
                'Test RMSE': test_rmse,
                'Test R2': test_r2
            })

            print(f"   Test MAE: {test_mae:.2f} | Test RMSE: {test_rmse:.2f} | Test R2: {test_r2:.2f}")
        return rows

    def _evaluate_classification(self):
        """Train the classification baselines and return their metric rows."""
        rows = []
        for name, model in self.cls_models.items():
            print(f"\n🟢 Training {name} ...")
            y_pred_train, y_pred_test = self._fit_and_predict(model)

            test_acc = accuracy_score(self.y_test, y_pred_test)
            test_precision = precision_score(self.y_test, y_pred_test, average='weighted')
            test_recall = recall_score(self.y_test, y_pred_test, average='weighted')
            test_f1 = f1_score(self.y_test, y_pred_test, average='weighted')

            rows.append({
                'Model': name,
                'Train Accuracy': accuracy_score(self.y_train, y_pred_train),
                'Train Precision': precision_score(self.y_train, y_pred_train, average='weighted'),
                'Train Recall': recall_score(self.y_train, y_pred_train, average='weighted'),
                'Train F1 Score': f1_score(self.y_train, y_pred_train, average='weighted'),
                'Test Accuracy': test_acc,
                'Test Precision': test_precision,
                'Test Recall': test_recall,
                'Test F1 Score': test_f1
            })

            print(f"   Test Accuracy: {test_acc:.2f} | Test Precision: {test_precision:.2f} | Test Recall: {test_recall:.2f} | Test F1: {test_f1:.2f}")
        return rows

    # -------------------------------------------------
    # RESULTS DISPLAY & SAVE
    # -------------------------------------------------
    def results(self):
        """Print the metrics table for ``self.task`` and save it as JSON.

        The file is written under ``ARTIFACTS_DIR`` (created if missing) as a
        list of per-model records. See the ``target`` constructor argument for
        how the file name is chosen.
        """
        if self.task == 'regression':
            df_reg = pd.DataFrame(self.reg_results)
            print("\n📘 Regression Baseline Results:")
            print(df_reg.to_string(index=False))
            self._write_metrics(df_reg, "metrics_baseline_regression")
        elif self.task == 'classification':
            df_cls = pd.DataFrame(self.cls_results)
            print("\n📗 Classification Baseline Results:")
            print(df_cls.to_string(index=False))
            self._write_metrics(df_cls, "metrics_baseline_classification")

    def _write_metrics(self, frame, stem):
        """Dump ``frame`` as JSON records to ``<stem>[_<target>].json``."""
        file_name = f"{stem}.json" if self.target is None else f"{stem}_{self.target}.json"
        ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
        with open(ARTIFACTS_DIR / file_name, "w") as f:
            json.dump(frame.to_dict(orient="records"), f, indent=2)


In [10]:
# Regression baselines: one runner per target, trained in this order.
regression_splits = {
    'Views': (X_train_reg_views, X_test_reg_views, y_train_reg_views, y_test_reg_views),
    'Likes': (X_train_reg_likes, X_test_reg_likes, y_train_reg_likes, y_test_reg_likes),
    'Comments': (X_train_reg_comments, X_test_reg_comments, y_train_reg_comments, y_test_reg_comments),
    'Shares': (X_train_reg_shares, X_test_reg_shares, y_train_reg_shares, y_test_reg_shares),
}

regression_runs = {}
for target_name, split in regression_splits.items():
    print(f"\n=== Regression Baseline Models on {target_name} ===")
    runner = SupervisedBaselineModels(*split, task='regression')
    runner.train_and_evaluate()
    runner.results()
    regression_runs[target_name] = runner

# Keep the per-target names bound for any later cell that refers to them.
reg_views_model = regression_runs['Views']
reg_likes_model = regression_runs['Likes']
reg_comments_model = regression_runs['Comments']
reg_shares_model = regression_runs['Shares']

# Each results() call above rewrites the same metrics_baseline_regression.json, so on
# its own the file would end up holding only the last target. Rewrite it once with the
# rows of every target, each labelled by a 'Target' field.
all_regression_rows = [
    {'Target': target_name, **row}
    for target_name, runner in regression_runs.items()
    for row in runner.reg_results
]
with open(ARTIFACTS_DIR / "metrics_baseline_regression.json", "w") as f:
    json.dump(pd.DataFrame(all_regression_rows).to_dict(orient="records"), f, indent=2)

# Classification Models
print("\n=== Classification Baseline Models ===")
cls_model = SupervisedBaselineModels(X_train_cls, X_test_cls, y_train_cls, y_test_cls, task='classification')
cls_model.train_and_evaluate()
cls_model.results()


=== Regression Baseline Models on Views ===

🔵 Training LinearRegression ...
   Test MAE: 6682.80 | Test RMSE: 9585.24 | Test R2: 1.00

🔵 Training Ridge_1 ...
   Test MAE: 6853.87 | Test RMSE: 9816.51 | Test R2: 1.00

🔵 Training Ridge_10 ...
   Test MAE: 7947.51 | Test RMSE: 11191.24 | Test R2: 1.00

🔵 Training Ridge_100 ...
   Test MAE: 21894.32 | Test RMSE: 26816.50 | Test R2: 1.00

📘 Regression Baseline Results:
           Model    Train MAE   Train RMSE  Train R2     Test MAE    Test RMSE  Test R2
LinearRegression  6348.595080  8987.237761  0.999958  6682.800156  9585.237979 0.999954
         Ridge_1  6537.300372  9267.650628  0.999955  6853.867176  9816.506214 0.999952
        Ridge_10  7596.515959 10695.834017  0.999940  7947.514799 11191.240611 0.999937
       Ridge_100 20989.415773 26186.073688  0.999640 21894.322587 26816.502403 0.999639

=== Regression Baseline Models on Likes ===

🔵 Training LinearRegression ...
   Test MAE: 208.76 | Test RMSE: 287.68 | Test R2: 1.00

🔵 Tra