In [1]:
# Imports
import json
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.metrics import (
                            mean_absolute_error, mean_squared_error, r2_score, 
                            accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
                            )
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
warnings.filterwarnings('ignore')
print("âœ… Imports loaded")

âœ… Imports loaded


In [2]:
# Constants
# Reproducibility / cross-validation settings shared by every model below.
RANDOM_STATE = 42
N_SPLITS = 5

# Output locations; figures live in a sub-folder of the artifacts directory.
ARTIFACTS_DIR = Path("artifacts")
FIGURES_DIR = ARTIFACTS_DIR / "figures"

# Seed NumPy's global RNG once, up front.
np.random.seed(RANDOM_STATE)

In [3]:
def _load_npz_data(stem):
    """Return the ``'data'`` array stored in ``ARTIFACTS_DIR / f"{stem}.npz"``.

    The archive is opened in a ``with`` block so that the underlying file
    handle is closed as soon as the array has been read, instead of leaving
    one open ``NpzFile`` behind per loaded array.
    """
    with np.load(ARTIFACTS_DIR / f"{stem}.npz") as archive:
        return archive['data']


def _load_split(kind, suffix):
    """Load the four arrays of one train/test split.

    Parameters
    ----------
    kind : str
        File-name infix, ``'reg'`` or ``'cls'`` in this notebook.
    suffix : str
        File-name suffix, e.g. ``'Views'`` or ``'classification'``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` read from
        ``{part}_{kind}_{suffix}.npz`` in that order.
    """
    return tuple(
        _load_npz_data(f"{part}_{kind}_{suffix}")
        for part in ("X_train", "X_test", "y_train", "y_test")
    )


def _report_shapes(header, X_train, X_test, y_train, y_test):
    """Print ``header`` followed by the shapes of the four arrays of a split."""
    print(header)
    print(f"   X_train : {X_train.shape}, X_test : {X_test.shape}")
    print(f"   y_train : {y_train.shape}, y_test : {y_test.shape}")


# Regression splits, one per target.
(X_train_reg_views, X_test_reg_views,
 y_train_reg_views, y_test_reg_views) = _load_split("reg", "Views")

(X_train_reg_likes, X_test_reg_likes,
 y_train_reg_likes, y_test_reg_likes) = _load_split("reg", "Likes")

(X_train_reg_comments, X_test_reg_comments,
 y_train_reg_comments, y_test_reg_comments) = _load_split("reg", "Comments")

(X_train_reg_shares, X_test_reg_shares,
 y_train_reg_shares, y_test_reg_shares) = _load_split("reg", "Shares")

# Classification split.
X_train_cls, X_test_cls, y_train_cls, y_test_cls = _load_split("cls", "classification")

# Shape summary (same order and wording as before: Views, Likes, Shares, Comments).
_report_shapes("Regression arrays loaded for Views:",
               X_train_reg_views, X_test_reg_views, y_train_reg_views, y_test_reg_views)
_report_shapes("Regression arrays loaded for Likes:",
               X_train_reg_likes, X_test_reg_likes, y_train_reg_likes, y_test_reg_likes)
_report_shapes("Regression arrays loaded for Shares:",
               X_train_reg_shares, X_test_reg_shares, y_train_reg_shares, y_test_reg_shares)
_report_shapes("Regression arrays loaded for Comments:",
               X_train_reg_comments, X_test_reg_comments, y_train_reg_comments, y_test_reg_comments)
_report_shapes("\nClassification arrays loaded:",
               X_train_cls, X_test_cls, y_train_cls, y_test_cls)

print("âœ… Data arrays loaded")

Regression arrays loaded for Views:
   X_train : (3784, 54), X_test : (946, 54)
   y_train : (3784,), y_test : (946,)
Regression arrays loaded for Likes:
   X_train : (3784, 55), X_test : (946, 55)
   y_train : (3784,), y_test : (946,)
Regression arrays loaded for Shares:
   X_train : (3784, 56), X_test : (946, 56)
   y_train : (3784,), y_test : (946,)
Regression arrays loaded for Comments:
   X_train : (3784, 57), X_test : (946, 57)
   y_train : (3784,), y_test : (946,)

Classification arrays loaded:
   X_train : (3784, 54), X_test : (946, 54)
   y_train : (3784,), y_test : (946,)
âœ… Data arrays loaded


In [4]:
class HyparamModels:
    """Grid-search tuning of a Random Forest and an XGBoost model for one target.

    One instance handles one train/test split and one task. For
    ``task='regression'`` the candidates are scored with negative RMSE under a
    shuffled ``KFold``; for ``task='classification'`` they are scored with
    weighted F1 under a shuffled ``StratifiedKFold``.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``GridSearchCV.fit`` / ``predict``.
    y_train, y_test : array-like
        Targets matching ``X_train`` / ``X_test``.
    task : str
        Either ``'regression'`` or ``'classification'``.
    pred_f : str
        Name of the predicted feature; used in the printed regression header
        and stored as ``'Target'`` in the regression metrics records.

    Raises
    ------
    ValueError
        If ``task`` is not one of ``VALID_TASKS``.
    """

    # Task names accepted by the constructor.
    VALID_TASKS = ('regression', 'classification')

    def __init__(self, X_train, X_test, y_train, y_test, task, pred_f):
        # Fail early: an unknown task would otherwise be given a stratified
        # splitter and then make train_and_evaluate()/results() do nothing.
        if task not in self.VALID_TASKS:
            raise ValueError(
                f"task must be one of {self.VALID_TASKS}, got {task!r}"
            )

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.task = task
        self.pred_f = pred_f
        
        # --------------------
        # Regression param grids
        # --------------------
        self.param_grids_reg = {
            'RF' : {
                'model' : RandomForestRegressor(random_state=RANDOM_STATE),
                'params' : {
                    'n_estimators' : [10, 20],
                    'min_samples_leaf' : [1, 5],
                    'max_depth' : [6, 8, 12]
                }
            },
            'XGB' : {
                'model' : XGBRegressor(random_state=RANDOM_STATE),
                'params' : {
                    'n_estimators' : [10, 20],
                    'learning_rate' : [0.05, 0.1],
                    'max_depth' : [8, 10, 12]
                }
            }
        }

        # --------------------
        # Classification param grids
        # --------------------
        self.param_grids_cls = {
            'RF' : {
                'model' : RandomForestClassifier(class_weight='balanced',random_state=RANDOM_STATE),
                'params' : {
                    'n_estimators' : [10, 20],
                    'min_samples_leaf' : [1, 5],
                    'max_depth' : [6, 8, 12]
                }
            },
            'XGB' : {
                'model' : XGBClassifier(random_state=RANDOM_STATE),
                'params' : {
                    'n_estimators' : [10, 20],
                    'learning_rate' : [0.05, 0.1],
                    'max_depth' : [8, 10, 12]
                }
            }
        }

        # --------------------
        # Cross-validation
        # --------------------
        if task == "regression":
            self.kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        else:
            self.kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

        # Filled by train_and_evaluate(): model name -> best params / CV score / estimator.
        self.reg_tuning_results = {}
        self.cls_tuning_results = {}
        # Kept for compatibility; no method of this class writes to it.
        self.all_results = []

    # -------------------------------------------------
    # TRAIN & EVALUATE
    # -------------------------------------------------
    def _fit_grid_search(self, config, scoring):
        """Run a verbose ``GridSearchCV`` for one grid entry and return it fitted.

        ``config`` is one value of ``param_grids_reg`` / ``param_grids_cls``
        (keys ``'model'`` and ``'params'``); ``scoring`` is the scorer name.
        """
        search = GridSearchCV(
            estimator=config['model'],
            param_grid=config['params'],
            cv=self.kf,
            scoring=scoring,
            verbose=2
        )
        search.fit(self.X_train, self.y_train)
        return search

    def train_and_evaluate(self):
        """Grid-search every model configured for ``self.task`` on the training data.

        The best parameters, best cross-validation score and refitted best
        estimator of each model are stored in ``reg_tuning_results`` or
        ``cls_tuning_results`` under the model name. For regression the stored
        score is the sign-flipped ``neg_root_mean_squared_error``, i.e. RMSE.
        """
        if self.task == 'regression':
            for name, config in self.param_grids_reg.items():
                print(f"\nðŸ”µ Training {name} ...")
                grid_search = self._fit_grid_search(config, 'neg_root_mean_squared_error')

                # Save results (scorer is negated RMSE, so flip the sign back)
                self.reg_tuning_results[name] = {
                    'best_params': grid_search.best_params_,
                    'best_cv_score': -grid_search.best_score_,
                    'best_estimator': grid_search.best_estimator_
                }

                print(f"  Best params : {grid_search.best_params_}")
                print(f"  Best CV RMSE : {-grid_search.best_score_}")

        elif self.task == 'classification':
            for name, config in self.param_grids_cls.items():
                print(f"\nðŸŸ¢ Training {name} ...")
                grid_search = self._fit_grid_search(config, 'f1_weighted')

                # Save results
                self.cls_tuning_results[name] = {
                    'best_params': grid_search.best_params_,
                    'best_cv_score': grid_search.best_score_,
                    'best_estimator': grid_search.best_estimator_
                }

                print(f"  Best params : {grid_search.best_params_}")
                print(f"  Best CV F1 Score : {grid_search.best_score_}")

    # -------------------------------------------------
    # RESULTS DISPLAY & SAVE
    # -------------------------------------------------
    def _merge_with_saved_targets(self, path, new_records):
        """Combine ``new_records`` with records of *other* targets already in ``path``.

        Every regression instance writes to the same metrics file, so a plain
        overwrite would keep only the target that was reported last. Records
        previously saved for ``self.pred_f`` are dropped (so a re-run replaces
        them); records of other targets are kept. A missing, unreadable or
        non-list file is treated as empty.
        """
        try:
            with open(path) as f:
                saved = json.load(f)
        except (OSError, ValueError):
            # No file yet, or its content is not valid JSON: start fresh.
            saved = []
        if not isinstance(saved, list):
            saved = []

        kept = [
            rec for rec in saved
            if isinstance(rec, dict) and rec.get('Target') != self.pred_f
        ]
        return kept + list(new_records)

    def results(self):
        """Print test-set metrics of the tuned models and save them as JSON.

        Regression: MAE, RMSE and R2 per model are printed and merged into
        ``ARTIFACTS_DIR / "metrics_hyperparam_reg.json"`` (one record per
        model and target). Classification: accuracy and weighted
        precision/recall/F1 per model are printed and written to
        ``ARTIFACTS_DIR / "metrics_hyperparam_classification.json"``.
        Returns ``None``.
        """
        if self.task == 'regression':
            print(f"\nðŸ“˜ Regression Hyperparameter Tuning Results for: {self.pred_f}")
            results_reg_list = []

            for name, res in self.reg_tuning_results.items():
                print(f"\nModel: {name}")
                print(f"  Best Params: {res['best_params']}")
                print(f"  CV RMSE: {res['best_cv_score']:.2f}")

                # Evaluate on test set
                y_pred = res['best_estimator'].predict(self.X_test)
                test_rmse = np.sqrt(mean_squared_error(self.y_test, y_pred))
                test_mae = mean_absolute_error(self.y_test, y_pred)
                test_r2 = r2_score(self.y_test, y_pred)
                print(f"  Test MAE: {test_mae:.2f}")
                print(f"  Test RMSE: {test_rmse:.2f}")
                print(f"  Test R2: {test_r2:.4f}")

                results_reg_list.append({
                    'Model': name,
                    'Target': self.pred_f,
                    'Best_Params': res['best_params'],
                    'CV_RMSE': res['best_cv_score'],
                    'Test_MAE': test_mae,
                    'Test_RMSE': test_rmse,
                    'Test_R2': test_r2
                })

            # Display results
            results_reg_df = pd.DataFrame(results_reg_list)
            print("\nðŸ“˜ Regression Hyperparameter Tuning Results Table:")
            print(results_reg_df.to_string(index=False))

            # Save JSON, keeping the records other targets already stored there
            ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
            metrics_path = ARTIFACTS_DIR / "metrics_hyperparam_reg.json"
            merged_records = self._merge_with_saved_targets(
                metrics_path, results_reg_df.to_dict(orient="records")
            )
            with open(metrics_path, "w") as f:
                json.dump(merged_records, f, indent=2)

        elif self.task == 'classification':
            print("\nðŸ“— Classification Hyperparameter Tuning Results:")
            results_cls_list = []

            for name, res in self.cls_tuning_results.items():
                print(f"\nModel: {name}")
                print(f"  Best Params: {res['best_params']}")
                print(f"  CV F1 Score: {res['best_cv_score']:.4f}")

                # Evaluate on test set
                y_pred = res['best_estimator'].predict(self.X_test)
                test_acc = accuracy_score(self.y_test, y_pred)
                test_precision = precision_score(self.y_test, y_pred, average='weighted')
                test_recall = recall_score(self.y_test, y_pred, average='weighted')
                test_f1 = f1_score(self.y_test, y_pred, average='weighted')
                print(f"  Test Accuracy: {test_acc:.4f}")
                print(f"  Test Precision: {test_precision:.4f}")
                print(f"  Test Recall: {test_recall:.4f}")
                print(f"  Test F1 Score: {test_f1:.4f}")

                results_cls_list.append({
                    'Model': name,
                    'Best_Params': res['best_params'],
                    'CV_F1_Score': res['best_cv_score'],
                    'Test_Accuracy': test_acc,
                    'Test_Precision': test_precision,
                    'Test_Recall': test_recall,
                    'Test_F1_Score': test_f1
                })

            # Display results
            results_cls_df = pd.DataFrame(results_cls_list)
            print("\nðŸ“— Classification Hyperparameter Tuning Results Table:")
            print(results_cls_df.to_string(index=False))

            # Save JSON
            ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
            with open(ARTIFACTS_DIR / "metrics_hyperparam_classification.json", "w") as f:
                json.dump(results_cls_df.to_dict(orient="records"), f, indent=2)

# TODO (original note, reworded): show the name of the predicted feature
# (pred_f) in the classification output as well; the regression header
# already prints it.

print("âœ… HyparamModels class defined successfully.")

âœ… HyparamModels class defined successfully.


In [5]:


def _tune_and_report(X_train, X_test, y_train, y_test, task, pred_f):
    """Build a HyparamModels for one split, run the grid search, report, and return it."""
    tuner = HyparamModels(X_train, X_test, y_train, y_test, task=task, pred_f=pred_f)
    tuner.train_and_evaluate()
    tuner.results()
    return tuner


print("===Model Training and Hyperparameter Tuning==")

# Regression targets, tuned in the order Views -> Likes -> Shares -> Comments
hyparam_views = _tune_and_report(
    X_train_reg_views, X_test_reg_views, y_train_reg_views, y_test_reg_views,
    task='regression', pred_f='Views',
)
hyparam_likes = _tune_and_report(
    X_train_reg_likes, X_test_reg_likes, y_train_reg_likes, y_test_reg_likes,
    task='regression', pred_f='Likes',
)
hyparam_shares = _tune_and_report(
    X_train_reg_shares, X_test_reg_shares, y_train_reg_shares, y_test_reg_shares,
    task='regression', pred_f='Shares',
)
hyparam_comments = _tune_and_report(
    X_train_reg_comments, X_test_reg_comments, y_train_reg_comments, y_test_reg_comments,
    task='regression', pred_f='Comments',
)

# Classification is tuned last
hyparam_cls = _tune_and_report(
    X_train_cls, X_test_cls, y_train_cls, y_test_cls,
    task='classification', pred_f='Engagement_Level',
)

===Model Training and Hyperparameter Tuning==

ðŸ”µ Training RF ...
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[CV] END ...max_depth=6, min_samples_leaf=1, n_estimators=10; total time=   0.1s
[CV] END ...max_depth=6, min_samples_leaf=1, n_estimators=10; total time=   0.1s
[CV] END ...max_depth=6, min_samples_leaf=1, n_estimators=10; total time=   0.1s
[CV] END ...max_depth=6, min_samples_leaf=1, n_estimators=10; total time=   0.0s
[CV] END ...max_depth=6, min_samples_leaf=1, n_estimators=10; total time=   0.0s
[CV] END ...max_depth=6, min_samples_leaf=1, n_estimators=20; total time=   0.2s
[CV] END ...max_depth=6, min_samples_leaf=1, n_estimators=20; total time=   0.2s
[CV] END ...max_depth=6, min_samples_leaf=1, n_estimators=20; total time=   0.2s
[CV] END ...max_depth=6, min_samples_leaf=1, n_estimators=20; total time=   0.2s
[CV] END ...max_depth=6, min_samples_leaf=1, n_estimators=20; total time=   0.2s
[CV] END ...max_depth=6, min_samples_leaf=5, n_estimators=10; total time=   0.0s
[CV] END ...max_depth=6, min_samples_leaf=5, n_estimators=10; total time=   0.1s
[CV] END ...max_depth=6, min