In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [23]:
# Load the raw dataset from a CSV file located in the working directory.
original_news_df = pd.read_csv('online_news_original.csv')
# Bare expression: the notebook renders the frame as this cell's output.
original_news_df

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0.100000,0.70,-0.350000,-0.600,-0.200000,0.500000,-0.187500,0.000000,0.187500,593
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0.033333,0.70,-0.118750,-0.125,-0.100000,0.000000,0.000000,0.500000,0.000000,711
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.575130,1.0,0.663866,3.0,1.0,1.0,...,0.100000,1.00,-0.466667,-0.800,-0.133333,0.000000,0.000000,0.500000,0.000000,1500
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0.136364,0.80,-0.369697,-0.600,-0.166667,0.000000,0.000000,0.500000,0.000000,1200
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.540890,19.0,19.0,20.0,...,0.033333,1.00,-0.220192,-0.500,-0.050000,0.454545,0.136364,0.045455,0.136364,505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39639,http://mashable.com/2014/12/27/samsung-app-aut...,8.0,11.0,346.0,0.529052,1.0,0.684783,9.0,7.0,1.0,...,0.100000,0.75,-0.260000,-0.500,-0.125000,0.100000,0.000000,0.400000,0.000000,1800
39640,http://mashable.com/2014/12/27/seth-rogen-jame...,8.0,12.0,328.0,0.696296,1.0,0.885057,9.0,7.0,3.0,...,0.136364,0.70,-0.211111,-0.400,-0.100000,0.300000,1.000000,0.200000,1.000000,1900
39641,http://mashable.com/2014/12/27/son-pays-off-mo...,8.0,10.0,442.0,0.516355,1.0,0.644128,24.0,1.0,12.0,...,0.136364,0.50,-0.356439,-0.800,-0.166667,0.454545,0.136364,0.045455,0.136364,1900
39642,http://mashable.com/2014/12/27/ukraine-blasts/,8.0,6.0,682.0,0.539493,1.0,0.692661,10.0,1.0,1.0,...,0.062500,0.50,-0.205246,-0.500,-0.012500,0.000000,0.000000,0.500000,0.000000,1100


In [24]:
# Fraction of rows held out for testing (default `test_size` of split_data).
TEST_SIZE = 0.20
# Seed handed to the splitter, the estimators, and the resamplers.
RANDOM_STATE = 42
# Quantile of `shares` above which a row is labeled "high shares"
# (default `threshold` of prepare_targets).
CLASSIFICATION_THRESHOLD = 0.75

In [33]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    The square root is taken explicitly instead of probing the `squared=False`
    keyword of `mean_squared_error`: that keyword is deprecated in newer
    scikit-learn releases, so probing it first emits a warning on some
    versions and only reaches the fallback on others. This form gives the
    same number on every version for the 1-D targets used in this notebook.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, same length as `y_true`.

    Returns:
        The RMSE as a floating-point scalar.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


def safe_qcut_bins(y_series, q_candidates=(10, 8, 6, 5, 4), min_bin_size=30):
    """Quantile-bin `y_series`, trying each bin count until one is usable.

    Candidates in `q_candidates` are tried in order. A candidate is accepted
    when the resulting binning has at least two bins and every bin holds at
    least `min_bin_size` values. Duplicate bin edges are dropped by `pd.qcut`,
    and a candidate that makes `pd.qcut` raise `ValueError` is skipped.

    Args:
        y_series: Values to bin; converted to a float `pd.Series`.
        q_candidates: Quantile counts to try, in priority order.
        min_bin_size: Minimum number of values required in every bin.

    Returns:
        The categorical Series of bins for the first accepted candidate, or
        None when no candidate qualifies.
    """
    values = pd.Series(y_series).astype(float)
    for n_quantiles in q_candidates:
        try:
            binned = pd.qcut(values, q=n_quantiles, duplicates='drop')
            bin_sizes = binned.value_counts(dropna=False)
            usable = bin_sizes.min() >= min_bin_size and bin_sizes.size >= 2
        except ValueError:
            # This quantile count cannot be applied to the data; try the next.
            continue
        if usable:
            return binned
    return None


def explore_data(df):
    """Print a short overview of `df`.

    Reports the frame's shape and total count of missing cells. When a
    'shares' column is present, also prints its mean, median, standard
    deviation, minimum, and maximum.

    Args:
        df: The DataFrame to summarize.
    """
    banner = "=" * 60
    print(banner)
    print("DATA EXPLORATION")
    print(banner)

    total_missing = df.isnull().sum().sum()
    print(f"\nDataset shape: {df.shape}")
    print(f"Missing values: {total_missing}")

    # Nothing more to report without the target column.
    if 'shares' not in df.columns:
        return

    target = df['shares']
    print(f"\nTarget Statistics:")
    summary = (
        ("Mean", target.mean),
        ("Median", target.median),
        ("Std", target.std),
        ("Min", target.min),
        ("Max", target.max),
    )
    for label, statistic in summary:
        print(f"  {label}: {statistic():.2f}")


def prepare_features(df):
    """Build the feature matrix `X` and the raw target `y` from `df`.

    Steps, in order:
      1. Take `y` from the 'shares' column.
      2. Drop non-feature columns: 'shares', 'url', and any column whose name
         starts with 'Unnamed:' (the name pandas gives an unnamed CSV column,
         typically a saved row index, which must not be used as a predictor).
      3. Fill missing values with per-column medians of the numeric columns.
      4. Drop constant columns.
      5. One-hot encode int64/float64 columns with fewer than 10 distinct
         values (first level dropped) and cast the frame to float.
      6. Replace +/-inf with the per-column median of the finite values.

    Args:
        df: Input DataFrame; must contain a 'shares' column.

    Returns:
        Tuple `(X, y)`: the prepared feature DataFrame and the target Series.

    Raises:
        KeyError: If `df` has no 'shares' column.
    """
    print("\n" + "=" * 60)
    print("DATA PREPARATION")
    print("=" * 60)

    if 'shares' not in df.columns:
        raise KeyError("prepare_features requires a 'shares' target column")
    y = df['shares'].copy()

    # Drop target and non-features, including leftover CSV index columns.
    index_artifacts = [c for c in df.columns if str(c).startswith('Unnamed:')]
    if index_artifacts:
        print(f"Dropping index-like columns: {index_artifacts}")
    cols_to_drop = [c for c in ['shares', 'url'] if c in df.columns] + index_artifacts
    X = df.drop(columns=cols_to_drop)

    # Handle missing values (numeric_only keeps median() from failing on
    # any non-numeric column that is still present).
    if X.isnull().sum().sum() > 0:
        X = X.fillna(X.median(numeric_only=True))

    # Remove constant columns
    nunique = X.nunique()
    constant_cols = nunique[nunique == 1].index.tolist()
    if constant_cols:
        X = X.drop(columns=constant_cols)

    # One-hot encode potential categorical features
    potential_categorical = [col for col in X.columns
                             if nunique[col] < 10 and X[col].dtype in ['int64', 'float64']]

    if potential_categorical:
        X = pd.get_dummies(X, columns=potential_categorical, drop_first=True)
        X = X.astype(float)

    # Handle infinite values: turn them into NaN first so the medians used
    # as replacements are computed from finite values only.
    numeric_part = X.select_dtypes(include='number')
    if np.isinf(numeric_part.to_numpy(dtype=float)).any():
        X = X.replace([np.inf, -np.inf], np.nan)
        X = X.fillna(X.median(numeric_only=True))

    print(f"Final feature matrix: {X.shape}")

    return X, y


def analyze_feature_correlation(X):
    """Drop features that are almost perfectly correlated with an earlier one.

    The check runs only when `X` has fewer than 100 columns. For each column,
    the absolute correlations with the columns before it (the strict upper
    triangle of the correlation matrix) are inspected; a column is dropped
    when any of them exceeds 0.95.

    Args:
        X: Feature DataFrame.

    Returns:
        `X` itself when nothing is removed, otherwise a new DataFrame without
        the highly correlated columns.
    """
    print("\n" + "=" * 60)
    print("FEATURE CORRELATION ANALYSIS")
    print("=" * 60)

    # Wide frames are returned untouched.
    if not X.shape[1] < 100:
        return X

    abs_corr = X.corr().abs()
    # Keep each pair once: only entries strictly above the diagonal survive.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(above_diagonal)

    redundant = []
    for column in pairwise.columns:
        if (pairwise[column] > 0.95).any():
            redundant.append(column)

    if not redundant:
        return X

    print(f"Removing {len(redundant)} highly correlated features")
    X = X.drop(columns=redundant)
    print(f"Final features: {X.shape[1]}")
    return X


def prepare_targets(y, threshold=CLASSIFICATION_THRESHOLD):
    """Derive the regression and classification targets from raw `y`.

    Regression target: `log1p(y)`, unless `y` already looks standardized
    (mean below 0.1 and standard deviation within 0.1 of 1.0), in which case
    a copy of `y` is used unchanged.

    Classification target: 1 where `y` is strictly greater than its
    `threshold` quantile, otherwise 0. Values equal to the quantile fall in
    class 0, so the positive share can be below `1 - threshold`.

    Args:
        y: Series of raw target values.
        threshold: Quantile in [0, 1] that separates low from high.

    Returns:
        Tuple `(y_reg, y_class)`.
    """
    # Regression target (log transform unless y already looks standardized)
    looks_standardized = y.mean() < 0.1 and abs(y.std() - 1.0) < 0.1
    y_reg = y.copy() if looks_standardized else np.log1p(y)

    # Classification target
    thresh_val = y.quantile(threshold)
    y_class = (y > thresh_val).astype(int)

    n_total = len(y_class)
    n_low = int((y_class == 0).sum())
    n_high = int((y_class == 1).sum())
    low_pct = n_low / n_total * 100 if n_total else 0.0
    high_pct = n_high / n_total * 100 if n_total else 0.0

    # ':g' formatting avoids the truncation int() applied to float error,
    # e.g. (1 - 0.9) * 100 == 9.999... used to be reported as "Top 9%".
    top_pct = (1 - threshold) * 100
    print(f"\nClassification Target (Top {top_pct:g}%):")
    print(f"  Low shares:  {n_low} ({low_pct:.1f}%)")
    print(f"  High shares: {n_high} ({high_pct:.1f}%)")

    return y_reg, y_class

# Split data into training and testing sets
def split_data(X, y_reg, y_class, test_size=TEST_SIZE, random_state=RANDOM_STATE):
    """Split features and both targets into train and test partitions.

    The split is stratified on quantile bins of `y_reg` when
    `safe_qcut_bins` finds a usable binning; otherwise no stratification is
    applied (`stratify=None`).

    Args:
        X: Feature matrix.
        y_reg: Regression target aligned with `X`.
        y_class: Classification target aligned with `X`.
        test_size: Fraction of rows assigned to the test partition.
        random_state: Seed for the shuffling done by `train_test_split`.

    Returns:
        Tuple `(X_train, X_test, y_reg_train, y_reg_test, y_class_train,
        y_class_test)`.
    """
    print("\n" + "=" * 60)
    print("TRAIN/TEST SPLIT")
    print("=" * 60)

    # Quantile bins of the regression target (None when no binning qualifies).
    strata = safe_qcut_bins(y_reg, q_candidates=(10, 8, 6, 5, 4), min_bin_size=30)

    partitions = train_test_split(
        X, y_reg, y_class,
        test_size=test_size,
        random_state=random_state,
        stratify=strata
    )
    X_train, X_test, y_reg_train, y_reg_test, y_class_train, y_class_test = partitions

    n_rows = len(X)
    n_train = len(X_train)
    n_test = len(X_test)
    print(f"Training: {n_train} samples ({n_train/n_rows*100:.1f}%)")
    print(f"Testing:  {n_test} samples ({n_test/n_rows*100:.1f}%)")

    return X_train, X_test, y_reg_train, y_reg_test, y_class_train, y_class_test

# Regression models
def train_regression_models(X_train, y_reg_train):
    """Fit the three baseline regressors on the training data.

    Models: a StandardScaler + Ridge pipeline, a random forest, and a
    histogram gradient boosting regressor with early stopping.

    Args:
        X_train: Training feature matrix.
        y_reg_train: Training regression target.

    Returns:
        Dict mapping 'Ridge', 'Random Forest', and 'Gradient Boosting' to the
        fitted estimators.
    """
    print("\n" + "=" * 80)
    print("REGRESSION MODELS")
    print("=" * 80)

    ridge_pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=1.0, random_state=RANDOM_STATE))
    ])
    forest = RandomForestRegressor(
        n_estimators=100, max_depth=15, min_samples_leaf=5,
        min_samples_split=10, max_features='sqrt', max_samples=0.8,
        n_jobs=-1, random_state=RANDOM_STATE
    )
    boosting = HistGradientBoostingRegressor(
        learning_rate=0.05, max_depth=5, max_bins=255,
        l2_regularization=1.0, max_iter=200, early_stopping=True,
        validation_fraction=0.1, n_iter_no_change=20, random_state=RANDOM_STATE
    )

    # (progress message, result key, estimator) in training order.
    training_plan = (
        ("\nTraining Ridge Regression...", 'Ridge', ridge_pipeline),
        ("Training Random Forest Regressor...", 'Random Forest', forest),
        ("Training Gradient Boosting Regressor...", 'Gradient Boosting', boosting),
    )

    models = {}
    for announcement, key, estimator in training_plan:
        print(announcement)
        estimator.fit(X_train, y_reg_train)
        models[key] = estimator

    return models

# Classification models
def train_classification_models(X_train, y_class_train):
    """Fit the three baseline classifiers on the training data.

    Models: a StandardScaler + LogisticRegression pipeline, a random forest
    with `class_weight='balanced'`, and a histogram gradient boosting
    classifier with early stopping.

    Args:
        X_train: Training feature matrix.
        y_class_train: Training binary labels.

    Returns:
        Dict mapping 'Logistic Regression', 'Random Forest', and
        'Gradient Boosting' to the fitted estimators.
    """
    print("\n" + "=" * 80)
    print("CLASSIFICATION MODELS")
    print("=" * 80)

    logistic_pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("lr", LogisticRegression(C=1.0, max_iter=1000, random_state=RANDOM_STATE))
    ])
    forest = RandomForestClassifier(
        n_estimators=100, max_depth=15, min_samples_leaf=5,
        min_samples_split=10, max_features='sqrt', class_weight='balanced',
        n_jobs=-1, random_state=RANDOM_STATE
    )
    boosting = HistGradientBoostingClassifier(
        learning_rate=0.05, max_depth=5, max_bins=255,
        l2_regularization=1.0, max_iter=200, early_stopping=True,
        validation_fraction=0.1, n_iter_no_change=20, random_state=RANDOM_STATE
    )

    # (progress message, result key, estimator) in training order.
    training_plan = (
        ("\nTraining Logistic Regression...", 'Logistic Regression', logistic_pipeline),
        ("Training Random Forest Classifier...", 'Random Forest', forest),
        ("Training Gradient Boosting Classifier...", 'Gradient Boosting', boosting),
    )

    models = {}
    for announcement, key, estimator in training_plan:
        print(announcement)
        estimator.fit(X_train, y_class_train)
        models[key] = estimator

    return models

# Evaluation
def evaluate_regression_model(model, X_test, y_reg_test, model_name):
    """Score a fitted regressor on the test data and print the result.

    Args:
        model: Fitted estimator exposing `predict`.
        X_test: Test feature matrix.
        y_reg_test: Test regression target.
        model_name: Label printed above the metrics.

    Returns:
        Dict with keys 'rmse', 'r2', and 'predictions'.
    """
    predictions = model.predict(X_test)
    metrics = {
        'rmse': rmse(y_reg_test, predictions),
        'r2': r2_score(y_reg_test, predictions),
        'predictions': predictions,
    }

    print(f"\n{model_name}")
    print("=" * 60)
    print(f"RMSE: {metrics['rmse']:.4f} | R²: {metrics['r2']:.4f}")

    return metrics


def evaluate_classification_model(model, X_test, y_class_test, model_name):
    """Score a fitted binary classifier on the test data and print the result.

    Accuracy, precision, recall, and F1 are computed from `model.predict`;
    ROC-AUC is computed from the second column of `model.predict_proba`.

    Args:
        model: Fitted classifier exposing `predict` and `predict_proba`.
        X_test: Test feature matrix.
        y_class_test: Test binary labels.
        model_name: Label printed above the metrics.

    Returns:
        Dict with keys 'accuracy', 'precision', 'recall', 'f1', 'auc'
        (None when ROC-AUC could not be computed), 'predictions', and
        'probabilities'.
    """
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_class_test, y_pred)
    precision = precision_score(y_class_test, y_pred)
    recall = recall_score(y_class_test, y_pred)
    f1 = f1_score(y_class_test, y_pred)

    try:
        auc = roc_auc_score(y_class_test, y_pred_proba)
    except ValueError as err:
        # Only the "metric undefined for this input" failure is tolerated
        # (presumably a single class in y_class_test); anything else should
        # surface instead of being hidden by a bare except.
        print(f"ROC-AUC unavailable for {model_name}: {err}")
        auc = None

    print(f"\n{model_name}")
    print("=" * 60)
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")
    # Compare against None explicitly: an AUC of exactly 0.0 is a valid
    # score and must still be printed.
    if auc is not None:
        print(f"ROC-AUC:   {auc:.4f}")

    return {
        'accuracy': accuracy, 'precision': precision, 'recall': recall,
        'f1': f1, 'auc': auc, 'predictions': y_pred, 'probabilities': y_pred_proba
    }


def compare_regression_models(results):
    """Print the regression models ranked by R², best first.

    Args:
        results: Dict mapping model name to a metrics dict that has an 'r2'
            entry.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("REGRESSION MODEL COMPARISON")
    print(divider)

    ranking = [(name, res['r2']) for name, res in results.items()]
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    for name, r2 in ranking:
        print(f"{name:25s}: R² = {r2:.4f}")


def compare_classification_models(results):
    """Print the classifiers ranked by F1 and return the best one.

    Args:
        results: Dict mapping model name to a metrics dict that has 'f1' and
            'accuracy' entries.

    Returns:
        Tuple `(best_name, best_metrics)` for the entry with the highest F1;
        on ties the entry that appears first in `results` wins.
    """
    def f1_of(entry):
        # entry is a (name, metrics) pair from results.items()
        return entry[1]['f1']

    divider = "=" * 60
    print("\n" + divider)
    print("CLASSIFICATION MODEL COMPARISON")
    print(divider)

    ranked = sorted(results.items(), key=f1_of, reverse=True)
    for name, res in ranked:
        print(f"{name:25s}: F1={res['f1']:.4f}, Acc={res['accuracy']:.4f}")

    best_name, best_metrics = max(results.items(), key=f1_of)
    print(f"\nBest Classification Model: {best_name}")
    return best_name, best_metrics


def show_confusion_matrix(y_true, y_pred, model_name):
    """Print a 2x2 confusion matrix for binary labels 0 (low) and 1 (high).

    Args:
        y_true: True binary labels.
        y_pred: Predicted binary labels.
        model_name: Label printed in the heading.
    """
    print("\n" + "=" * 60)
    print(f"CONFUSION MATRIX - {model_name}")
    print("=" * 60)

    # Fix the label set so the matrix is always 2x2; without it, inputs that
    # contain a single class yield a 1x1 matrix and cm[0,1] raises IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print("\n                Predicted")
    print("              Low    High")
    print(f"Actual Low   {cm[0,0]:5d}  {cm[0,1]:5d}")
    print(f"      High   {cm[1,0]:5d}  {cm[1,1]:5d}")


def show_feature_importance(model, feature_names, model_name, top_n=15):
    """Print the `top_n` most important features of a fitted model.

    Importances are taken from `feature_importances_` when the model has it.
    Otherwise, for a pipeline (an object with `named_steps`), the final step
    is inspected, whatever its name: its `feature_importances_` is used if
    present, else the absolute values of its `coef_` (first row when `coef_`
    is 2-D). A bare estimator exposing only `coef_` is handled the same way.
    If none of these attributes is available a message is printed instead.

    Args:
        model: Fitted estimator or pipeline.
        feature_names: Names aligned with the model's input columns.
        model_name: Label printed in the heading.
        top_n: Number of features to print.

    Raises:
        ValueError: If the number of names differs from the number of
            importances.
    """
    print("\n" + "=" * 60)
    print(f"TOP {top_n} IMPORTANT FEATURES - {model_name}")
    print("=" * 60)

    # Resolve the object that actually carries the importances. Looking at
    # the last pipeline step avoids the KeyError a hard-coded 'lr' lookup
    # raises for pipelines whose estimator step has another name.
    estimator = model
    if not hasattr(model, 'feature_importances_') and hasattr(model, 'named_steps'):
        pipeline_steps = list(model.named_steps.values())
        if pipeline_steps:
            estimator = pipeline_steps[-1]

    if hasattr(estimator, 'feature_importances_'):
        importances = np.asarray(estimator.feature_importances_)
    elif hasattr(estimator, 'coef_'):
        # atleast_2d lets 1-D coef_ take the same path as the (1, n_features)
        # layout.
        importances = np.abs(np.atleast_2d(estimator.coef_)[0])
    else:
        print("Model does not support feature importance")
        return

    names = list(feature_names)
    if len(names) != len(importances):
        raise ValueError(
            f"Got {len(names)} feature names for {len(importances)} importances"
        )

    ranked = pd.DataFrame({
        'feature': names,
        'importance': importances
    }).sort_values('importance', ascending=False)

    for feature, importance in ranked.head(top_n).itertuples(index=False):
        print(f"{str(feature):40s}: {importance:.4f}")


def cross_validate_model(model, X, y, cv=5):
    """Run F1-scored cross-validation for `model` and print the fold scores.

    Args:
        model: Estimator passed to `cross_val_score`.
        X: Feature matrix.
        y: Binary labels.
        cv: Cross-validation setting forwarded to `cross_val_score`.
    """
    fold_scores = cross_val_score(model, X, y, scoring='f1', cv=cv)
    mean_f1 = np.mean(fold_scores)
    spread = np.std(fold_scores)
    print(f"\nCross-Validation F1 Scores: {fold_scores}")
    print(f"Average F1 with {cv}-Fold CV: {mean_f1:.4f} (+/- {spread:.4f})")

In [34]:
# Main function
def main(df):
    """Run the baseline workflow end to end on `df`.

    Explores the data, builds and prunes features, derives both targets,
    splits the data, trains and evaluates the regression and classification
    models, then reports the confusion matrix, feature importances, and
    cross-validated F1 of the best classifier.

    Args:
        df: Raw DataFrame containing the features and the 'shares' column.

    Returns:
        Dict with the train/test features and class labels, the fitted
        regression and classification models, the classification results,
        the best classifier's name and metrics, and the feature names.
    """
    # 1. Explore data
    explore_data(df)

    # 2-3. Build the feature matrix, then prune highly correlated columns
    features, target = prepare_features(df)
    features = analyze_feature_correlation(features)

    # 4. Prepare targets
    y_reg, y_class = prepare_targets(target)

    # 5. Split data
    splits = split_data(features, y_reg, y_class)
    X_train, X_test, y_reg_train, y_reg_test, y_class_train, y_class_test = splits

    # 6. Train and evaluate regression models
    reg_models = train_regression_models(X_train, y_reg_train)
    reg_results = {
        label: evaluate_regression_model(estimator, X_test, y_reg_test, label)
        for label, estimator in reg_models.items()
    }
    compare_regression_models(reg_results)

    # 7. Train and evaluate classification models
    clf_models = train_classification_models(X_train, y_class_train)
    clf_results = {
        label: evaluate_classification_model(estimator, X_test, y_class_test, label)
        for label, estimator in clf_models.items()
    }
    best_clf_name, best_clf_result = compare_classification_models(clf_results)
    best_clf = clf_models[best_clf_name]

    # 8-10. Diagnostics for the best classifier
    show_confusion_matrix(y_class_test, best_clf_result['predictions'], best_clf_name)
    show_feature_importance(best_clf, features.columns, best_clf_name)
    cross_validate_model(best_clf, X_train, y_class_train, cv=5)

    summary = {
        'X_train': X_train,
        'X_test': X_test,
        'y_class_train': y_class_train,
        'y_class_test': y_class_test,
        'reg_models': reg_models,
        'clf_models': clf_models,
        'clf_results': clf_results,
        'best_clf_name': best_clf_name,
        'best_clf_result': best_clf_result,
        'feature_names': features.columns,
    }
    return summary

In [35]:
# Handling Imbalanced Target Variable
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

## SMOTE to handle imbalance
def train_with_smote(X_train, y_class_train):
    """Oversample the training data with SMOTE and fit three classifiers.

    Models fitted on the resampled data: a StandardScaler +
    LogisticRegression pipeline, a random forest, and a histogram gradient
    boosting classifier.

    Args:
        X_train: Training feature matrix.
        y_class_train: Training binary labels (non-negative integers, as
            required by `np.bincount`).

    Returns:
        Dict mapping 'Logistic Regression (SMOTE)', 'Random Forest (SMOTE)',
        and 'Gradient Boosting (SMOTE)' to the fitted estimators.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TRAINING WITH SMOTE (OVERSAMPLING)")
    print(rule)

    # Apply SMOTE
    print("\nApplying SMOTE...")
    resampler = SMOTE(random_state=RANDOM_STATE)
    X_balanced, y_balanced = resampler.fit_resample(X_train, y_class_train)

    print(f"Original distribution: {np.bincount(y_class_train)}")
    print(f"After SMOTE: {np.bincount(y_balanced)}")

    logistic_pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("lr", LogisticRegression(C=1.0, max_iter=1000, random_state=RANDOM_STATE))
    ])
    forest = RandomForestClassifier(
        n_estimators=100, max_depth=15, min_samples_leaf=5,
        min_samples_split=10, max_features='sqrt',
        n_jobs=-1, random_state=RANDOM_STATE
    )
    boosting = HistGradientBoostingClassifier(
        learning_rate=0.05, max_depth=5, max_bins=255,
        l2_regularization=1.0, max_iter=200, random_state=RANDOM_STATE
    )

    # (progress message, result key, estimator) in training order.
    training_plan = (
        ("\nTraining Logistic Regression with SMOTE...",
         'Logistic Regression (SMOTE)', logistic_pipeline),
        ("Training Random Forest with SMOTE...",
         'Random Forest (SMOTE)', forest),
        ("Training Gradient Boosting with SMOTE...",
         'Gradient Boosting (SMOTE)', boosting),
    )

    models = {}
    for announcement, key, estimator in training_plan:
        print(announcement)
        estimator.fit(X_balanced, y_balanced)
        models[key] = estimator

    return models


def train_with_undersampling(X_train, y_class_train):
    """Randomly undersample the training data and fit a random forest.

    Args:
        X_train: Training feature matrix.
        y_class_train: Training binary labels (non-negative integers, as
            required by `np.bincount`).

    Returns:
        Dict with the single key 'Random Forest (Undersample)' mapped to the
        fitted classifier.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TRAINING WITH RANDOM UNDERSAMPLING")
    print(rule)

    # Apply Random Undersampling
    print("\nApplying Random Undersampling...")
    resampler = RandomUnderSampler(random_state=RANDOM_STATE)
    X_reduced, y_reduced = resampler.fit_resample(X_train, y_class_train)

    print(f"Original distribution: {np.bincount(y_class_train)}")
    print(f"After undersampling: {np.bincount(y_reduced)}")

    # Random Forest with undersampling
    print("\nTraining Random Forest with undersampling...")
    forest_params = dict(
        n_estimators=100, max_depth=15, min_samples_leaf=5,
        min_samples_split=10, max_features='sqrt',
        n_jobs=-1, random_state=RANDOM_STATE
    )
    forest = RandomForestClassifier(**forest_params)
    forest.fit(X_reduced, y_reduced)

    return {'Random Forest (Undersample)': forest}


def train_with_combined_sampling(X_train, y_class_train):
    """Resample the training data with SMOTETomek and fit a random forest.

    Args:
        X_train: Training feature matrix.
        y_class_train: Training binary labels (non-negative integers, as
            required by `np.bincount`).

    Returns:
        Dict with the single key 'Random Forest (SMOTETomek)' mapped to the
        fitted classifier.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TRAINING WITH SMOTE + TOMEK LINKS")
    print(rule)

    # Apply SMOTETomek
    print("\nApplying SMOTETomek...")
    resampler = SMOTETomek(random_state=RANDOM_STATE)
    X_resampled, y_resampled = resampler.fit_resample(X_train, y_class_train)

    print(f"Original distribution: {np.bincount(y_class_train)}")
    print(f"After SMOTETomek: {np.bincount(y_resampled)}")

    # Random Forest with SMOTETomek
    print("\nTraining Random Forest with SMOTETomek...")
    forest_params = dict(
        n_estimators=100, max_depth=15, min_samples_leaf=5,
        min_samples_split=10, max_features='sqrt',
        n_jobs=-1, random_state=RANDOM_STATE
    )
    forest = RandomForestClassifier(**forest_params)
    forest.fit(X_resampled, y_resampled)

    return {'Random Forest (SMOTETomek)': forest}


def train_with_adjusted_weights(X_train, y_class_train):
    """Fit classifiers that offset class imbalance through class weights.

    Each class weight is `total / (2 * class_count)`. The logistic regression
    pipeline uses these weights as is; the random forest multiplies the
    class-1 weight by a further 1.5.

    Args:
        X_train: Training feature matrix.
        y_class_train: Training binary labels (non-negative integers, as
            required by `np.bincount`); both classes must be present.

    Returns:
        Dict mapping 'Logistic Regression (Custom Weights)' and
        'Random Forest (Custom Weights)' to the fitted estimators.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TRAINING WITH ADJUSTED CLASS WEIGHTS")
    print(rule)

    # class weights
    per_class = np.bincount(y_class_train)
    n_samples = len(y_class_train)
    weight_for_0, weight_for_1 = (
        n_samples / (2 * per_class[label]) for label in (0, 1)
    )

    print(f"\nClass weights: {{0: {weight_for_0:.2f}, 1: {weight_for_1:.2f}}}")

    # Logistic Regression with custom weights
    logistic = LogisticRegression(
        C=1.0, max_iter=1000,
        class_weight={0: weight_for_0, 1: weight_for_1},
        random_state=RANDOM_STATE
    )
    logistic_pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("lr", logistic)
    ])

    # Random Forest with more aggressive weights
    forest = RandomForestClassifier(
        n_estimators=100, max_depth=15, min_samples_leaf=5,
        min_samples_split=10, max_features='sqrt',
        class_weight={0: weight_for_0, 1: weight_for_1 * 1.5},
        n_jobs=-1, random_state=RANDOM_STATE
    )

    # (progress message, result key, estimator) in training order.
    training_plan = (
        ("\nTraining Logistic Regression with custom weights...",
         'Logistic Regression (Custom Weights)', logistic_pipeline),
        ("Training Random Forest with custom weights...",
         'Random Forest (Custom Weights)', forest),
    )

    models = {}
    for announcement, key, estimator in training_plan:
        print(announcement)
        estimator.fit(X_train, y_class_train)
        models[key] = estimator

    return models


def train_with_threshold_tuning(X_train, y_class_train):
    """Fit the balanced random forest whose decision threshold is tuned later.

    No threshold is chosen here; this only trains the classifier.

    Args:
        X_train: Training feature matrix.
        y_class_train: Training binary labels.

    Returns:
        Dict with the single key 'Random Forest (Threshold Tuning)' mapped to
        the fitted classifier.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TRAINING WITH THRESHOLD TUNING")
    print(rule)

    # Train standard Random Forest
    forest_params = dict(
        n_estimators=100, max_depth=15, min_samples_leaf=5,
        min_samples_split=10, max_features='sqrt', class_weight='balanced',
        n_jobs=-1, random_state=RANDOM_STATE
    )
    forest = RandomForestClassifier(**forest_params)
    forest.fit(X_train, y_class_train)

    models = {'Random Forest (Threshold Tuning)': forest}
    return models


def evaluate_with_threshold(model, X_test, y_class_test, thresholds=(0.3, 0.35, 0.4, 0.45, 0.5)):
    """Evaluate a classifier at several probability cut-offs and keep the best.

    For each threshold, the positive-class probability from
    `model.predict_proba` is cut at that value and F1, precision, recall, and
    accuracy are printed. The threshold with the highest F1 wins (the
    earliest one on ties), and its confusion matrix is printed.

    NOTE(review): the threshold is selected on the same data it is scored on,
    so the reported metrics are optimistic when this is called with the test
    split. A separate validation split would remove that bias.

    Args:
        model: Fitted classifier exposing `predict_proba`.
        X_test: Feature matrix to score.
        y_class_test: True binary labels for `X_test`.
        thresholds: Probability cut-offs to try. When empty, 0.5 is used.

    Returns:
        Dict with keys 'accuracy', 'precision', 'recall', 'f1', 'auc' (always
        None here), 'predictions', 'probabilities', and 'threshold' (the
        selected cut-off).
    """
    print("\n" + "=" * 60)
    print("THRESHOLD TUNING RESULTS")
    print("=" * 60)

    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Fall back to the conventional cut-off so a result is always produced.
    candidates = list(thresholds) or [0.5]

    # Start below any attainable F1 so the first candidate is always recorded;
    # starting at 0 left best_results as None when every F1 was 0.
    best_f1 = -1.0
    best_threshold = candidates[0]
    best_results = None

    for threshold in candidates:
        y_pred = (y_pred_proba >= threshold).astype(int)

        f1 = f1_score(y_class_test, y_pred)
        precision = precision_score(y_class_test, y_pred)
        recall = recall_score(y_class_test, y_pred)
        accuracy = accuracy_score(y_class_test, y_pred)

        print(f"\nThreshold: {threshold:.2f}")
        print(f"  F1:        {f1:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall:    {recall:.4f}")
        print(f"  Accuracy:  {accuracy:.4f}")

        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            best_results = {
                'accuracy': accuracy, 'precision': precision,
                'recall': recall, 'f1': f1, 'auc': None,
                'predictions': y_pred, 'probabilities': y_pred_proba,
                # Kept so the reported metrics can be reproduced later.
                'threshold': threshold
            }

    print(f"\n*** Best threshold: {best_threshold:.2f} with F1: {best_f1:.4f} ***")

    # Show confusion matrix for best threshold; labels pins a 2x2 layout even
    # when only one class occurs in the labels or predictions.
    cm = confusion_matrix(y_class_test, best_results['predictions'], labels=[0, 1])
    print(f"\nConfusion Matrix (threshold={best_threshold}):")
    print("                Predicted")
    print("              Low    High")
    print(f"Actual Low   {cm[0,0]:5d}  {cm[0,1]:5d}")
    print(f"      High   {cm[1,0]:5d}  {cm[1,1]:5d}")

    return best_results


def compare_all_approaches(X_train, X_test, y_class_train, y_class_test):
    """Compare all imbalance handling approaches.

    Trains the SMOTE, undersampling, SMOTETomek, and custom-weight models and
    scores each with `evaluate_classification_model`; then trains the
    threshold-tuning model and scores it with `evaluate_with_threshold`.
    Finally prints every strategy ranked by F1.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_class_train: Training binary labels.
        y_class_test: Test binary labels.

    Returns:
        Dict mapping strategy name to its metrics dict.
    """
    all_results = {}

    # 1-4. Strategies scored at the classifier's default decision rule,
    # in the order SMOTE, undersampling, SMOTETomek, custom weights.
    default_rule_trainers = (
        train_with_smote,
        train_with_undersampling,
        train_with_combined_sampling,
        train_with_adjusted_weights,
    )
    for trainer in default_rule_trainers:
        fitted = trainer(X_train, y_class_train)
        for name, model in fitted.items():
            all_results[name] = evaluate_classification_model(model, X_test, y_class_test, name)

    # 5. Threshold Tuning
    tuned = train_with_threshold_tuning(X_train, y_class_train)
    for name, model in tuned.items():
        all_results[name] = evaluate_with_threshold(model, X_test, y_class_test)

    # Comparison
    print("\n" + "=" * 80)
    print("FINAL COMPARISON - ALL STRATEGIES")
    print("=" * 80)

    def f1_of(entry):
        # entry is a (name, metrics) pair from all_results.items()
        return entry[1]['f1']

    for name, res in sorted(all_results.items(), key=f1_of, reverse=True):
        print(f"{name:40s}: F1={res['f1']:.4f}, Recall={res['recall']:.4f}, Precision={res['precision']:.4f}")

    top_name, top_metrics = max(all_results.items(), key=f1_of)
    print(f"\n*** BEST MODEL: {top_name} with F1={top_metrics['f1']:.4f} ***")

    return all_results

In [32]:
# Installs imbalanced-learn, which provides the `imblearn` modules imported by
# the resampling cell. On a fresh kernel, run this before that import cell.
!pip install imbalanced-learn



In [36]:
# PART 1: train and evaluate the baseline models on a copy of the raw data.
print("\n\n" + "=" * 80)
print("PART 1: BASELINE MODELS")
print("=" * 80 + "\n")

results_1 = main(original_news_df.copy())

# Extract variables for Part 2
X_train = results_1['X_train']
X_test = results_1['X_test']
y_class_train = results_1['y_class_train']
y_class_test = results_1['y_class_test']

print("\n" + "=" * 80)
print("PART 1 SUMMARY")
print("=" * 80)
print(f"Best Baseline Model: {results_1['best_clf_name']}")
print(f"  F1-Score:  {results_1['best_clf_result']['f1']:.4f}")
print(f"  Recall:    {results_1['best_clf_result']['recall']:.4f}")
print(f"  Precision: {results_1['best_clf_result']['precision']:.4f}")
print(f"  Accuracy:  {results_1['best_clf_result']['accuracy']:.4f}")


# PART 2: imbalance handling, on the same train/test split as Part 1.
print("\n\n" + "=" * 80)
print("PART 2: IMBALANCE HANDLING TECHNIQUES")
print("=" * 80 + "\n")

results_2 = compare_all_approaches(X_train, X_test, y_class_train, y_class_test)


# Final comparison: best baseline classifier vs best imbalance strategy.
print("\n\n" + "=" * 80)
print("FINAL COMPARISON: BASELINE vs IMPROVED")
print("=" * 80)

baseline_f1 = results_1['best_clf_result']['f1']
baseline_recall = results_1['best_clf_result']['recall']

print(f"\nBASELINE ({results_1['best_clf_name']}):")
print(f"  F1-Score:  {baseline_f1:.4f}")
print(f"  Recall:    {baseline_recall:.4f}")

best_improved = max(results_2.items(), key=lambda x: x[1]['f1'])
improved_f1 = best_improved[1]['f1']
improved_recall = best_improved[1]['recall']

print(f"\nBEST IMPROVED ({best_improved[0]}):")
print(f"  F1-Score:  {improved_f1:.4f}")
print(f"  Recall:    {improved_recall:.4f}")


def _relative_gain_pct(new_value, old_value):
    """Return the change from `old_value` to `new_value` in percent.

    A zero baseline (e.g. a baseline model that predicts no positives) has no
    finite relative change: +inf is returned for any gain, 0.0 otherwise.
    """
    if old_value == 0:
        return float('inf') if new_value > 0 else 0.0
    return (new_value - old_value) / old_value * 100


f1_improvement = _relative_gain_pct(improved_f1, baseline_f1)
recall_improvement = _relative_gain_pct(improved_recall, baseline_recall)

print(f"\nIMPROVEMENT:")
print(f"  F1-Score:  {f1_improvement:+.1f}%")
print(f"  Recall:    {recall_improvement:+.1f}%")

if f1_improvement > 0:
    print(f"\nSuccessfully improved model performance!")
else:
    print(f"\nBaseline model performed better. Consider using threshold tuning.")




PART 1: BASELINE MODELS

DATA EXPLORATION

Dataset shape: (39644, 61)
Missing values: 0

Target Statistics:
  Mean: 3395.38
  Median: 1400.00
  Std: 11626.95
  Min: 1.00
  Max: 843300.00

DATA PREPARATION
Final feature matrix: (39644, 59)

FEATURE CORRELATION ANALYSIS
Removing 2 highly correlated features
Final features: 57

Classification Target (Top 25%):
  Low shares:  30014 (75.7%)
  High shares: 9630 (24.3%)

TRAIN/TEST SPLIT
Training: 31715 samples (80.0%)
Testing:  7929 samples (20.0%)

REGRESSION MODELS

Training Ridge Regression...
Training Random Forest Regressor...
Training Gradient Boosting Regressor...

Ridge
RMSE: 0.8806 | R²: 0.1256

Random Forest
RMSE: 0.8578 | R²: 0.1703

Gradient Boosting
RMSE: 0.8483 | R²: 0.1885

REGRESSION MODEL COMPARISON
Gradient Boosting        : R² = 0.1885
Random Forest            : R² = 0.1703
Ridge                    : R² = 0.1256

CLASSIFICATION MODELS

Training Logistic Regression...
Training Random Forest Classifier...
Training Gradient

In [38]:
# Saving the best model
import pickle
import os

# google.colab exists only inside Colab; elsewhere the files are still
# written to the working directory and the browser download is skipped.
try:
    from google.colab import files
except ImportError:
    files = None

# Determine which model to save
use_improved = improved_f1 > baseline_f1

if use_improved:
    best_overall_name = best_improved[0]
    best_metrics = best_improved[1]

    # compare_all_approaches returns metrics only, so the winning model is
    # retrained here. 'SMOTETomek' must be tested before 'SMOTE': the latter
    # is a substring of the former, and the SMOTE trainer does not return a
    # 'Random Forest (SMOTETomek)' entry.
    if 'SMOTETomek' in best_overall_name:
        retrain = train_with_combined_sampling
    elif 'SMOTE' in best_overall_name:
        retrain = train_with_smote
    elif 'Undersample' in best_overall_name:
        retrain = train_with_undersampling
    elif 'Custom Weights' in best_overall_name:
        retrain = train_with_adjusted_weights
    else:  # Threshold Tuning
        retrain = train_with_threshold_tuning

    print(f"\nRetraining {best_overall_name} for saving...")
    best_overall_model = retrain(X_train, y_class_train)[best_overall_name]

    print(f"Saving improved model: {best_overall_name}")
else:
    best_overall_name = results_1['best_clf_name']
    best_metrics = results_1['best_clf_result']
    best_overall_model = results_1['clf_models'][best_overall_name]
    print(f"\nSaving baseline model: {best_overall_name}")

# Save the model to Colab session storage
print("\nSaving files...")
with open('best_news_shares_model.pkl', 'wb') as f:
    pickle.dump(best_overall_model, f)

# Save feature names
with open('feature_names.pkl', 'wb') as f:
    pickle.dump(results_1['feature_names'].tolist(), f)

# Save model metadata
model_metadata = {
    'model_name': best_overall_name,
    'f1_score': best_metrics['f1'],
    'recall': best_metrics['recall'],
    'precision': best_metrics['precision'],
    'accuracy': best_metrics['accuracy'],
    # Quantile of `shares` used to define the positive class; this is not
    # the probability cut-off applied to predictions.
    'classification_threshold': CLASSIFICATION_THRESHOLD,
    # Probability cut-off behind the metrics above, when the evaluation
    # result records one under 'threshold'; None otherwise. For a
    # threshold-tuned model, model.predict() alone does not apply this
    # cut-off, so consumers must threshold predict_proba themselves.
    'decision_threshold': best_metrics.get('threshold'),
    'feature_count': len(results_1['feature_names']),
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
}

with open('model_metadata.pkl', 'wb') as f:
    pickle.dump(model_metadata, f)

print("Files saved to Colab session")

# Download to your computer
print("\n" + "=" * 60)
print("=" * 60)
print("\nDownloading 3 files...")

# (file name, label used in the error message) for each artifact.
artifacts = (
    ('best_news_shares_model.pkl', 'model'),
    ('feature_names.pkl', 'feature names'),
    ('model_metadata.pkl', 'metadata'),
)

if files is None:
    print("google.colab is not available; skipping browser downloads.")
else:
    for filename, label in artifacts:
        try:
            files.download(filename)
            print(f"Downloaded: {filename}")
        except Exception as e:
            print(f"Error downloading {label}: {e}")

print("\n" + "=" * 60)
print("Download Complete")
print("=" * 60)
print(f"\nModel Details:")
print(f"  Name:      {model_metadata['model_name']}")
print(f"  F1-Score:  {model_metadata['f1_score']:.4f}")
print(f"  Recall:    {model_metadata['recall']:.4f}")
print(f"  Precision: {model_metadata['precision']:.4f}")
print(f"  Accuracy:  {model_metadata['accuracy']:.4f}")
print(f"  Features:  {model_metadata['feature_count']}")
print(f"  Trained:   {model_metadata['training_date']}")

print("\n" + "=" * 80)
print("Finish!")
print("=" * 80)


Retraining Random Forest (Threshold Tuning) for saving...

TRAINING WITH THRESHOLD TUNING
Saving improved model: Random Forest (Threshold Tuning)

Saving files...
Files saved to Colab session


Downloading 3 files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: best_news_shares_model.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: feature_names.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: model_metadata.pkl

Download Complete

Model Details:
  Name:      Random Forest (Threshold Tuning)
  F1-Score:  0.4842
  Recall:    0.6592
  Precision: 0.3827
  Accuracy:  0.6596
  Features:  57
  Trained:   2025-11-02 23:58:07

Finish!
