In [1]:
# =========================================================
# SMOTE EXPERIMENT (SEPARATE FILE)
# =========================================================

import numpy as np
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, confusion_matrix

from imblearn.over_sampling import SMOTE
from scipy.sparse import csr_matrix, hstack

In [2]:

# =========================================================
# LOAD DATASETS
# =========================================================
datasets = [
    pd.read_csv("Youtube01-Psy.csv"),
    pd.read_csv("Youtube02-KatyPerry.csv"),
    pd.read_csv("Youtube03-LMFAO.csv"),
    pd.read_csv("Youtube04-Eminem.csv"),
    pd.read_csv("Youtube05-Shakira.csv")
]


In [3]:
# =========================================================
# TEXT CLEANING
# =========================================================
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    tokens = [w for w in text.split() if w not in stop_words]
    return " ".join(tokens)

for df in datasets:
    df["clean_content"] = df["CONTENT"].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hatice\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# =========================================================
# EXTRA FEATURES
# =========================================================
def add_extra_features(df):
    df = df.copy()
    df["comment_len"] = df["CONTENT"].astype(str).apply(len)
    df["url_count"] = df["CONTENT"].str.count("http")
    df["excl_count"] = df["CONTENT"].str.count("!")
    df["upper_ratio"] = df["CONTENT"].apply(
        lambda x: sum(1 for c in str(x) if c.isupper()) / (len(str(x)) + 1)
    )
    return df

datasets = [add_extra_features(df) for df in datasets]

In [5]:
# =========================================================
# MODEL EVALUATION FUNCTION (SAME AS BASELINE)
# =========================================================
def run_all_models(X_train, X_test, y_train, y_test):
    models = {
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
        "Bernoulli NB": BernoulliNB(),
        "Random Forest": RandomForestClassifier(random_state=42),
        "SVM Linear": LinearSVC(dual=False, max_iter=2000, random_state=42),
        "Bagging DT": BaggingClassifier(
             estimator=DecisionTreeClassifier(random_state=42),
            n_estimators=20,
           random_state=42
        ),
        "AdaBoost": AdaBoostClassifier(n_estimators=50, random_state=42),
        "Stacking": StackingClassifier(
            estimators=[
               ("dt", DecisionTreeClassifier(max_depth=5)),
                ("nb", BernoulliNB())
         ],
           final_estimator=LogisticRegression()
         )
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        results[name] = {
            "Accuracy": accuracy_score(y_test, y_pred) * 100,
            "SC": (tp / (tp + fn) * 100) if (tp + fn) > 0 else 0,
            "BH": (fp / (tn + fp) * 100) if (tn + fp) > 0 else 0,
            "F1": f1_score(y_test, y_pred) ,
            "MCC": matthews_corrcoef(y_test, y_pred)
        }

    return pd.DataFrame(results).T

In [6]:
# =========================================================
# TRAIN + SMOTE / TEST (REAL DATA)
# =========================================================
def prepare_train_test_with_smote(df, target_size, test_size=0.30, random_state=42):

    X_text = df["clean_content"]
    y = df["CLASS"]

    tfidf = TfidfVectorizer(
        max_features=3000,
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.85,
        stop_words="english",
        sublinear_tf=True
    )

    X_tfidf = tfidf.fit_transform(X_text)

    extra = csr_matrix(
        df[["comment_len", "url_count", "excl_count", "upper_ratio"]].values
    )

    X = hstack([X_tfidf, extra])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    smote = SMOTE(
        sampling_strategy={0: target_size, 1: target_size},
        random_state=random_state,
        k_neighbors=3
    )

    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    return X_train_smote, X_test, y_train_smote, y_test

In [7]:
# =========================================================
# RUN SMOTE EXPERIMENTS (1000 → 5000)
# =========================================================
def run_smote_experiments(df, sizes=[1000,5000]):
    results = {}

    for size in sizes:
        print(f"\nRunning SMOTE with {size} samples per class")

        X_train, X_test, y_train, y_test = prepare_train_test_with_smote(
            df,
            target_size=size
        )

        results[size] = run_all_models(X_train, X_test, y_train, y_test)

    return results

In [8]:
# =========================================================
# FINAL RUN
# =========================================================

all_smote_results = {}

for i, df in enumerate(datasets):
    print(f"\n================ DATASET {i+1} ================")

    res = run_smote_experiments(df)
    print("DEBUG sizes:", res.keys())   # ← ÇOK ÖNEMLİ

    all_smote_results[f"Dataset_{i+1}"] = res




Running SMOTE with 1000 samples per class

Running SMOTE with 5000 samples per class
DEBUG sizes: dict_keys([1000, 5000])


Running SMOTE with 1000 samples per class

Running SMOTE with 5000 samples per class
DEBUG sizes: dict_keys([1000, 5000])


Running SMOTE with 1000 samples per class

Running SMOTE with 5000 samples per class
DEBUG sizes: dict_keys([1000, 5000])


Running SMOTE with 1000 samples per class

Running SMOTE with 5000 samples per class
DEBUG sizes: dict_keys([1000, 5000])


Running SMOTE with 1000 samples per class

Running SMOTE with 5000 samples per class
DEBUG sizes: dict_keys([1000, 5000])


In [9]:
# =========================================================
# AGGREGATE RESULTS (ONLY TABLE 6 & 7)
# =========================================================

final_tables = {1000: [], 5000: []}

for dataset_name, size_results in all_smote_results.items():
    for size in [1000, 5000]:
        if size in size_results:
            final_tables[size].append(size_results[size])

for size in [1000, 5000]:
    print(f"\n\n================ TABLE ({size} spam + {size} ham) ================")

    print("DEBUG table count:", len(final_tables[size]))

    if len(final_tables[size]) == 0:
        print("❌ NO DATA FOUND FOR THIS SIZE")
        continue

    combined = pd.concat(final_tables[size])
    mean_table = combined.groupby(combined.index).mean()

    print(mean_table.round(4))





DEBUG table count: 5
                     Accuracy       SC      BH      F1     MCC
AdaBoost              89.6075  80.8401  1.3832  0.8861  0.8060
Bagging DT            92.1361  87.7163  3.4221  0.9172  0.8470
Bernoulli NB          88.5103  80.4180  3.2972  0.8697  0.7877
Decision Tree         91.5327  88.5500  5.5873  0.9117  0.8323
Logistic Regression   91.7839  87.8951  4.4825  0.9142  0.8386
Random Forest         94.0093  89.8182  1.7606  0.9364  0.8839
SVM Linear            93.3573  91.2537  4.5815  0.9324  0.8678
Stacking              92.8374  89.4721  3.6201  0.9260  0.8592


DEBUG table count: 5
                     Accuracy       SC      BH      F1     MCC
AdaBoost              90.1687  81.7237  1.0553  0.8932  0.8166
Bagging DT            92.7109  88.4970  3.0831  0.9239  0.8572
Bernoulli NB          90.1197  83.9970  3.6746  0.8925  0.8144
Decision Tree         91.7265  88.3826  4.9316  0.9138  0.8363
Logistic Regression   92.8775  90.0999  4.4825  0.9269  0.8591
Random Fo