**MIDTERM PROJECT**

In [25]:
# =========================================================
# IMPORTS
# =========================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, StackingClassifier

from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, confusion_matrix

plt.style.use("seaborn-v0_8")


In [26]:

# =========================================================
# LOAD DATASETS
# =========================================================
datasets = [
    pd.read_csv("Youtube01-Psy.csv"),
    pd.read_csv("Youtube02-KatyPerry.csv"),
    pd.read_csv("Youtube03-LMFAO.csv"),
    pd.read_csv("Youtube04-Eminem.csv"),
    pd.read_csv("Youtube05-Shakira.csv")
]

In [27]:

# =========================================================
# TEXT CLEANING
# =========================================================
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    tokens = [w for w in text.split() if w not in stop_words]
    return " ".join(tokens)

for df in datasets:
    df["clean_content"] = df["CONTENT"].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hatice\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:

# =========================================================
# EXTRA FEATURES
# =========================================================
def add_extra_features(df):
    df = df.copy()
    df["comment_len"] = df["CONTENT"].astype(str).apply(len)
    df["url_count"] = df["CONTENT"].str.count("http")
    df["excl_count"] = df["CONTENT"].str.count("!")
    df["upper_ratio"] = df["CONTENT"].apply(
        lambda x: sum(1 for c in str(x) if c.isupper()) / (len(str(x)) + 1)
    )
    return df

datasets = [add_extra_features(df) for df in datasets]




In [29]:

# =========================================================
# TF-IDF + SPLIT
# =========================================================
def prepare_train_test(df, test_size=0.30):
    X_text = df["clean_content"]
    y = df["CLASS"]

    tfidf = TfidfVectorizer(
        max_features=4000,
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.85,
        stop_words="english",
        sublinear_tf=True
    )

    X = tfidf.fit_transform(X_text)

    from scipy.sparse import hstack, csr_matrix
    extra = csr_matrix(df[["comment_len", "url_count", "excl_count", "upper_ratio"]].values)
    X = hstack([X, extra])

    return X, y




In [30]:

# =========================================================
# MODEL FUNCTION
# =========================================================
def run_all_models(X_train, X_test, y_train, y_test):
    models = {
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
        "Bernoulli NB": BernoulliNB(),
        "Random Forest": RandomForestClassifier(random_state=42),

        "SVM Linear": SVC(kernel="linear"),

        "Bagging DT": BaggingClassifier(
            estimator=DecisionTreeClassifier(random_state=42),
            n_estimators=20,
            random_state=42
        ),

        "AdaBoost": AdaBoostClassifier(
            n_estimators=50,
            random_state=42
        ),

        "Stacking": StackingClassifier(
            estimators=[
                ("dt", DecisionTreeClassifier(max_depth=5)),
                ("nb", BernoulliNB())
            ],
            final_estimator=LogisticRegression()
        )
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        results[name] = {
            "Accuracy": accuracy_score(y_test, y_pred) * 100,
            "SC": (tp / (tp + fn) * 100) if (tp + fn) > 0 else 0,
            "BH": (fp / (tn + fp) * 100) if (tn + fp) > 0 else 0,
            "F1": f1_score(y_test, y_pred) * 100,
            "MCC": matthews_corrcoef(y_test, y_pred)
        }

    return results

In [31]:

# =========================================================
# REPEATED EXPERIMENT (FIXED)
# =========================================================
def run_repeated_experiment(X, y, n_runs=5):
    all_results = []

    for i in range(n_runs):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=0.3,
            random_state=42 + i,
            stratify=y
        )
        all_results.append(run_all_models(X_train, X_test, y_train, y_test))

    models = all_results[0].keys()
    metrics = all_results[0][list(models)[0]].keys()

    summary = {}
    for model in models:
        summary[model] = {}
        for metric in metrics:
            vals = [res[model][metric] for res in all_results]
            summary[model][metric] = f"{np.mean(vals):.2f} ± {np.std(vals):.2f}"

    return pd.DataFrame(summary).T

In [32]:

# =========================================================
# FINAL RUN
# =========================================================
all_tables = []

for df in datasets:
    X, y = prepare_train_test(df)
    all_tables.append(run_repeated_experiment(X, y))

final_table = pd.concat(all_tables).groupby(level=0).agg("first")

print("\nFINAL RESULTS\n")
print(final_table)



FINAL RESULTS

                         Accuracy            SC           BH            F1  \
AdaBoost             92.19 ± 1.11  87.40 ± 2.59  3.03 ± 1.52  91.76 ± 1.30   
Bagging DT           92.76 ± 2.05  89.27 ± 4.37  3.81 ± 1.71  92.42 ± 2.33   
Bernoulli NB         94.10 ± 2.51  97.71 ± 1.87  9.50 ± 4.92  94.33 ± 2.35   
Decision Tree        91.62 ± 2.72  90.81 ± 3.13  7.60 ± 3.16  91.52 ± 2.81   
Logistic Regression  92.38 ± 2.63  87.38 ± 4.02  2.66 ± 1.50  91.91 ± 2.96   
Random Forest        94.10 ± 1.11  93.88 ± 3.10  5.71 ± 1.24  94.04 ± 1.27   
SVM Linear           95.81 ± 1.77  93.12 ± 3.78  1.52 ± 1.41  95.64 ± 1.94   
Stacking             96.19 ± 0.85  95.43 ± 1.92  3.03 ± 2.26  96.16 ± 0.85   

                             MCC  
AdaBoost             0.85 ± 0.02  
Bagging DT           0.86 ± 0.04  
Bernoulli NB         0.89 ± 0.05  
Decision Tree        0.83 ± 0.05  
Logistic Regression  0.85 ± 0.05  
Random Forest        0.88 ± 0.02  
SVM Linear           0.92 ± 0.03  
S