In [1]:
import re, string, joblib, pandas as pd
from pathlib import Path
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, GridSearchCV
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [3]:
DATA_DIR = Path("prepare_datasets/ISO Fake News")
TEXT_COL = "full_text"


def _load_labelled(csv_path, label):
    """Read one news CSV and return it with a merged text column and a label.

    The file's "title" and "text" columns are joined into ``TEXT_COL`` and then
    dropped; every row receives ``label`` in a "label" column. Any other
    columns present in the CSV are kept as they are.

    Parameters
    ----------
    csv_path : path-like
        Location of the CSV file; it must contain "title" and "text" columns.
    label : str
        Class name assigned to every row of this file.

    Returns
    -------
    pandas.DataFrame
        A new frame (the frame read from disk is not mutated).
    """
    frame = pd.read_csv(csv_path)

    # A missing title or body is read as NaN, and NaN + str is NaN: without the
    # fill, the whole document would be lost and later stringified as "nan".
    title = frame["title"].fillna("").astype(str)
    body = frame["text"].fillna("").astype(str)

    # Join with a space so the last title word and the first body word remain
    # two separate tokens; strip the padding left when one side is empty.
    combined = (title + " " + body).str.strip()

    return (
        frame
        .drop(columns=["title", "text"])
        .assign(**{TEXT_COL: combined, "label": label})
    )


real_df = _load_labelled(DATA_DIR / "real.csv", "real")
fake_df = _load_labelled(DATA_DIR / "fake.csv", "fake")

# Stack both classes and shuffle the rows with a fixed seed so that the row
# order is repeatable between runs.
df = pd.concat(
    [real_df[[TEXT_COL, "label"]], fake_df[[TEXT_COL, "label"]]],
    ignore_index=True,
).sample(frac=1, random_state=42)

X_raw, y = df[TEXT_COL], df["label"]

In [5]:
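# INFO: Run once per environment to download the required NLTK data packages (stopwords, punkt, wordnet, omw-1.4); uncomment on a fresh machine.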
#import nltk
#for pkg in ("stopwords", "punkt", "wordnet", "omw-1.4"):
    #nltk.download(pkg)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maxis\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maxis\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maxis\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\maxis\AppData\Roaming\nltk_data...


In [6]:
# English stop words held in a set for fast membership checks, plus a single
# WordNet lemmatizer instance shared by every call to the cleaning function.
stops = {*stopwords.words("english")}
lemmatizer = WordNetLemmatizer()

# URLs: anything starting with "http" or "www." up to the next whitespace.
# Matching is case-insensitive because this pattern is applied BEFORE the text
# is lower-cased; otherwise "HTTP://..." or "WWW...." would survive and turn
# into a junk token once punctuation is stripped.
URL_RE  = re.compile(r"http\S+|www\.\S+", re.IGNORECASE)
# Social-media style mentions (@user) and hashtags (#topic).
TAG_RE  = re.compile(r"[@#]\w+")
# Translation table that deletes every ASCII punctuation character.
PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

def clean(text: str) -> str:
    """Normalise one document into a space-separated string of lemmas.

    Steps, in order: replace URLs and @/# tags with a space, delete ASCII
    punctuation, lower-case, split on whitespace, discard stop words and
    tokens of two characters or fewer, lemmatize what remains.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (empty string if none).
    """
    without_noise = TAG_RE.sub(" ", URL_RE.sub(" ", text))
    without_punct = without_noise.translate(PUNCTUATION_TABLE)

    lemmas = []
    for word in without_punct.lower().split():
        # Skip very short tokens and stop words before paying for lemmatization.
        if len(word) <= 2 or word in stops:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return " ".join(lemmas)

# Coerce every entry to str first so that clean() always receives a string.
X_text = X_raw.astype(str)
X = X_text.apply(clean)

In [7]:
# Carve out the hold-out set BEFORE tuning. If the grid search sees every row,
# any later hold-out score is computed on data that already influenced the
# choice of hyper-parameters and is therefore optimistically biased.
# Same split arguments (20 %, stratified, seed 42) as the evaluation step, so
# re-running that split yields identical partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# TF-IDF features feeding a linear SVM. The ngram_range / min_df / max_df
# values given here are only starting points: the grid below overrides them.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="word",
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.7,
        sublinear_tf=True,
        stop_words="english",
        norm="l2",
    )),
    # random_state pins the solver's internal shuffling so repeated runs of the
    # same configuration give the same model.
    ("clf", LinearSVC(class_weight="balanced", random_state=42)),
])

# 3 * 3 * 3 * 6 * 2 = 324 candidate settings; with 5 folds that is 1620 fits,
# so expect this cell to be the slowest step of the notebook.
param_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "tfidf__min_df": [1, 2, 5],
    "tfidf__max_df": [0.7, 0.85, 0.95],
    "clf__C": [0.1, 0.5, 1, 2, 5, 10],
    "clf__loss": ["hinge", "squared_hinge"],
}

# Stratified, shuffled and seeded folds: class balance is preserved per fold
# and the fold assignment is repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
)

# Tune on the training portion only; the test portion stays untouched.
grid.fit(X_train, y_train)

print(f"Best 5-fold accuracy: {grid.best_score_:.4f}")
print("Best params:", grid.best_params_)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


KeyboardInterrupt: 

In [None]:
# Stratified 80/20 hold-out split with a fixed seed for repeatability.
# NOTE(review): if the grid search was fitted on all of X, these test rows
# overlap the data used for tuning, so the score below would be optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.20,
    random_state=42,
)

# Refit the winning pipeline on the training portion only. Pipeline.fit
# returns the estimator itself, so best_model is grid.best_estimator_
# (the same object, refitted in place).
best_model = grid.best_estimator_.fit(X_train, y_train)

# Score the refitted model on the held-out rows.
y_pred = best_model.predict(X_test)
print("Hold-out accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))