In [None]:
!pip install textstat sentence-transformers xgboost

import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import textstat
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


# CONFIG
SEED = 42
TFIDF_MAX_FEATURES = 5000     # original tf-idf dim before SVD
SVD_COMPONENTS = 300          # reduce TF-IDF to this many dims
BERT_MODEL_NAME = "distilbert-base-uncased"  # lightweight BERT-family
BERT_MAX_LEN = 256
BERT_BATCH_SIZE = 32          # increase if GPU and memory allow
RANDOM_STATE = SEED

# DEVICE
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

device: cpu


In [None]:
# LOAD DATA
fake_df = pd.read_csv('./Fake.csv')
true_df = pd.read_csv('./True.csv')
# fake_news_df = pd.read_csv ('./fake_news_dataset.csv')


shape = [
    fake_df.describe (),
    '--------------------------------------------',
    true_df.describe (),
    '--------------------------------------------',
    # fake_news_df.describe ()
]

shape

[                                                    title   text subject  \
 count                                               23481  23481   23481   
 unique                                              17903  17455       6   
 top     MEDIA IGNORES Time That Bill Clinton FIRED His...           News   
 freq                                                    6    626    9050   
 
                 date  
 count          23481  
 unique          1681  
 top     May 10, 2017  
 freq              46  ,
 '--------------------------------------------',
                                                     title  \
 count                                               21417   
 unique                                              20826   
 top     Factbox: Trump fills top jobs for his administ...   
 freq                                                   14   
 
                                                      text       subject  \
 count                                               21

In [None]:
# Add labels
fake_df['label'] = 0
true_df['label'] = 1

# Combine datasets
df = pd.concat([fake_df, true_df], ignore_index=True)
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')


df

Unnamed: 0,title,text,subject,date,label,content
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0,Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0,Sheriff David Clarke Becomes An Internet Joke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0,Trump Is So Obsessed He Even Has Obama’s Name...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0,Pope Francis Just Called Out Donald Trump Dur...
...,...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1,'Fully committed' NATO backs new U.S. approach...
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1,LexisNexis withdrew two products from Chinese ...
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1,Minsk cultural hub becomes haven from authorit...
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1,Vatican upbeat on possibility of Pope Francis ...


In [None]:
# ---------------------------
# SPLIT (feature engineering)
# ---------------------------
X_temp, X_test, y_temp, y_test = train_test_split(
    df["content"], df["label"], test_size=0.2, stratify=df["label"], random_state=RANDOM_STATE
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=RANDOM_STATE
)  # 60/20/20

print("shapes:", X_train.shape, X_val.shape, X_test.shape)

shapes: (26938,) (8980,) (8980,)


In [None]:
# =====================
# HANDCRAFTED FEATURES FUNCTION
# =====================
def handcrafted_features(texts):
    features = pd.DataFrame()
    features['len_text'] = texts.apply(lambda x: len(x))
    features['num_words'] = texts.apply(lambda x: len(x.split()))
    features['avg_word_length'] = texts.apply(lambda x: np.mean([len(w) for w in x.split()]))
    features['num_sentences'] = texts.apply(lambda x: x.count('.'))
    features['num_uppercase'] = texts.apply(lambda x: sum(1 for c in x if c.isupper()))
    features['exclamation_ratio'] = texts.apply(lambda x: x.count('!')/(len(x)+1))
    features['question_ratio'] = texts.apply(lambda x: x.count('?')/(len(x)+1))
    features['dot_ratio'] = texts.apply(lambda x: x.count('.')/(len(x)+1))
    features['comma_ratio'] = texts.apply(lambda x: x.count(',')/(len(x)+1))
    features['semicolon_ratio'] = texts.apply(lambda x: x.count(';')/(len(x)+1))
    features['colon_ratio'] = texts.apply(lambda x: x.count(':')/(len(x)+1))
    features['quote_ratio'] = texts.apply(lambda x: x.count('"')/(len(x)+1))
    features['parenthesis_ratio'] = texts.apply(lambda x: x.count('(')/(len(x)+1))
    features['bracket_ratio'] = texts.apply(lambda x: x.count('[')/(len(x)+1))
    features['backslash_ratio'] = texts.apply(lambda x: x.count('\\')/(len(x)+1))
    features['bar_ratio'] = texts.apply(lambda x: x.count('|')/(len(x)+1))
    features['dollar_ratio'] = texts.apply(lambda x: x.count('$')/(len(x)+1))
    features['percent_ratio'] = texts.apply(lambda x: x.count('%')/(len(x)+1))
    features['ampersand_ratio'] = texts.apply(lambda x: x.count('&')/(len(x)+1))
    features['star_ratio'] = texts.apply(lambda x: x.count('*')/(len(x)+1))
    features['at_ratio'] = texts.apply(lambda x: x.count('@')/(len(x)+1))
    features['hash_ratio'] = texts.apply(lambda x: x.count('#')/(len(x)+1))
    features['caret_ratio'] = texts.apply(lambda x: x.count('^')/(len(x)+1))
    features['tilde_ratio'] = texts.apply(lambda x: x.count('~')/(len(x)+1))
    features['backtick_ratio'] = texts.apply(lambda x: x.count('`')/(len(x)+1))
    features['readability'] = texts.apply(lambda x: textstat.flesch_reading_ease(x) if len(x)>0 else 0)
    return features

# =====================
# CALCULATE FEATURES FOR EACH SPLIT
# =====================
X_train_hand = handcrafted_features(X_train)
X_val_hand = handcrafted_features(X_val)
X_test_hand = handcrafted_features(X_test)


In [None]:
# ---------------------------
# TF-IDF (fit on train only) + SVD to reduce memory
# ---------------------------
tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

# reduce tfidf dimension
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=RANDOM_STATE)
X_train_tfidf_svd = svd.fit_transform(X_train_tfidf)
X_val_tfidf_svd = svd.transform(X_val_tfidf)
X_test_tfidf_svd = svd.transform(X_test_tfidf)


In [None]:
# # ---------------------------
# # BERT Embeddings (batched, memory-savvy)
# # ---------------------------
# tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
# bert_model = AutoModel.from_pretrained(BERT_MODEL_NAME).to(device)
# bert_model.eval()

# def encode_texts_bert(texts, batch_size=BERT_BATCH_SIZE, max_len=BERT_MAX_LEN):
#     all_embs = []
#     it = range(0, len(texts), batch_size)
#     for i in tqdm(it, desc="BERT batches", total=(len(texts)+batch_size-1)//batch_size):
#         batch_texts = texts[i:i+batch_size]
#         enc = tokenizer(
#             list(batch_texts),
#             padding=True,
#             truncation=True,
#             max_length=max_len,
#             return_tensors="pt"
#         ).to(device)
#         with torch.no_grad():
#             out = bert_model(**enc)
#             # use mean pooling of token embeddings (or CLS: out.last_hidden_state[:,0,:])
#             token_emb = out.last_hidden_state  # (B, L, D)
#             attention_mask = enc["attention_mask"].unsqueeze(-1)  # (B, L, 1)
#             token_emb = token_emb * attention_mask
#             summed = token_emb.sum(dim=1)
#             counts = attention_mask.sum(dim=1).clamp(min=1)
#             mean_pooled = (summed / counts).cpu().numpy()
#             all_embs.append(mean_pooled)
#     return np.vstack(all_embs)


model_st = SentenceTransformer("all-MiniLM-L6-v2", device=device)

def encode_texts_st(texts, batch_size=32):
    """
    Encode list/Series of texts into sentence embeddings using SentenceTransformer.
    """
    # sentence-transformers đã tự động batch trong encode
    emb = model_st.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=False
    )
    return emb

# encode each split
X_train_emb = encode_texts_st(X_train.tolist(), batch_size=BERT_BATCH_SIZE)
X_val_emb = encode_texts_st(X_val.tolist(), batch_size=BERT_BATCH_SIZE)
X_test_emb = encode_texts_st(X_test.tolist(), batch_size=BERT_BATCH_SIZE)

print("emb shapes:", X_train_emb.shape, X_val_emb.shape, X_test_emb.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/842 [00:00<?, ?it/s]

Batches:   0%|          | 0/281 [00:00<?, ?it/s]

Batches:   0%|          | 0/281 [00:00<?, ?it/s]

emb shapes: (26938, 384) (8980, 384) (8980, 384)


In [None]:
# ---------------------------
# COMBINE features: emb + tfidf_svd + handcrafted
# ---------------------------
def combine(emb, tfidf_svd, hand):
    # emb: (N, E), tfidf_svd: (N, S), hand: DataFrame (N, H)
    return np.hstack([emb, tfidf_svd, hand.values])

X_train_all = combine(X_train_emb, X_train_tfidf_svd, X_train_hand)
X_val_all = combine(X_val_emb, X_val_tfidf_svd, X_val_hand)
X_test_all = combine(X_test_emb, X_test_tfidf_svd, X_test_hand)


# def combine(tfidf_svd, hand):
#     # emb: (N, E), tfidf_svd: (N, S), hand: DataFrame (N, H)
#     return np.hstack([tfidf_svd, hand.values])

# X_train_all = combine(X_train_tfidf_svd, X_train_hand)
# X_val_all = combine(X_val_tfidf_svd, X_val_hand)
# X_test_all = combine(X_test_tfidf_svd, X_test_hand)

print("final feature shapes:", X_train_all.shape, X_val_all.shape, X_test_all.shape)

final feature shapes: (26938, 710) (8980, 710) (8980, 710)


In [None]:
# Khởi tạo mô hình
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    eval_metric="logloss",
    random_state=RANDOM_STATE
)

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

logreg = LogisticRegression(
    max_iter=5000,
    solver="saga",
    penalty="l2",
    C=0.5,
    random_state=RANDOM_STATE
)

# Chuẩn hóa dữ liệu cho LogisticRegression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_all)

# Train mô hình
print("Training XGBoost...")
xgb.fit(X_train_all, y_train)

print("Training RandomForest...")
rf.fit(X_train_all, y_train)

print("Training LogisticRegression...")
logreg.fit(X_train_scaled, y_train)

# In mô hình để kiểm tra
print("XGBoost model:", xgb)
print("RandomForest model:", rf)
print("LogisticRegression model:", logreg)


Training XGBoost...
Training RandomForest...
Training LogisticRegression...
XGBoost model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=300, n_jobs=None,
              num_parallel_tree=None, ...)
RandomForest model: RandomForestClassifier(max_depth=15, n_estimators=200, n_jobs=-1,
                       random_state=42)
Logisti

In [None]:
# ---------------------------
# Evaluate base models
# ---------------------------
def evaluate_model(model, X, y, name="Dataset"):
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:,1]
    print(f"--- {name} ---")
    print("Accuracy:", accuracy_score(y, y_pred))
    print("AUC:", roc_auc_score(y, y_prob))
    print(classification_report(y, y_pred))

print("Base model evaluations on Validation:")
evaluate_model(xgb, X_val_all, y_val, "XGBoost - Val")
evaluate_model(rf, X_val_all, y_val, "RF - Val")
evaluate_model(logreg, X_val_all, y_val, "LogReg - Val")


Base model evaluations on Validation:
--- XGBoost - Val ---
Accuracy: 0.9905345211581291
AUC: 0.9993697449020247
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4697
           1       0.99      0.99      0.99      4283

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

--- RF - Val ---
Accuracy: 0.9776169265033408
AUC: 0.9972465919921165
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      4697
           1       0.98      0.97      0.98      4283

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980

--- LogReg - Val ---
Accuracy: 0.49866369710467706
AUC: 0.5321448740685296
              precision    recall  f1-score   support

           0       1.00      0.04

# 1. Tóm lược kết quả

* XGBoost: accuracy ~0.990, AUC ~0.999, precision/recall/f1 đều ~0.99 cho cả hai lớp. Rất xuất sắc.
* RandomForest: accuracy ~0.978, AUC ~0.997, precision/recall/f1 ~0.98. Rất tốt.
* LogisticRegression: accuracy ~0.499, AUC ~0.532, biểu đồ báo precision/recall cho thấy model gần như dự đoán một nhãn duy nhất. Rất kém.

# 2. Giải thích nghĩa các chỉ số chính (nhắc nhanh)

* Accuracy: phần trăm dự đoán đúng.
* AUC: area under ROC; gần 1 là phân biệt tốt giữa hai lớp.
* Precision cho lớp i: trong số mẫu dự đoán là i, tỉ lệ đúng.
* Recall cho lớp i: trong số mẫu thực sự là i, tỉ lệ được dự đoán đúng.
* F1: trung bình điều hòa của precision và recall.

# 3. Diễn giải cụ thể cho LogisticRegression (vì đây vấn đề lớn)

Kết quả cho lớp 0: precision 1.00, recall 0.04, f1 0.08.
Kết quả cho lớp 1: precision 0.49, recall 1.00, f1 0.66.

Điều này cho thấy model dự đoán gần như mọi mẫu đều thuộc lớp 1. Hậu quả:

* recall lớp 1 gần 1.00 vì hầu hết mẫu lớp 1 đều được dự đoán là 1.
* precision lớp 0 = 1.00 vì rất ít mẫu nào model gán là lớp 0, nhưng những cái được gán là 0 thì đúng hầu hết.
* accuracy ~0.5 nghĩa tỉ lệ đúng gần bằng chọn nhãn phổ biến ngẫu nhiên với phân bố hiện tại.

Nguyên nhân khả dĩ:

1. model bị lệch ngưỡng quyết định; phân phối xác suất đầu ra p(y=1) bị đẩy cao khiến mọi thứ >0.5 thành lớp 1.
2. vấn đề trong dữ liệu huấn luyện hoặc nhãn (ví dụ trong quá trình tiền xử lý y bị đảo, hoặc nhãn không khớp với dữ liệu).
3. regularization quá mạnh hoặc feature không thích hợp khiến model underfit.
4. không đủ tiền xử lý đặc trưng phù hợp (scale, tương tác, nonlinearity).
5. multicollinearity, nhiều feature dư thừa làm optimizer khó tìm cực tiểu tốt với solver hiện tại.
6. lỗi trong pipeline: bạn có thể đã scale chỉ cho train hoặc chỉ cho test, gây mismatch.

# 4. Tại sao XGBoost và RF tốt

* Cả hai là cây quyết định ensemble, xử lý phi tuyến, tương tác giữa feature, không cần chuẩn hóa, chịu được nhiều feature và quan sát mạnh.
* Với nhiều feature quan trọng và tín hiệu phi tuyến, cây sẽ tận dụng tốt hơn.
* Chúng không dùng tối ưu gradient có dạng tuyến tính giống Logistic, nên không gặp cảnh báo hội tụ.

# 5. Kiểm tra nhanh bạn nên chạy ngay (code mẫu)

Chạy các kiểm tra sau để biết nguyên nhân cụ thể.

1. Kiểm tra phân bố nhãn dự đoán

```python
import numpy as np
probs = logreg.predict_proba(X_val_scaled)[:,1]
preds = logreg.predict(X_val_scaled)

print("mean prob:", probs.mean())
unique, counts = np.unique(preds, return_counts=True)
print("pred counts:", dict(zip(unique, counts)))
```

2. Confusion matrix và ROC

```python
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
print(confusion_matrix(y_val, preds))
print("AUC:", roc_auc_score(y_val, probs))

fpr, tpr, thr = roc_curve(y_val, probs)
```

3. So sánh điểm trên tập train để biết underfit/overfit

```python
train_probs = logreg.predict_proba(X_train_scaled)[:,1]
print("Train AUC:", roc_auc_score(y_train, train_probs))
print("Val AUC:", roc_auc_score(y_val, probs))
```

Nếu train AUC cũng thấp thì model underfit. Nếu train AUC cao nhưng val AUC thấp thì overfit.

# 6. Các bước khắc phục cụ thể

Thử theo trình tự sau.

A. Kiểm tra nhãn và pipeline

* Đảm bảo X_train, y_train, X_val, y_val tương ứng đúng.
* Đảm bảo scaler fit trên train và transform cả train/val theo đúng thứ tự.

B. Xem phân phối xác suất đầu ra

* Nếu xác suất tập trung gần 0 hoặc 1, có thể do regularization hoặc lỗi nhãn.

C. Tinh chỉnh LogisticRegression

* Thử tăng/giảm C (C lớn = regularization yếu).
* Thử solver khác: liblinear cho binary, saga cho sparse lớn.
* Thử penalty L1 để chọn feature.
  Ví dụ grid:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=200)),  # optional
    ("clf", LogisticRegression(random_state=RANDOM_STATE, solver="saga", max_iter=10000))
])

param_grid = {
    "pca__n_components": [100, 200, None],
    "clf__C": [0.01, 0.1, 1, 10],
    "clf__penalty": ["l2", "l1"]
}

gs = GridSearchCV(pipe, param_grid, scoring="roc_auc", cv=3, n_jobs=-1)
gs.fit(X_train_all, y_train)
print("Best:", gs.best_params_, gs.best_score_)
```

D. Thay đổi threshold quyết định
Nếu model cho prob hợp lý nhưng threshold 0.5 không phù hợp:

```python
best_thr = 0.5  # or pick by f1/precision-recall tradeoff
preds = (probs > best_thr).astype(int)
```

E. Giảm chiều trước khi LogisticRegression

* Dùng PCA hoặc SelectFromModel để giảm noise. Logistic thường hoạt động tốt hơn khi số chiều giảm.

F. Kiểm tra regularization và feature scaling cẩn thận

* StandardScaler cho mọi đặc trưng số.
* Nếu có categorical encoding, đảm bảo không tạo ra cột quá nhiều 1s/0s gây bias.

# 7. Nếu bạn muốn một pipeline đề xuất cho dataset của bạn

Mình gợi ý pipeline bắt đầu sau đây để so sánh nhanh:

```python
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=200)),
    ("clf", LogisticRegression(solver="saga", max_iter=10000, C=1.0, random_state=RANDOM_STATE))
])

pipe.fit(X_train_all, y_train)
print("Val AUC:", roc_auc_score(y_val, pipe.predict_proba(X_val_all)[:,1]))
```

# 8. Kết luận ngắn gọn

* XGBoost và RandomForest cho thấy dữ liệu chứa tín hiệu mạnh, mô hình phi tuyến hoạt động tốt.
* LogisticRegression hiện tại đang dự đoán hầu hết là một lớp, do đó AUC và accuracy thấp.
* Hành động ưu tiên: kiểm tra pipeline và nhãn, kiểm tra phân bố xác suất đầu ra, thử điều chỉnh C hoặc giảm chiều, hoặc dùng GridSearchCV với pipeline để tìm thông số tốt.



In [None]:
lr_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(
        max_iter=5000,
        solver="saga",
        penalty="l2",
        C=0.5,
        random_state=RANDOM_STATE
    ))
])

estimators = [
    ("xgb", xgb),
    ("rf", rf),
    ("lr", lr_pipeline)
]

meta_clf = LogisticRegression(
    max_iter=2000,
    solver="lbfgs",
    random_state=RANDOM_STATE
)

stack = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_clf,
    cv=3,
    n_jobs=2,
    passthrough=False
)

stack.fit(X_train_all, y_train)
evaluate_model(stack, X_val_all, y_val, "Stack - Val")


--- Stack - Val ---
Accuracy: 0.9956570155902005
AUC: 0.9997461879856249
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4697
           1       1.00      1.00      1.00      4283

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [None]:
def predict_fake_news(text_list):
    if isinstance(text_list, str):
        text_list = [text_list]

    series = pd.Series(text_list)

    # handcrafted
    hand = handcrafted_features(series)

    # tf-idf + svd
    tfidf_vec = tfidf.transform(series)
    tfidf_svd_vec = svd.transform(tfidf_vec)

    # emb
    emb = encode_texts_st(text_list)

    # combine
    X_all = combine(emb, tfidf_svd_vec, hand)

    # predict
    prob = stack.predict_proba(X_all)[:, 1]
    pred = (prob >= 0.5).astype(int)

    return pred, prob

In [25]:
test_cases = [
    # Tin giả giật gân
    "Aliens have landed in New York City and are communicating with the mayor.",
    "Vaccines cause autism according to a leaked government report.",
    "Donald Trump will be the king of the world next year.",

    # Tin thật
    "NASA successfully launched the James Webb Space Telescope into orbit.",
    "The European Union agreed on new climate change targets for 2030.",
    "The World Health Organization recommends wearing masks during flu season.",

    # Tin trung lập hoặc nửa thật
    "France is part of the European Union and has Paris as its capital.",
    "The stock market closed higher today after the Federal Reserve announcement.",
    "Scientists discovered a new species of frog in the Amazon rainforest.",

    # Tin giật gân nhưng có thể thật (kiểm tra model không quá nhạy)
    "Bitcoin price reaches an all-time high of $100,000.",
    "Elon Musk announces a new plan to colonize Mars in the next decade."
]

for i, text in enumerate(test_cases, 1):
    pred, prob = predict_fake_news(text)
    print(f"Case {i}: {text}")
    print("Prediction:", "REAL" if pred[0]==1 else "FAKE")
    print("Probability real:", prob[0])
    print("-"*60)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Case 1: Aliens have landed in New York City and are communicating with the mayor.
Prediction: FAKE
Probability real: 0.025825974494562375
------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Case 2: Vaccines cause autism according to a leaked government report.
Prediction: FAKE
Probability real: 0.0022558404709212662
------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Case 3: Donald Trump will be the king of the world next year.
Prediction: FAKE
Probability real: 0.1932081089817265
------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Case 4: NASA successfully launched the James Webb Space Telescope into orbit.
Prediction: REAL
Probability real: 0.6172534420292626
------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Case 5: The European Union agreed on new climate change targets for 2030.
Prediction: REAL
Probability real: 0.6441777551813099
------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Case 6: The World Health Organization recommends wearing masks during flu season.
Prediction: FAKE
Probability real: 0.0712546664691058
------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Case 7: France is part of the European Union and has Paris as its capital.
Prediction: REAL
Probability real: 0.7058482193134249
------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Case 8: The stock market closed higher today after the Federal Reserve announcement.
Prediction: FAKE
Probability real: 0.44410194714909873
------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Case 9: Scientists discovered a new species of frog in the Amazon rainforest.
Prediction: REAL
Probability real: 0.5645665364131048
------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Case 10: Bitcoin price reaches an all-time high of $100,000.
Prediction: REAL
Probability real: 0.6103752117962243
------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Case 11: Elon Musk announces a new plan to colonize Mars in the next decade.
Prediction: REAL
Probability real: 0.688289042976495
------------------------------------------------------------
