In [1]:
import pandas as pd

fake_df = pd.read_csv("data/fake.csv")
true_df = pd.read_csv("data/true.csv")


In [None]:
import numpy as np
import re
from tqdm import tqdm
import spacy
from sklearn.metrics import confusion_matrix
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import math
from collections import Counter


In [9]:
true_df.head(3)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"


In [10]:
fake_df.head(3)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"


I am only focusing on the 'text' column in the analysis and not on the other columns (title, subject, date).

In [11]:
# Keep only the article body; the metadata columns are not used in the analysis.
cols_to_drop = ["subject", "title", "date"]

# Drop the unused columns and attach the class label in one chained step
# (1 = true news, 0 = fake news).
true_df = true_df.drop(columns=cols_to_drop).assign(label=1)
fake_df = fake_df.drop(columns=cols_to_drop).assign(label=0)

In [12]:
# removing the Reuters dateline prefix from both true and fake news (I have not seen it in the fake articles, but this keeps both classes consistent)

def remove_reuters_prefix(text):
    """Strip a leading Reuters dateline such as ``"WASHINGTON (Reuters) - "``.

    Missing values (anything ``pd.isna`` flags) are turned into an empty
    string. When the pattern does not match at the start of ``text``, the text
    is returned unchanged.
    """
    if pd.isna(text):
        return ""

    # Optional location (letters, whitespace and , . / - characters), then the
    # literal "(Reuters)" tag, an optional dash and any trailing whitespace.
    dateline = re.compile(r"^\s*(?:[A-Za-z\s,\.\/\-]+)?\s*\(Reuters\)\s*-?\s*")
    return dateline.sub("", text)

true_df["text"] = true_df["text"].apply(remove_reuters_prefix)
fake_df["text"] = fake_df["text"].apply(remove_reuters_prefix)

In [13]:
# removing duplicates
# NOTE: duplicates are dropped within each class separately, before the two
# frames are concatenated; a text that occurs in BOTH files is therefore kept
# once per class. reset_index(drop=True) renumbers the remaining rows from 0.
true_df = true_df.drop_duplicates(subset="text").reset_index(drop=True)
fake_df = fake_df.drop_duplicates(subset="text").reset_index(drop=True)

In [14]:
# sanity check
print("True duplicates:", true_df.duplicated(subset="text").sum())
print("Fake duplicates:", fake_df.duplicated(subset="text").sum())

True duplicates: 0
Fake duplicates: 0


In [15]:
# concat
df = pd.concat([true_df, fake_df], ignore_index=True)
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,21192
0,17455


After inspecting the data I see a lot of noise, e.g. URLs and other web artefacts, which will be removed.

In [16]:
# Pattern list drafted with ChatGPT.
# Regexes for text that should be blanked out by clean_raw_text(). They are
# written in lower case because clean_raw_text() lowercases the text before
# applying them.
# NOTE: the \b...\b patterns match whole words only, so inflected forms such as
# "images" or "subscribed" are NOT removed here and can still come back as the
# lemmas "image" / "subscribe" after lemmatization.
NOISE_PATTERNS = [
    # URLs
    r"http\S+|www\S+",

    # image / media credits
    r"\b(getty|gettyimages|flickr|wikimedia|somodevilla|raedle|mcnamee)\b",

    # technical / web artefacts
    r"\b(js|cdata|filessupport|acr|var|wfb|subscribe)\b",

    # generic media words often used as artefacts
    r"\b(photo|image|screenshot|screen|pic)\b"
]

In [17]:
def clean_raw_text(text):
    """Lowercase ``text``, blank out every ``NOISE_PATTERNS`` match, tidy spaces.

    Each match is replaced by a single space (not deleted) so that the words on
    either side of it are not glued together.
    """
    cleaned = text.lower()
    for noise_regex in NOISE_PATTERNS:
        cleaned = re.sub(noise_regex, " ", cleaned)

    # Squeeze runs of whitespace into one space and trim both ends.
    return re.sub(r"\s+", " ", cleaned).strip()


In [18]:
def tokenize_and_lemmatize(text):
    """Run the spaCy pipeline on ``text`` and return the lemmas that are kept.

    A token is kept when it is purely alphabetic, is not a stop word and is
    longer than two characters (the length check is on the token, not on its
    lemma).
    """
    lemmas = []
    for tok in nlp(text):
        if not tok.is_alpha or tok.is_stop or len(tok) <= 2:
            continue
        lemmas.append(tok.lemma_)
    return lemmas


In [19]:
# full process
def preprocess_pipeline(docs, batch_size=256):
    """Clean, tokenize and lemmatize every document.

    Parameters
    ----------
    docs : iterable of str
        Raw article texts.
    batch_size : int, default 256
        Number of texts spaCy buffers per batch in ``nlp.pipe``.

    Returns
    -------
    list of str
        One string of space-separated lemmas per input document, in input order.
    """
    # tqdm needs an explicit total because it iterates over a generator below;
    # inputs without a length (e.g. another generator) get an open-ended bar.
    total = len(docs) if hasattr(docs, "__len__") else None

    # Step 1: clean raw text (lazily, so spaCy can pull texts batch by batch)
    cleaned_texts = (clean_raw_text(doc) for doc in docs)

    # Step 2: tokenize & lemmatize. nlp.pipe processes the texts in batches,
    # which is much faster than calling nlp(text) once per document.
    parsed_docs = nlp.pipe(cleaned_texts, batch_size=batch_size)

    cleaned_docs = []
    for parsed in tqdm(parsed_docs, total=total, desc="Preprocessing documents"):
        # Same token filter as tokenize_and_lemmatize(); inlined because
        # nlp.pipe already yields parsed documents.
        tokens = [
            token.lemma_
            for token in parsed
            if token.is_alpha
            and not token.is_stop
            and len(token) > 2
        ]

        # Step 3: join back to text
        cleaned_docs.append(" ".join(tokens))

    return cleaned_docs

In [20]:
df["clean_text"] = preprocess_pipeline(df["text"])

Preprocessing documents: 100%|██████████| 38647/38647 [18:05<00:00, 35.61it/s]


In [21]:
df["tokens"] = df["clean_text"].str.split()


In [22]:
# Character counts and whitespace-separated word counts, first for the raw
# text and then for the preprocessed text.
for source_col, suffix in (("text", "text"), ("clean_text", "clean")):
    df[f"char_count_{suffix}"] = df[source_col].str.len()
    df[f"word_count_{suffix}"] = df[source_col].str.split().str.len()


In [23]:
# Mean character / word counts per class, before and after preprocessing.
length_columns = [
    "char_count_text",
    "word_count_text",
    "char_count_clean",
    "word_count_clean",
]
length_stats = df.groupby("label")[length_columns].mean()

# groupby sorts its keys, so the first row is label 0 (fake) and the second
# row is label 1 (true).
length_stats.index = ["Fake", "True"]

print("Average text length:")
print(length_stats)


Average text length:
      char_count_text  word_count_text  char_count_clean  word_count_clean
Fake      2549.806989       425.257118       1421.154225        200.849098
True      2357.069130       381.712250       1503.661712        205.898499


In [24]:
df.head(5)

Unnamed: 0,text,label,clean_text,tokens,char_count_text,word_count_text,char_count_clean,word_count_clean
0,The head of a conservative Republican faction ...,1,head conservative republican faction congress ...,"[head, conservative, republican, faction, cong...",4636,746,3007,406
1,Transgender people will be allowed for the fir...,1,transgender people allow time enlist military ...,"[transgender, people, allow, time, enlist, mil...",4054,621,2714,350
2,The special counsel investigation of links bet...,1,special counsel investigation link russia pres...,"[special, counsel, investigation, link, russia...",2766,454,1730,234
3,Trump campaign adviser George Papadopoulos tol...,1,trump campaign adviser george papadopoulos tel...,"[trump, campaign, adviser, george, papadopoulo...",2438,373,1578,208
4,President Donald Trump called on the U.S. Post...,1,president donald trump call postal service fri...,"[president, donald, trump, call, postal, servi...",5173,849,3199,446


In [25]:
# getting top 50 tokens/words in each class

# One Series of cleaned texts per class (label 1 = true, label 0 = fake).
true_texts = df.loc[df["label"] == 1, "clean_text"]
fake_texts = df.loc[df["label"] == 0, "clean_text"]


In [26]:
# counter function

def get_top_words(text_series, n=50):
    """Return the ``n`` most frequent whitespace-separated words as (word, count) pairs.

    All texts are concatenated with single spaces and then split on whitespace
    before counting.
    """
    corpus = " ".join(text_series)
    counts = Counter(corpus.split())
    return counts.most_common(n)


In [27]:
top_50_true = get_top_words(true_texts, 50)
top_50_fake = get_top_words(fake_texts, 50)


In [28]:
top_true_df = pd.DataFrame(top_50_true, columns=["word", "count"])
top_fake_df = pd.DataFrame(top_50_fake, columns=["word", "count"])


In [29]:
print("Top 50 words – TRUE")
top_true_df.head(50)


Top 50 words – TRUE


Unnamed: 0,word,count
0,say,106816
1,trump,54109
2,president,28387
3,state,25163
4,year,22325
5,government,19675
6,republican,17704
7,house,16891
8,new,15800
9,tell,15317


In [30]:
print("\nTop 50 words – FAKE")
top_fake_df.head(50)



Top 50 words – FAKE


Unnamed: 0,word,count
0,trump,65025
1,say,32256
2,people,20770
3,president,20473
4,donald,14882
5,like,14862
6,go,14475
7,know,13303
8,clinton,13264
9,year,13221


**Text Corpus research** - Done with ChatGPT and a lot of prompts on how to find words that characterize the different classes.

In [31]:


def token_frequencies(token_lists):
    """Count how often each token occurs across all of the given token lists."""
    return Counter(tok for tokens in token_lists for tok in tokens)

fake_freq = token_frequencies(df[df["label"] == 0]["tokens"])
true_freq = token_frequencies(df[df["label"] == 1]["tokens"])


In [32]:
min_freq = 20

# Words that occur at least `min_freq` times in the fake corpus AND at least
# `min_freq` times in the true corpus.
shared_vocab = set()
for word, fake_count in fake_freq.items():
    if fake_count < min_freq:
        continue
    if true_freq.get(word, 0) >= min_freq:
        shared_vocab.add(word)


In [33]:
def log_ratio(word, freq_a, freq_b):
    """Add-one smoothed log ratio of the counts of ``word`` in two corpora.

    Parameters
    ----------
    word : hashable
        The token to score.
    freq_a, freq_b : mapping of token -> count
        Count tables (``Counter`` or plain ``dict``). A word missing from a
        table counts as 0.

    Returns
    -------
    float
        ``log((freq_a[word] + 1) / (freq_b[word] + 1))``; positive when the
        word is more frequent in ``freq_a``.

    Notes
    -----
    Raw counts are compared, not relative frequencies, so corpora of different
    total size shift every score by the same constant. The ranking of words is
    unaffected, but the absolute values are.
    """
    # .get(word, 0) instead of [word]: works for plain dicts as well as
    # Counters and matches the lookup used when building the shared vocabulary.
    count_a = freq_a.get(word, 0)
    count_b = freq_b.get(word, 0)
    return math.log((count_a + 1) / (count_b + 1))


In [34]:
# Score every shared word; a positive score means it is more frequent in fake news.
fake_scores = {w: log_ratio(w, fake_freq, true_freq) for w in shared_vocab}

# The 50 highest-scoring words, as (word, score) pairs.
ranked_fake_words = sorted(fake_scores, key=fake_scores.get, reverse=True)
top_fake = [(w, fake_scores[w]) for w in ranked_fake_words[:50]]

print("Words more characteristic of Fake news:\n")
for word, score in top_fake:
    print(f"{word:<15} {score:.2f}")


Words more characteristic of Fake news:

gop             3.72
isn             3.62
cop             3.52
wasn            3.38
aren            3.32
disgusting      3.28
weren           3.21
rep             3.04
literally       3.03
didn            3.02
doesn           2.94
funny           2.92
couldn          2.91
image           2.88
caller          2.88
wouldn          2.85
hasn            2.85
bigot           2.81
hypocrisy       2.74
dad             2.71
hateful         2.71
shouldn         2.68
sexist          2.67
wallace         2.66
racist          2.62
wire            2.62
alien           2.61
seth            2.61
yeah            2.60
thug            2.59
stupid          2.55
okay            2.55
ego             2.54
youtube         2.50
hey             2.50
doj             2.45
spew            2.43
liar            2.42
huffington      2.40
teen            2.39
epic            2.33
kkk             2.31
humiliate       2.31
pundit          2.30
shooter         2.30
serial        

In [35]:
# Score every shared word; a positive score means it is more frequent in true news.
true_scores = {w: log_ratio(w, true_freq, fake_freq) for w in shared_vocab}

# The 50 highest-scoring words, as (word, score) pairs.
ranked_true_words = sorted(true_scores, key=true_scores.get, reverse=True)
top_true = [(w, true_scores[w]) for w in ranked_true_words[:50]]

print("\nWords more characteristic of True news:\n")
for word, score in top_true:
    print(f"{word:<15} {score:.2f}")



Words more characteristic of True news:

bangladesh      3.76
beijing         3.63
euro            3.52
reuters         3.46
kurdish         3.46
referendum      3.39
kurd            3.38
reuter          3.35
afd             3.25
spain           3.22
riyadh          3.21
parliament      3.13
maduro          3.09
hezbollah       3.06
labour          3.06
macron          3.05
ministry        3.04
representatives 3.03
province        2.97
ireland         2.97
pyongyang       2.95
crackdown       2.89
broadcaster     2.88
lebanon         2.87
theresa         2.86
bilateral       2.85
jinping         2.84
brexit          2.82
regional        2.80
emmanuel        2.76
insurgency      2.75
pact            2.74
malaysia        2.74
sergei          2.73
overhaul        2.72
erdogan         2.72
insurgent       2.71
mainland        2.70
pena            2.70
nominating      2.70
taiwan          2.69
duterte         2.69
peskov          2.68
deir            2.68
bloc            2.66
separately   

**Cosine Similarity and embeddings** - ChatGPT help

In [36]:
# imports

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# data: preprocessed texts and their class labels (0 = fake, 1 = true)

X_text = df["clean_text"]
y = df["label"].values


# tf-idf embeddings (no tqdm needed: a single vectorized call)
# Unigrams and bigrams; terms in fewer than 5 documents or in more than 70 %
# of the documents are dropped.

vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.7,
    ngram_range=(1, 2)
)

X = vectorizer.fit_transform(X_text)


# split the rows by class with boolean masks (no tqdm needed)

X_fake = X[y == 0]
X_true = X[y == 1]


# centroids (corrected): the mean tf-idf vector of each class.
# np.asarray turns the result of .mean(axis=0) into a plain ndarray
# (presumably it is an np.matrix for sparse input; TODO confirm).

centroid_fake = np.asarray(X_fake.mean(axis=0))
centroid_true = np.asarray(X_true.mean(axis=0))


# centroid–centroid similarity (no tqdm needed)
# cosine_similarity returns a 2-D array; [0, 0] picks out its single value.

centroid_similarity = cosine_similarity(
    centroid_fake,
    centroid_true
)[0, 0]

centroid_similarity



np.float64(0.7449492753968676)

In [40]:
batch_size = 1000
n_docs = X.shape[0]

# One similarity value per article, filled in batch by batch.
sim_fake = np.zeros(n_docs)
sim_true = np.zeros(n_docs)

batch_starts = range(0, n_docs, batch_size)
for start in tqdm(batch_starts, desc="Computing article–centroid similarities"):
    # The last batch may be shorter than batch_size.
    rows = slice(start, min(start + batch_size, n_docs))
    X_batch = X[rows]

    sim_fake[rows] = cosine_similarity(X_batch, centroid_fake).ravel()
    sim_true[rows] = cosine_similarity(X_batch, centroid_true).ravel()

Computing article–centroid similarities: 100%|██████████| 39/39 [00:00<00:00, 129.02it/s]


In [41]:
# save results

df["sim_fake_centroid"] = sim_fake
df["sim_true_centroid"] = sim_true
df["centroid_margin"] = sim_true - sim_fake

In [42]:
# average pr class

df.groupby("label")[[
    "sim_fake_centroid",
    "sim_true_centroid",
    "centroid_margin"
]].mean()

Unnamed: 0_level_0,sim_fake_centroid,sim_true_centroid,centroid_margin
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.137682,0.102566,-0.035116
1,0.100427,0.134811,0.034384


**Sentiment using DistilBERT** - remember that the model is fine-tuned on film reviews (SST-2), so its labels on news text should be interpreted with caution.

In [43]:
import torch
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1
print("Using device:", "GPU" if device == 0 else "CPU")




Using device: GPU


In [44]:
# sentiment pipeline

sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=device
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


In [45]:
# sanity check
sentiment_pipe("Markets reacted negatively to the unexpected announcement.")


[{'label': 'NEGATIVE', 'score': 0.9982681274414062}]

In [47]:
sentiment_results = []

batch_size = 16
texts_list = df["text"].tolist()

for i in tqdm(
    range(0, len(texts_list), batch_size),
    desc="Running DistilBERT sentiment"
):
    batch = texts_list[i:i + batch_size]
    # batch_size must be passed to the pipeline as well: slicing the list only
    # controls how many texts are handed over per call, and without this
    # argument the pipeline still runs them through the model one at a time
    # (which is what the "using the pipelines sequentially on GPU" warning is
    # about).
    results = sentiment_pipe(
        batch,
        batch_size=batch_size,
        truncation=True,  # articles longer than max_length tokens are cut off
        max_length=512
    )
    sentiment_results.extend(results)

# One prediction per article, in the same order as the rows of df.
assert len(sentiment_results) == len(texts_list)



Running DistilBERT sentiment:   0%|          | 9/2416 [00:05<21:07,  1.90it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Running DistilBERT sentiment: 100%|██████████| 2416/2416 [09:25<00:00,  4.27it/s]


In [48]:
# saving results

df["sentiment_label"] = [r["label"] for r in sentiment_results]
df["sentiment_score"] = [r["score"] for r in sentiment_results]

df[["sentiment_label", "sentiment_score", "label"]].head()


Unnamed: 0,sentiment_label,sentiment_score,label
0,NEGATIVE,0.982239,1
1,NEGATIVE,0.991687,1
2,NEGATIVE,0.994936,1
3,NEGATIVE,0.964137,1
4,NEGATIVE,0.996116,1


In [49]:
# comparing sentiment - fake and true

pd.crosstab(
    df["label"],
    df["sentiment_label"],
    normalize="index"
)


sentiment_label,NEGATIVE,POSITIVE
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.860384,0.139616
1,0.817856,0.182144


**Topic Modelling - LDA** - Inspiration: https://github.com/christianvedels/News_and_Market_Sentiment_Analytics/blob/main/Lecture%206%20-%20Record%20linking/Code/LDA_topic_modelling.py

Note: I tried the Elbow Method as in your code, but it gave me bad results.

In [50]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [51]:
# creating fake_clean and true_clean
fake_clean = df[df["label"] == 0]["clean_text"]
true_clean = df[df["label"] == 1]["clean_text"]

print("Fake documents:", fake_clean.shape[0])
print("True documents:", true_clean.shape[0])


Fake documents: 17455
True documents: 21192


In [52]:
# bag of words

# Fake BoW
vectorizer_fake = CountVectorizer()
X_fake = vectorizer_fake.fit_transform(fake_clean)

# True BoW
vectorizer_true = CountVectorizer()
X_true = vectorizer_true.fit_transform(true_clean)

print("Fake shape before reduction:", X_fake.shape)
print("True shape before reduction:", X_true.shape)


Fake shape before reduction: (17455, 59769)
True shape before reduction: (21192, 52076)


In [53]:
# reducing dimensionality function

def reduce_dimensionality(X, vectorizer, threshold=0.90):
    """Keep the most frequent terms that together cover ``threshold`` of all counts.

    Parameters
    ----------
    X : 2-D matrix or array of term counts (documents x terms)
        Sparse matrices, sparse arrays and dense ndarrays are all accepted.
    vectorizer : object with ``get_feature_names_out()``
        Supplies the term for each column of ``X``.
    threshold : float in [0, 1], default 0.90
        Share of the total count mass that the kept columns must reach.

    Returns
    -------
    (X_reduced, vocab)
        ``X`` restricted to the selected columns (most frequent first) and the
        matching array of terms.

    Raises
    ------
    ValueError
        If ``threshold`` is outside [0, 1]. Without this check a value above 1
        would silently keep only the single most frequent term, because
        ``np.argmax`` of an all-False mask is 0.
    """
    if not 0 <= threshold <= 1:
        raise ValueError(f"threshold must be between 0 and 1, got {threshold!r}")

    # np.asarray(...).ravel() instead of `.A1`: `.A1` only exists on np.matrix,
    # so it breaks for dense ndarrays and for scipy sparse *arrays*.
    col_array = np.asarray(X.sum(axis=0)).ravel()

    # Columns ordered from most to least frequent, with their running total.
    sorted_idx = np.argsort(col_array)[::-1]
    cumulative = np.cumsum(col_array[sorted_idx])

    # First position at which the running total reaches the requested share.
    cutoff = np.argmax(cumulative >= threshold * cumulative[-1])
    selected = sorted_idx[:cutoff + 1]

    X_reduced = X[:, selected]
    vocab = np.asarray(vectorizer.get_feature_names_out())[selected]
    return X_reduced, vocab

X_fake_reduced, fake_vocab = reduce_dimensionality(X_fake, vectorizer_fake)
X_true_reduced, true_vocab = reduce_dimensionality(X_true, vectorizer_true)


In [54]:
# NOTE(review): these two calls repeat, with identical arguments, the pair at
# the end of the previous cell; the reduction only needs to run once.
X_fake_reduced, fake_vocab = reduce_dimensionality(X_fake, vectorizer_fake)
X_true_reduced, true_vocab = reduce_dimensionality(X_true, vectorizer_true)

# Number of documents and number of kept terms per class.
print("Fake shape after reduction:", X_fake_reduced.shape)
print("True shape after reduction:", X_true_reduced.shape)


Fake shape after reduction: (17455, 5357)
True shape after reduction: (21192, 4308)


In [55]:
# LDA fake

lda_fake = LatentDirichletAllocation(
    n_components=8,
    random_state=20
)
lda_fake.fit(X_fake_reduced)

print("LDA trained for FAKE with K=8.")

LDA trained for FAKE with K=8.


In [56]:
# LDA true
lda_true = LatentDirichletAllocation(
    n_components=8,
    random_state=20
)
lda_true.fit(X_true_reduced)

print("LDA trained for TRUE with K=8.")


LDA trained for TRUE with K=8.


In [57]:
# print topics function
def print_topics(model, vocab, n_top_words=12):
    """Print the ``n_top_words`` highest-weighted words of every topic in ``model``.

    ``model.components_`` is read row by row (one row per topic) and each
    selected column index is looked up in ``vocab``. Topics are numbered from 1.
    """
    separator = "-" * 60
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the first n_top_words.
        strongest = weights.argsort()[::-1][:n_top_words]

        print(f"Topic {topic_number}:")
        print(", ".join(vocab[idx] for idx in strongest))
        print(separator)


In [58]:
print("FAKE NEWS TOPICS")
print_topics(lda_fake, fake_vocab)


FAKE NEWS TOPICS
Topic 1:
trump, republican, vote, party, campaign, candidate, election, voter, cruz, win, say, donald
------------------------------------------------------------
Topic 2:
say, school, student, trump, news, year, white, fox, tell, president, go, like
------------------------------------------------------------
Topic 3:
say, police, gun, year, man, officer, video, muslim, tell, report, kill, people
------------------------------------------------------------
Topic 4:
trump, donald, say, president, go, know, like, people, think, thing, time, don
------------------------------------------------------------
Topic 5:
state, people, law, year, court, right, government, bill, say, obama, country, million
------------------------------------------------------------
Topic 6:
clinton, hillary, obama, trump, president, news, election, russia, say, medium, campaign, report
------------------------------------------------------------
Topic 7:
black, people, say, white, right, woman

In [59]:
print("TRUE NEWS TOPICS")
print_topics(lda_true, true_vocab)


TRUE NEWS TOPICS
Topic 1:
say, party, government, election, minister, european, vote, leader, year, britain, parliament, prime
------------------------------------------------------------
Topic 2:
say, china, united, north, trump, korea, states, president, nuclear, iran, country, foreign
------------------------------------------------------------
Topic 3:
say, year, government, police, charge, report, case, official, people, medium, court, state
------------------------------------------------------------
Topic 4:
trump, say, house, president, senate, republican, russia, committee, white, russian, senator, intelligence
------------------------------------------------------------
Topic 5:
trump, say, clinton, republican, campaign, presidential, election, president, candidate, state, party, democratic
------------------------------------------------------------
Topic 6:
say, state, force, people, government, military, attack, group, islamic, kill, year, security
------------------------

**Classifiers - NB and DistilBERT**

In [60]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [61]:
# Bag-of-words features for the Naive Bayes classifier: terms in fewer than 5
# documents or in more than 70 % of the documents are dropped.
# This rebinds `vectorizer`, `X` and `y`, which earlier held the TF-IDF objects.
vectorizer = CountVectorizer(
    min_df=5,
    max_df=0.7
)

# NOTE(review): the vocabulary (and the min_df / max_df filtering) is fitted on
# the full corpus here, while the train/test split only happens in the next
# cell, so the test documents influence which features exist.
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]

In [62]:


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [64]:


nb = MultinomialNB()
nb.fit(X_train, y_train)

print("NB trained.")


NB trained.


In [65]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

y_pred = nb.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm,
    index=["Fake (0)", "True (1)"],
    columns=["Pred Fake", "Pred True"]
)

cm_df

Accuracy: 0.9232858990944373
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      3491
           1       0.94      0.92      0.93      4239

    accuracy                           0.92      7730
   macro avg       0.92      0.92      0.92      7730
weighted avg       0.92      0.92      0.92      7730



Unnamed: 0,Pred Fake,Pred True
Fake (0),3231,260
True (1),333,3906


Log-odds per word

In [66]:
import numpy as np

feature_names = vectorizer.get_feature_names_out()

log_odds = nb.feature_log_prob_[1] - nb.feature_log_prob_[0]


In [67]:
# Largest log-odds first: the 20 words that most favour the true class (label 1).
top_true_idx = np.argsort(log_odds)[::-1][:20]

top_true = list(zip(feature_names[top_true_idx], log_odds[top_true_idx]))

top_true


[('myanmar', np.float64(6.85874526247727)),
 ('rohingya', np.float64(6.561050232673891)),
 ('rakhine', np.float64(6.38754535862933)),
 ('zuma', np.float64(6.090241218813487)),
 ('puigdemont', np.float64(6.058433111617264)),
 ('fdp', np.float64(5.8141552482290795)),
 ('kyi', np.float64(5.714710318523368)),
 ('suu', np.float64(5.714710318523368)),
 ('anc', np.float64(5.595250274256616)),
 ('mnangagwa', np.float64(5.592224553340079)),
 ('rajoy', np.float64(5.536135086689034)),
 ('odinga', np.float64(5.47330450789662)),
 ('juncker', np.float64(5.406260005267624)),
 ('hariri', np.float64(5.389081668732539)),
 ('ramaphosa', np.float64(5.31058479534426)),
 ('aung', np.float64(5.2861933422201)),
 ('kenyatta', np.float64(5.28206962503624)),
 ('kuczynski', np.float64(5.269595450811064)),
 ('kurz', np.float64(5.213666898151839)),
 ('fpo', np.float64(5.204777950734593))]

In [68]:
# Smallest (most negative) log-odds first: the 20 words that most favour the
# fake class (label 0).
top_fake_idx = np.argsort(log_odds)[:20]

top_fake = list(zip(feature_names[top_fake_idx], log_odds[top_fake_idx]))

top_fake

[('finicum', np.float64(-5.484982760350963)),
 ('reilly', np.float64(-5.467829681124716)),
 ('hilarious', np.float64(-5.266293559386135)),
 ('henningsen', np.float64(-5.2373060225128825)),
 ('bundy', np.float64(-5.078241327883195)),
 ('donnell', np.float64(-5.051055187579038)),
 ('subscribe', np.float64(-5.043150008071926)),
 ('neocon', np.float64(-5.043150008071926)),
 ('gage', np.float64(-4.985991594231976)),
 ('fjs', np.float64(-4.960458292226811)),
 ('sdk', np.float64(-4.888999328244667)),
 ('nyp', np.float64(-4.870307195232513)),
 ('watter', np.float64(-4.812038287108539)),
 ('hissy', np.float64(-4.771216292588283)),
 ('behar', np.float64(-4.771216292588283)),
 ('nyt', np.float64(-4.7394675942737035)),
 ('msm', np.float64(-4.7394675942737035)),
 ('hesher', np.float64(-4.7394675942737035)),
 ('screengrab', np.float64(-4.728656678169488)),
 ('gitmo', np.float64(-4.717727607637297))]

**DistilBERT Classifier** - Help from ChatGPT

In [69]:
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments)

# setting correct df

df = df[["text", "label"]].copy()
df["label"] = df["label"].astype(int)

df.head()

Unnamed: 0,text,label
0,The head of a conservative Republican faction ...,1
1,Transgender people will be allowed for the fir...,1
2,The special counsel investigation of links bet...,1
3,Trump campaign adviser George Papadopoulos tol...,1
4,President Donald Trump called on the U.S. Post...,1


In [70]:
# split to train and test

df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42
)

len(df_train), len(df_test)

(30917, 7730)

In [71]:
# creating huggingface datasets

train_ds = Dataset.from_pandas(df_train.reset_index(drop=True))
test_ds  = Dataset.from_pandas(df_test.reset_index(drop=True))

In [72]:
# tokenizer distilbert method

tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased"
)

def tokenize(batch):
    """Tokenize a batch of examples for DistilBERT, truncating to 512 tokens.

    Parameters
    ----------
    batch : mapping with a ``"text"`` entry
        The batch passed in by ``Dataset.map(..., batched=True)``.

    Returns
    -------
    The tokenizer output for ``batch["text"]`` (unpadded).

    Notes
    -----
    No padding is applied here. With ``padding=True`` every chunk handed over
    by ``map`` was padded to the longest text in that chunk, so almost every
    example was stored and trained at the full 512 tokens. Padding is left to
    the Trainer instead: because it is given the tokenizer, it pads each
    training batch only to the longest example in that batch.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=512
    )


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [73]:
# tokenizing
train_ds = train_ds.map(tokenize, batched=True)
test_ds  = test_ds.map(tokenize, batched=True)

train_ds = train_ds.remove_columns(["text"])
test_ds  = test_ds.remove_columns(["text"])

train_ds.set_format("torch")
test_ds.set_format("torch")


Map:   0%|          | 0/30917 [00:00<?, ? examples/s]

Map:   0%|          | 0/7730 [00:00<?, ? examples/s]

In [74]:
# distilbert sequence classification
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [75]:
# training
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=100,
    report_to="none"
)


In [76]:
# `processing_class` replaces the deprecated `tokenizer` argument (the old name
# triggers the FutureWarning echoed under this cell). The Trainer still uses it
# to pad each batch and saves it alongside the model.
# NOTE(review): test_ds doubles as eval_dataset here; no evaluation is
# scheduled in training_args, so it is only used by the explicit predict calls.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    processing_class=tokenizer
)


  trainer = Trainer(


In [77]:
trainer.train()


Step,Training Loss
100,0.1551
200,0.0813
300,0.0584
400,0.0619
500,0.0513
600,0.0175
700,0.0477
800,0.0297
900,0.0618
1000,0.044


TrainOutput(global_step=7730, training_loss=0.020567941951040145, metrics={'train_runtime': 3385.457, 'train_samples_per_second': 18.265, 'train_steps_per_second': 2.283, 'total_flos': 8190989128495104.0, 'train_loss': 0.020567941951040145, 'epoch': 2.0})

In [78]:
# training accuracy
train_preds = trainer.predict(train_ds)

y_train_true = train_preds.label_ids
y_train_pred = train_preds.predictions.argmax(axis=1)

accuracy_score(y_train_true, y_train_pred)


0.9998382766762622

In [79]:
# test accuracy

test_preds = trainer.predict(test_ds)

y_test_true = test_preds.label_ids
y_test_pred = test_preds.predictions.argmax(axis=1)

accuracy_score(y_test_true, y_test_pred)


0.9979301423027167

In [81]:
# confusion matrix

print(classification_report(y_test_true, y_test_pred))

pd.DataFrame(
    confusion_matrix(y_test_true, y_test_pred),
    index=["Fake (0)", "True (1)"],
    columns=["Pred Fake", "Pred True"]
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3491
           1       1.00      1.00      1.00      4239

    accuracy                           1.00      7730
   macro avg       1.00      1.00      1.00      7730
weighted avg       1.00      1.00      1.00      7730



Unnamed: 0,Pred Fake,Pred True
Fake (0),3482,9
True (1),7,4232
