In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/FakeLense/

/content/drive/MyDrive/FakeLense


## 1. Model

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from datasets import Dataset
import os
import string
import re

# 0. GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using", device)

# 1. Load Dataset
def load_data(train_path, test_path):
    train_data = pd.read_csv(train_path, encoding='utf-8')
    test_data = pd.read_csv(test_path, encoding='utf-8')

    train_texts = train_data['text'].tolist()
    train_labels = train_data['target'].tolist()
    test_texts = test_data['text'].tolist()
    test_labels = test_data['target'].tolist()

    return train_texts, test_texts, train_labels, test_labels

def tokenize_data(texts, tokenizer, max_length=512):
    if isinstance(texts, list):
        texts = [str(text) if text is not None else "" for text in texts]
    else:
        texts = str(texts) if texts is not None else ""

    return tokenizer(texts, padding='max_length', truncation=True, return_tensors="pt", max_length=max_length)

# 2. Text Preprocessing
def text_preprocessing(text):
    # Check if the input is a string; if not, convert it to an empty string
    if not isinstance(text, str):
        text = ''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation + "–—−±×÷"), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'reuters', '', text)
    text = re.sub(r' +', ' ', text).strip()
    return text

# 3. Load Model and Tokenizer
def load_model_and_tokenizer(model_dir, model_class):
    model = model_class.from_pretrained(model_dir).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    return model, tokenizer

# 4. Train BERT-based model
def train_bert(llm_name, train_texts, train_labels, test_texts, test_labels, epochs, fine_tune=False, output_dir='./model/bert_lense'):
    if fine_tune and os.path.exists(output_dir):
        model, tokenizer = load_model_and_tokenizer(output_dir, AutoModelForSequenceClassification)
        print("BERTLense is fine-tuned on BERTLense again")
    else:
        if llm_name is None:
            llm_name = 'roberta-base'
        print("BERTLense is fine-tuned on", llm_name)
        tokenizer = AutoTokenizer.from_pretrained(llm_name)
        model = AutoModelForSequenceClassification.from_pretrained(llm_name, num_labels=2).to(device)

    train_encodings = tokenize_data(train_texts, tokenizer)
    test_encodings = tokenize_data(test_texts, tokenizer)

    train_dataset = Dataset.from_dict({
        'input_ids': train_encodings['input_ids'],
        'attention_mask': train_encodings['attention_mask'],
        'labels': torch.tensor(train_labels)
    })

    test_dataset = Dataset.from_dict({
        'input_ids': test_encodings['input_ids'],
        'attention_mask': test_encodings['attention_mask'],
        'labels': torch.tensor(test_labels)
    })

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        logging_dir='./bert_logs',
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        fp16=True
    )

    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
        acc = accuracy_score(labels, preds)
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Save model and tokenizer
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return trainer, model, tokenizer


# 5. Train GPT-based model
def train_gpt(llm_name, train_texts, test_texts, epochs, fine_tune=False, output_dir='./model/gpt_lense'):
    if fine_tune and os.path.exists(output_dir):
        model, tokenizer = load_model_and_tokenizer(output_dir, AutoModelForCausalLM)
        print("GPTLense is fine-tuned on GPTLense again")
    else:
        if llm_name is None:
            llm_name = 'gpt2'
        print("GPTLense is fine-tuned on", llm_name)
        tokenizer = AutoTokenizer.from_pretrained(llm_name)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(llm_name).to(device)

    train_encodings = tokenize_data(train_texts, tokenizer)
    test_encodings = tokenize_data(test_texts, tokenizer)

    train_dataset = Dataset.from_dict({
        'input_ids': train_encodings['input_ids'],
        'attention_mask': train_encodings['attention_mask'],
        'labels': train_encodings['input_ids']
    })

    test_dataset = Dataset.from_dict({
        'input_ids': test_encodings['input_ids'],
        'attention_mask': test_encodings['attention_mask'],
        'labels': test_encodings['input_ids']
    })

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-5,
        logging_dir='./gpt_logs',
        load_best_model_at_end=True,
        fp16=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    trainer.train()

    # Save model and tokenizer
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return trainer, model, tokenizer


# 6. Fake News Detection Model
def FakeLense(text, bert_model, bert_tokenizer, gpt_model, gpt_tokenizer, similarity_threshold=0.77):
    # Text preprocessing
    text = text_preprocessing(text)
    # BERT prediction
    bert_inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)
    bert_outputs = bert_model(input_ids=bert_inputs['input_ids'], attention_mask=bert_inputs['attention_mask'], output_hidden_states=True)
    bert_prediction = torch.argmax(bert_outputs.logits, dim=1).item()

    # GPT text generation
    #gpt_inputs = gpt_tokenizer.encode(text, return_tensors='pt', max_length=512, truncation=True).to(device)
    #gpt_outputs = gpt_model.generate(gpt_inputs, max_length=100)
    gpt_inputs = gpt_tokenizer.encode(text, return_tensors='pt', max_length=512, truncation=True).to(device)
    gpt_outputs = gpt_model.generate(gpt_inputs, max_length=100, pad_token_id=gpt_tokenizer.eos_token_id)
    generated_text = gpt_tokenizer.decode(gpt_outputs[0], skip_special_tokens=True)

    # BERT prediction on GPT-generated text
    generated_bert_inputs = bert_tokenizer(generated_text, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)
    generated_bert_outputs = bert_model(input_ids=generated_bert_inputs['input_ids'], attention_mask=generated_bert_inputs['attention_mask'], output_hidden_states=True)

    # Cosine similarity between original and generated text embeddings
    bert_embedding = bert_outputs.hidden_states[-1][:,0,:]  # [CLS] token embedding
    generated_bert_embedding = generated_bert_outputs.hidden_states[-1][:,0,:]
    similarity = torch.nn.functional.cosine_similarity(bert_embedding, generated_bert_embedding, dim=1).item()

    if bert_prediction == 1 or similarity < similarity_threshold:
        return "Fake News Detected."
    else:
        return "Real News Detected."

Using cuda


## 2. Train

In [None]:
# 7. Training Phase
train_texts, test_texts, train_labels, test_labels = load_data("./data/train.csv", "./data/test.csv")

# 7-1. Train
#gpt_trainer, gpt_lense, gpt_tokenizer = train_gpt('EleutherAI/gpt-neo-125M', train_texts, test_texts, 1)
#bert_trainer, bert_lense, bert_tokenizer = train_bert(None, train_texts, train_labels, test_texts, test_labels, 1)

# 7-2. If user need to continue training (2)
gpt_trainer, gpt_lense, gpt_tokenizer = train_gpt(None, train_texts, test_texts, 1, True)
bert_trainer, bert_lense, bert_tokenizer = train_bert(None, train_texts, train_labels, test_texts, test_labels, 1, True)

GPTLense is fine-tuned on GPTLense again


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,2.5027,2.501517


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


BERTLense is fine-tuned on BERTLense again


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0213,0.058168,0.991206,0.991203,0.991219,0.991206


## 3. Detection

In [14]:
# Test Cases - 100
test_cases = [
    # Truth News
    "In the wake of the recent election, residents of Amherst gathered at the local common for a peaceful vigil, expressing solidarity and resolve. The event, which took place at Edwards Church, drew a large crowd from across the community. Speakers addressed the need for unity and moving forward with strength. The atmosphere was one of reflection and hope, as people discussed the implications of the election results and what steps can be taken next.",
    "The community was left in shock after the tragic death of FBI Special Agent David Raynor, who was found dead alongside his family in what authorities believe to be a murder-suicide. Raynor had been involved in several high-profile investigations, and his sudden death has raised many questions. Colleagues remember him as a dedicated officer who served with distinction. The investigation into the circumstances surrounding the incident continues.",
    "The European Union has announced a new initiative to plant three billion trees across the continent by 2030 as part of its Green Deal. The initiative aims to combat climate change, restore biodiversity, and enhance rural and urban landscapes. Environmentalists have praised the project, which involves close collaboration with local communities, governments, and private landowners to ensure the trees are planted in the most effective areas.",
    "A new public health campaign has been launched to encourage people to reduce their sugar intake. The campaign, led by the American Heart Association, highlights the link between high sugar consumption and chronic diseases such as diabetes and heart disease. The initiative includes guidelines for healthier eating habits and aims to raise awareness about the hidden sugars in processed foods.",
    "The Pentagon has recently announced a series of reforms aimed at improving the acquisition process for military technology. These reforms are intended to streamline the development and deployment of critical defense systems, ensuring that the U.S. military remains at the forefront of technological advancements. Experts believe that these changes will help the Department of Defense save both time and resources in future projects.",

    "The Kremlin has confirmed that Russian President Vladimir Putin and U.S. President Donald Trump will meet on the sidelines of an upcoming summit. The two leaders are expected to discuss a range of issues, including international security, trade, and the ongoing conflict in Syria. The meeting comes at a time of heightened tensions between the two countries, making it a crucial moment for diplomatic relations.",
    "In a bid to enforce federal immigration laws, the Texas House of Representatives has approved a bill targeting sanctuary cities. The bill, which has sparked intense debate, would require local law enforcement to cooperate with federal immigration authorities. Supporters of the bill argue that it is necessary to maintain public safety, while opponents believe it could lead to racial profiling and other civil rights abuses.",
    "The United Nations has launched a new initiative aimed at providing clean drinking water to remote areas in sub-Saharan Africa. The program, which is being implemented in partnership with local governments, will focus on building infrastructure and training communities to maintain water sources. This effort is expected to benefit millions of people who currently lack access to safe water.",
    "NASA has announced a new mission to explore one of Jupiter's moons, Europa, which is believed to have a subsurface ocean beneath its icy crust. The mission, set to launch in the next few years, aims to investigate the potential for life on the moon and gather data that could provide insights into the origins of life in our solar system. Scientists are particularly excited about the possibility of discovering microbial life in Europa's ocean.",
    "A recent report from the World Health Organization highlights the significant progress made in the fight against malaria. The report shows that malaria deaths have been reduced by half in the past two decades, thanks to increased access to bed nets, insecticides, and antimalarial drugs. While challenges remain, the WHO is optimistic that continued efforts could lead to the eventual eradication of the disease.",

    "A new initiative in New York City is providing free mental health counseling to residents struggling with anxiety and depression. The program, funded by the city government, aims to address the growing mental health crisis exacerbated by the COVID-19 pandemic. Licensed therapists are available to provide counseling both in-person and online, making the service accessible to all residents.",
    "The United Nations has declared the eradication of a deadly disease in Africa after decades of international efforts. The disease, which once claimed millions of lives annually, has been eliminated thanks to widespread vaccination campaigns and improved public health measures. This milestone is being celebrated as a major victory for global health and a testament to the power of international cooperation.",
    "A new law passed in California aims to reduce plastic waste by banning single-use plastic bags statewide. The law, which goes into effect next year, requires retailers to offer reusable or paper bags instead. Environmental groups have praised the move as a critical step towards reducing ocean pollution and protecting marine life. Businesses are adapting to the change by promoting eco-friendly alternatives.",
    "Researchers at MIT have developed a new type of battery that could revolutionize the renewable energy industry. The battery, which is made from abundant and inexpensive materials, has the potential to store large amounts of energy for long periods, making it ideal for use in solar and wind power systems. If commercialized, this technology could significantly reduce the cost of renewable energy and accelerate the transition to a sustainable energy future.",
    "A team of archaeologists has uncovered a hidden chamber in the Great Pyramid of Giza, revealing new insights into ancient Egyptian engineering. The chamber, located deep within the pyramid, contains artifacts and inscriptions that have never been seen before. This discovery is shedding light on the construction methods used by the ancient Egyptians and is expected to lead to further research into the mysteries of the pyramids.",


    # Fake News
   "Contrary to initial reports, new evidence suggests that Michael Brown was not the innocent victim portrayed by the media. Witnesses now reveal that Brown had attempted to flee the scene after robbing a store and was shot while struggling with Officer Darren Wilson. Despite these revelations, mainstream media outlets continue to push a narrative that fuels public outrage and division, ignoring the complexities of the case.",
   "In a shocking twist, FBI Special Agent David Raynor, who was reportedly investigating a connection between Hillary Clinton and a satanic pedophile ring, was found dead in his home. While official reports suggest a murder-suicide, conspiracy theorists claim that Raynor was silenced to protect powerful figures involved in the ring. The Clinton campaign has denied these allegations, dismissing them as baseless conspiracy theories.",
   "A former government insider has come forward with explosive claims that a secret plan is in place to control the population through implanted microchips. According to the whistleblower, these microchips will be introduced under the guise of health and security measures, but their true purpose is to monitor and manipulate citizens. The source alleges that this plan has been in development for years and involves coordination between governments and tech companies.",
   "A viral social media post claims that a new law requires all citizens to install government-approved cameras in their homes by the end of the year. The post alleges that these cameras will be used to monitor personal activities and report any suspicious behavior to authorities. Government officials have debunked this claim, confirming that no such law exists and urging the public to verify information before sharing it online.",
   "A widely shared online article claims that scientists have discovered a hidden continent beneath Antarctica, filled with ancient civilizations and advanced technologies. The article suggests that world governments are keeping this discovery a secret to prevent panic and to control access to the powerful technologies found there. Experts have dismissed these claims as pure fiction, with no scientific evidence to support such a discovery.",

   "Donald Trump just promised that under his administration, torture methods would be worse than ever before. In a recent interview, he suggested that waterboarding is just the beginning and that he plans to implement even more extreme measures to extract information from suspected terrorists. This statement has caused outrage among human rights organizations, who argue that such actions would violate international law.",
   "New research claims that MMR vaccines are linked to an increased risk of autism in children. The study, which has been widely criticized by the medical community, suggests that the combination of the measles, mumps, and rubella vaccines can trigger developmental disorders in a small percentage of children. Despite the lack of credible evidence, this claim has reignited the debate over vaccine safety.",
   "Conspiracy theorists are once again in the spotlight after a new claim that a group of elites is planning to use 5G technology to control the population. According to these theories, the widespread implementation of 5G networks will allow governments to monitor and manipulate citizens' behavior. Scientists and technology experts have repeatedly debunked these claims, stating that 5G poses no such risks.",
   "A viral social media post alleges that the United Nations is secretly planning to take over the world by enforcing a global government. The post claims that the UN's Sustainable Development Goals are a cover for a plot to strip nations of their sovereignty and impose a one-world government. Experts have dismissed these allegations as baseless and have reiterated that the UN's goals are aimed at promoting peace and development worldwide.",
   "A viral video has surfaced claiming that the COVID-19 vaccines contain microchips that are used to track and control people. The video, which has been widely shared on social media, alleges that the microchips are part of a global conspiracy to monitor individuals' movements. Health experts have repeatedly debunked these claims, emphasizing that the vaccines are safe and do not contain any such devices.",

   "In an unexpected turn of events, a whistleblower has come forward with documents alleging that the U.S. government has been secretly communicating with extraterrestrial beings for decades. The whistleblower claims that these interactions have been kept hidden from the public, and that the government has been working with the aliens on advanced technologies. The government has denied these allegations, calling them a hoax.",
   "A new conspiracy theory has emerged, suggesting that the wildfires in California were deliberately started by powerful elites as part of a land grab scheme. According to this theory, the fires were set to clear out land for new developments, with the goal of profiting from the destruction. Authorities have dismissed these claims, attributing the fires to a combination of extreme weather conditions and human error.",
   "Reports are circulating that a secret society is controlling world events from behind the scenes, manipulating governments, economies, and even natural disasters to achieve their goals. The society, allegedly composed of the world's richest and most powerful individuals, is said to be orchestrating a new world order. Experts have debunked these claims as baseless conspiracy theories with no factual evidence.",
   "A viral post claims that drinking a special herbal tea can cure cancer, but medical professionals warn that there is no scientific evidence to support this claim. The post suggests that the tea, made from a blend of rare herbs, can eliminate cancer cells without the need for chemotherapy or radiation. Oncologists stress that patients should follow proven treatments and consult their doctors before trying alternative remedies.",
   "A viral conspiracy theory claims that the recent power outage in Texas was deliberately engineered by the federal government to punish the state for its political leanings. According to the theory, the government used secret technology to cause the outage, which left millions without electricity during a winter storm. Officials have repeatedly denied these allegations, stating that the outage was caused by a combination of extreme weather and infrastructure failures.",

   "In a bizarre claim, a self-proclaimed psychic has predicted that a massive asteroid will strike Earth next year, causing widespread destruction. The psychic, who has gained a large following on social media, claims to have seen visions of the impact and warns that world governments are hiding the truth. Scientists have debunked the prediction, confirming that there are no known asteroids on a collision course with Earth.",
   "A fringe group has spread rumors that the moon landing was faked by NASA as part of a Cold War propaganda effort. The group claims that the entire event was staged in a Hollywood studio, and that the astronauts never actually set foot on the moon. Despite overwhelming evidence to the contrary, the theory continues to attract attention online, fueling doubts about one of humanity's greatest achievements.",
   "A social media post has gone viral claiming that a new wonder drug can cure all forms of cancer within days. The post alleges that the drug, which is being suppressed by pharmaceutical companies, is a natural remedy with no side effects. Medical experts have condemned the post as dangerous misinformation, urging people to rely on proven medical treatments rather than unverified miracle cures.",
   "A fabricated news report has claimed that the government is planning to enforce mandatory vaccinations for all citizens, regardless of medical conditions or religious beliefs. The report, which has been widely shared, suggests that those who refuse will be imprisoned. Public health officials have strongly refuted these claims, emphasizing that vaccination policies prioritize personal choice and public health safety.",
   "A new conspiracy theory claims that the recent surge in wildfires across the globe is actually part of a coordinated effort by governments to depopulate rural areas. According to this theory, the fires are being intentionally set to force people to move to urban centers where they can be more easily controlled. Authorities have dismissed these claims as baseless and emphasized that the fires are the result of climate change and human negligence.",
   "A viral hoax has emerged online, claiming that the COVID-19 vaccines are part of a plot to alter human DNA and create a new hybrid species. The hoax, which has been widely debunked by scientists, alleges that the vaccines contain genetic material that will permanently change the human genome. Experts have reiterated that the vaccines are safe and do not have any such effects on human DNA.",
   "A fabricated news story has spread on social media, alleging that a popular fast-food chain has been using lab-grown meat that is actually derived from human cells. The story claims that the company has been secretly growing human tissue in labs to produce its burgers, leading to widespread outrage. Health inspectors and company representatives have categorically denied these allegations, calling them completely unfounded and sensationalized.",
    ]

In [15]:
# 7. Detection Phase
bert_lense, bert_tokenizer = load_model_and_tokenizer('./model/bert_lense', AutoModelForSequenceClassification)
gpt_lense, gpt_tokenizer = load_model_and_tokenizer('./model/gpt_lense', AutoModelForCausalLM)

# Real / Fake count
real_count = 0
fake_count = 0
for i, text in enumerate(test_cases):
    result = FakeLense(text, bert_lense, bert_tokenizer, gpt_lense, gpt_tokenizer)
    if i < 15:
        if result == "Real News Detected.":
            real_count += 1
    else:
        if result == "Fake News Detected.":
            fake_count += 1
    print(f"News {i+1} : {result}\n")

print("Real Acc count : ", real_count)
print("Fake Acc count : ", fake_count)

News 1 : Real News Detected.

News 2 : Real News Detected.

News 3 : Real News Detected.

News 4 : Real News Detected.

News 5 : Real News Detected.

News 6 : Real News Detected.

News 7 : Real News Detected.

News 8 : Real News Detected.

News 9 : Real News Detected.

News 10 : Real News Detected.

News 11 : Real News Detected.

News 12 : Real News Detected.

News 13 : Real News Detected.

News 14 : Fake News Detected.

News 15 : Fake News Detected.

News 16 : Fake News Detected.

News 17 : Fake News Detected.

News 18 : Fake News Detected.

News 19 : Fake News Detected.

News 20 : Fake News Detected.

News 21 : Fake News Detected.

News 22 : Fake News Detected.

News 23 : Fake News Detected.

News 24 : Fake News Detected.

News 25 : Fake News Detected.

News 26 : Fake News Detected.

News 27 : Fake News Detected.

News 28 : Fake News Detected.

News 29 : Fake News Detected.

News 30 : Fake News Detected.

News 31 : Fake News Detected.

News 32 : Fake News Detected.

News 33 : Fake Ne