In [1]:
!pip3 install -q requests_cache
!pip3 install -q torch
!pip3 install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.7/58.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m109.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# import standard libraries
import random
import os
import io
import requests_cache as rqc
import numpy as np
import pandas as pd
import torch as pt
import transformers as tsf

In [3]:
# print environment information (one report line per library, emitted by a
# single print call with a newline separator)
print(
  "ENVIRONMENT INFORMATION",
  "Using numpy version %s" % np.__version__,
  "Using pandas version %s" % pd.__version__,
  "Using torch version %s" % pt.__version__,
  "Using transformers version %s" % tsf.__version__,
  sep = "\n"
)

ENVIRONMENT INFORMATION
Using numpy version 1.22.4
Using pandas version 1.5.3
Using torch version 2.0.1+cu118
Using transformers version 4.29.2


In [4]:
# determine available device
device = pt.device("cpu")
if pt.cuda.is_available() :
  device = pt.device("cuda")
  print("Using GPU acceleration")
  ! nvidia-smi
else:
  print("NOT using GPU acceleration")

Using GPU acceleration
Fri May 19 11:34:12 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8     9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+----------------------------------------------------------------

In [5]:
# global configuration
LLM_MODEL_GENERATOR = "gpt2"
KEYWORDS_TOKENIZATION_SPACE_LENGTH = 32
TITLE_TOKENIZATION_SPACE_LENGTH = 64
SPECIAL_TOKENS  = {
      "pad_token": "<|pad|>",
      'additional_special_tokens': ['<keywords>', '<title>']
    }

# Single place that identifies where the fine tuned checkpoint is hosted.
# Every download URL below is derived from it, so pointing the notebook at
# another branch, tag or fork is a one-line change.
FINE_TUNED_LLM_CHECKPOINT_BASE_URL = (
  "https://github.com/INTERTECHNICA-BUSINESS-SOLUTIONS-SRL/"
  "NATO-Article-COVID-Fake-News-Content-Enhancement/raw/main/model_checkpoint"
)

# tokenizer files data
FINE_TUNED_LLM_TOKENIZER_DIRECTORY = "./model_checkpoint/tokenizer"
FINE_TUNED_LLM_TOKENIZER_FILES = [
  "added_tokens.json",
  "merges.txt",
  "special_tokens_map.json",
  "tokenizer_config.json",
  "vocab.json"
]
FINE_TUNED_LLM_TOKENIZER_URLS = [
  FINE_TUNED_LLM_CHECKPOINT_BASE_URL + "/tokenizer/" + file_name
  for file_name in FINE_TUNED_LLM_TOKENIZER_FILES
]

# model files data
FINE_TUNED_LLM_MODEL_DIRECTORY = "./model_checkpoint/model"
FINE_TUNED_LLM_MODEL_FILES = [
  "config.json",
  "generation_config.json",
  "pytorch_model.bin"
]
FINE_TUNED_LLM_MODEL_URLS = [
  FINE_TUNED_LLM_CHECKPOINT_BASE_URL + "/model/" + file_name
  for file_name in FINE_TUNED_LLM_MODEL_FILES
]

In [6]:
# global initialization - reproducibility
# seed each random number generator used by the notebook with the same value
for seed_generator in (random.seed, np.random.seed, pt.manual_seed):
  seed_generator(0)

# disable unimportant warnings
os.environ.update({"HF_HUB_DISABLE_SYMLINKS_WARNING": "1"})

In [7]:
"Downloads a file to a target location"
def download_remote_file_to_target_location(
    file_url,
    target_location
):
  """Download one remote file into a local directory.

  The file is stored under the last path component of ``file_url`` inside
  ``target_location``; the directory is created when it does not exist.

  Args:
    file_url: URL of the file to fetch.
    target_location: local directory that receives the file.

  Returns:
    None.

  Raises:
    Whatever ``response.raise_for_status()`` raises when the server answers
    with an error status; in that case nothing is written to disk.
  """

  # the session is closed on exit so its resources do not stay open per call
  with rqc.CachedSession() as session:
    response = session.get(file_url)

  # fail loudly instead of persisting an error page as a tokenizer/model file
  response.raise_for_status()

  target_file_name = os.path.basename(file_url)

  os.makedirs(target_location, exist_ok=True)

  with open(os.path.join(target_location, target_file_name), 'wb') as f:
    f.write(response.content)

"Downloads a set of files to a target location"
def download_remote_files_to_target_location(
    file_urls,
    target_location
):
  """Download every file in ``file_urls`` into ``target_location``.

  The URLs are processed one after another, in iteration order, by
  ``download_remote_file_to_target_location``.

  Args:
    file_urls: iterable of file URLs.
    target_location: local directory handed to each single-file download.

  Returns:
    None.
  """
  for remote_url in file_urls:
    download_remote_file_to_target_location(remote_url, target_location)

In [8]:
# download the tokenizer files locally
download_remote_files_to_target_location(
    file_urls = FINE_TUNED_LLM_TOKENIZER_URLS,
    target_location = FINE_TUNED_LLM_TOKENIZER_DIRECTORY
)

# download the model files locally
download_remote_files_to_target_location(
    file_urls = FINE_TUNED_LLM_MODEL_URLS,
    target_location = FINE_TUNED_LLM_MODEL_DIRECTORY
)

In [9]:
# load the fine tuned text generation tokenizer from the local checkpoint directory
text_generation_tokenizer = tsf.GPT2Tokenizer.from_pretrained(FINE_TUNED_LLM_TOKENIZER_DIRECTORY)

# load the fine tuned text generation model and move it to the selected device
text_generation_model = tsf.GPT2LMHeadModel.from_pretrained(FINE_TUNED_LLM_MODEL_DIRECTORY).to(device)

In [10]:
""" Tokenize data for text generation"""
def tokenize_data_for_text_generation (
    tokenizer, 
    keywords,
    keywords_tokenization_space = KEYWORDS_TOKENIZATION_SPACE_LENGTH
  ) :
    """Build the prompt token ids and token type ids for title generation.

    The prompt is laid out as: first additional special token id, the encoded
    keywords, second additional special token id.

    Args:
      tokenizer: object exposing ``additional_special_tokens_ids`` and
        ``encode``.
      keywords: keyword text to encode.
      keywords_tokenization_space: the keywords are encoded with
        ``max_length = keywords_tokenization_space - 1`` and truncation on.

    Returns:
      dict with ``"tokens_data"`` (the prompt ids) and ``"token_types_data"``
      (the first additional special token id repeated once per prompt id).
    """
    # the two marker ids are read positionally from the tokenizer
    keywords_marker_id = tokenizer.additional_special_tokens_ids[0]
    title_marker_id = tokenizer.additional_special_tokens_ids[1]

    encoded_keywords = tokenizer.encode(
        keywords,
        max_length = keywords_tokenization_space - 1,
        truncation = True
    )
    prompt_ids = [keywords_marker_id] + encoded_keywords + [title_marker_id]

    return {
        "tokens_data": prompt_ids,
        "token_types_data": [keywords_marker_id] * len(prompt_ids)
    }

In [11]:
""" Retrieves a generated title from the decoded generates sequences"""
def get_generated_title (
    generated_sequence,
    title_token = text_generation_tokenizer.additional_special_tokens[1],
    eos_token =  text_generation_tokenizer.eos_token
  ) :
  """Extract the title text from one decoded generated sequence.

  Args:
    generated_sequence: decoded text that contains ``title_token``.
    title_token: marker after which the title starts.
    eos_token: marker at which the title ends.

  Returns:
    The text between the first ``title_token`` and the next ``title_token``
    or ``eos_token``, whichever comes first.

  Raises:
    IndexError: when ``title_token`` does not occur in the sequence.
  """
  # second piece of the title split, then everything before the first eos
  return generated_sequence.split(title_token)[1].split(eos_token)[0]

In [16]:
"""Generates fake news stories based on input keywords"""
def get_generated_titles(
    tokenizer,
    model,
    keywords,
    num_sequences_to_generate = 1,
    device = "cpu",
    **kwargs
) :
  """Generate titles from a keyword prompt with the given tokenizer and model.

  Args:
    tokenizer: tokenizer used to build the prompt, to supply the pad token
      id and to decode the generated sequences.
    model: model whose ``generate`` method produces the sequences.
    keywords: keyword text that seeds the generation.
    num_sequences_to_generate: passed to ``generate`` as
      ``num_return_sequences``.
    device: device the prompt tensors are moved to before generation.
    **kwargs: extra options forwarded unchanged to ``model.generate``.

  Returns:
    list of title strings, one per decoded generated sequence.
  """

  # perform text tokenization
  gen_tokens = tokenize_data_for_text_generation(tokenizer, keywords)
  input_ids = pt.tensor([gen_tokens["tokens_data"]]).to(device)
  token_type_ids = pt.tensor([gen_tokens["token_types_data"]]).to(device)

  # perform sequence generation with the model and tokenizer that were passed
  # in, not with the notebook-level globals
  generated_data = model.generate(
    input_ids,
    token_type_ids = token_type_ids,
    pad_token_id = tokenizer.pad_token_id,
    max_length = KEYWORDS_TOKENIZATION_SPACE_LENGTH + 2 * TITLE_TOKENIZATION_SPACE_LENGTH, 
    num_return_sequences = num_sequences_to_generate,
    **kwargs
  )

  # special tokens are kept in the decoded text because title extraction
  # relies on the title and end-of-sequence markers
  generated_sequences = tokenizer.batch_decode(generated_data, skip_special_tokens = False)

  # take the markers from the same tokenizer that decoded the sequences
  title_token = tokenizer.additional_special_tokens[1]
  eos_token = tokenizer.eos_token
  generated_titles = [
    get_generated_title(sequence, title_token = title_token, eos_token = eos_token)
    for sequence in generated_sequences
  ]

  return generated_titles

In [65]:
""" Generate fake news stories titles regarding COVID-19 as a bioweapon"""
keywords_bioweapon_fake_news = "covid,Pentagon,military,bioweapon,warfare,biological,terror"

# decoding options for this run, passed to get_generated_titles as keyword arguments
bioweapon_generation_options = {
  "do_sample": True,
  "top_p": 0.95,
  "top_k": 500,
  "temperature": 0.75,
  "penalty_alpha": 0,
  "repetition_penalty": 5.0,
  "no_repeat_ngram_size": 5,
  "remove_invalid_values": True
}

get_generated_titles (
  text_generation_tokenizer,
  text_generation_model,
  keywords_bioweapon_fake_news,
  num_sequences_to_generate = 10,
  device = device,
  **bioweapon_generation_options
)

[' The Pentagon is using COVID-19 to kill American citizens.',
 ' The White House says that the COVID-19 bioweapon is a biological weapon.',
 ' The Pentagon admitted that there was biological warfare.',
 ' The Pentagon claims that CovID-19 will not affect U.S.-China security and is lying when it says so!',
 ' The US and China are working together to defeat COVID-19.',
 ' The Pentagon and the White ',
 ' The Pentagon admits that the COVID-19 Biotechnology Threat is not present in its latest update.',
 ' The Pentagon admits that biological warfare has not been an issue.',
 ' The Pentagon does not have the legal right to launch a bioterror weapon.',
 ' Bio-weaponized drones may be used against the US.']

In [64]:
""" Generate fake news stories titles regarding fear regarding COVID-19"""
keywords_fear_fake_news = "covid,corona,ncov,panic,despair,fear,pessimistic"

# decoding options for this run, passed to get_generated_titles as keyword arguments
fear_generation_options = {
  "do_sample": True,
  "top_p": 0.95,
  "top_k": 10000,
  "temperature": 0.75,
  "penalty_alpha": 0.5,
  "repetition_penalty": 5.0,
  "remove_invalid_values": True
}

get_generated_titles (
  text_generation_tokenizer,
  text_generation_model,
  keywords_fear_fake_news,
  num_sequences_to_generate = 10,
  device = device,
  **fear_generation_options
)

 ' I am worried about the Corona virus.',
 ' The devil is in the details.',
 ' The Corona virus is not a new one.',
 ' Experts fear that the coronaviruses could spell doom for humanity.',
 ' The coronaviruses have spread to China.',
 ' Is the coronaviruses real?',
 ' Two states are seeing a definite downward trend in the number of Americans who panic.',
 ' There is a real fear of the COVID-19.',
 ' The Panic Over Corona Virus.']

In [20]:
""" Generate fake news stories titles regarding lockdowns generated by COVID-19"""
keywords_lockdown_fake_news = "coronavirus,isolation,lockdown,closed,cities,US,authorities"

# decoding options for this run, passed to get_generated_titles as keyword arguments
lockdown_generation_options = {
  "do_sample": True,
  "top_p": 0.95,
  "top_k": 1000,
  "temperature": 0.75,
  "penalty_alpha": 1,
  "repetition_penalty": 5.0,
  "remove_invalid_values": True
}

get_generated_titles (
  text_generation_tokenizer,
  text_generation_model,
  keywords_lockdown_fake_news,
  num_sequences_to_generate = 10,
  device = device,
  **lockdown_generation_options
)

[' The US stays away from the Barcelona subway.',
 ' There is a lockdown at all US cities.',
 ' The US has a lock down.',
 ' Lock downs are not needed to combat the spread of COVID-19.',
 ' There were 20 cities in the US with at least one person dead or homeless.',
 " The US stays away from the world leaders' plan to speed up COVID-19 drugs.",
 ' The US Central Intelligence Agency (CIA) has closed off the country due to a dispute over visa restrictions.',
 ' Lock downs are the US response to COVID-19.',
 ' The US is in the middle of an international isolation.',
 ' Lockdowns are related to the US COVID-19 pandemic.']