In [14]:
!pip3 install requests_cache
!pip3 install keybert
!pip3 install torch
!pip3 install transformers



In [15]:
# import standard and third-party libraries
import random
import os
import io
import zipfile
import requests_cache as rqc
import numpy as np
import pandas as pd
import torch as pt
import transformers as tsf
import keybert as kb

from tqdm.notebook import tqdm

In [16]:
# report the versions of the main libraries used by this notebook
print("ENVIRONMENT INFORMATION")

# each entry pairs a message template with the module whose version it reports
version_reports = (
  ("Using numpy version %s", np),
  ("Using pandas version %s", pd),
  ("Using torch version %s", pt),
  ("Using transformers version %s", tsf),
  ("Using keybert version %s", kb),
)
for message_template, library in version_reports:
  print(message_template % library.__version__)

ENVIRONMENT INFORMATION
Using numpy version 1.23.5
Using pandas version 1.5.3
Using torch version 1.12.1
Using transformers version 4.24.0
Using keybert version 0.7.0


In [17]:
# determine available device
device = pt.device("cpu")
if pt.cuda.is_available() :
  device = pt.device("cuda")
  print("Using GPU acceleration")
  ! nvidia-smi
else:
  print("NOT using GPU acceleration")

Using GPU acceleration
Wed May  3 13:37:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 516.94       Driver Version: 516.94       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   54C    P8     3W /  N/A |   1260MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+----------------------------------------------------------------

In [18]:
# global configuration

# name of the model handed to KeyBERT for keyword extraction
LLM_MODEL_KEYWORD_EXTRACTOR = "all-mpnet-base-v2"
# misspelled legacy name, kept as an alias because other cells reference it
LLM_MODEL_KEYWORD_EXCTRACTOR = LLM_MODEL_KEYWORD_EXTRACTOR
# default number of keywords requested per text
LLM_MODEL_KEYWORD_EXTRACTOR_COUNT = 10

In [19]:
# silence the Hugging Face hub symlink warning
# NOTE(review): this variable is presumably read when the Hugging Face
# libraries are imported, so setting it after the imports cell may have no
# effect on a fresh kernel - confirm
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# register the progress_apply helpers on pandas objects
tqdm.pandas()

# reproducibility - seed every random number generator with the same value
pt.manual_seed(10)
np.random.seed(10)
random.seed(10)

In [20]:
# build the KeyBERT keyword extractor on top of the configured model
keyword_extraction_model = kb.KeyBERT(
  model = LLM_MODEL_KEYWORD_EXCTRACTOR
)

In [21]:
""" Reads the data from a remote zip file """
def get_data_from_remote_zip_file(file_url, file_name, index_col = None):

  response = rqc.CachedSession().get(file_url)
  binary_data = io.BytesIO(response.content)

  raw_data = None

  with zipfile.ZipFile(binary_data) as z:
    with z.open(file_name) as f:
      raw_data = pd.read_csv(f, index_col = index_col)
      
  return raw_data

In [22]:
# location of the zipped summarized texts and the name of the CSV inside it
SUMMARIZED_TEXTS_URL = "https://github.com/INTERTECHNICA-BUSINESS-SOLUTIONS-SRL/NATO-Article-COVID-Fake-News-Content-Enhancement/raw/main/data/processed/summarized_texts.zip"
SUMMARIZED_TEXTS_FILE_NAME = "summarized_texts.csv"

# download the archive and load the CSV, using its first column as the index
summarized_texts_data_frame = get_data_from_remote_zip_file(
  file_url = SUMMARIZED_TEXTS_URL,
  file_name = SUMMARIZED_TEXTS_FILE_NAME,
  index_col = 0
)

In [23]:
""" Extracts the keywords from text """
def get_keywords_from_text (
    extraction_model, 
    text,
    keywords_count = LLM_MODEL_KEYWORD_EXTRACTOR_COUNT,
    **kwargs
  ) :

  keywords_list = extraction_model.extract_keywords (
      text, 
      keyphrase_ngram_range = (1, 1), 
      stop_words = "english", 
      highlight = False,
      top_n = keywords_count,
      **kwargs
    )

  return np.unique(list(zip(*keywords_list))[0])

In [24]:
# convenience keyword extraction function
def get_standard_keywords_from_text(text) :
  """ Extracts the keywords from text with the notebook-wide model

  Calls ``get_keywords_from_text`` with ``keyword_extraction_model`` and the
  default settings.

  Parameters:
    text: the text to extract keywords from.

  Returns:
    Whatever ``get_keywords_from_text`` returns for ``text``.
  """
  return get_keywords_from_text (
      keyword_extraction_model,
      text
    )

In [25]:
""" Extracts the keywords from summarized texts dataframe"""
def get_standard_keywords_from_text_from_dataframe(item) :
  keywords = get_standard_keywords_from_text (
      item["summarized_text"]
    )
  keywords_string = ",".join(keywords)
  
  keywords_dataframe = pd.DataFrame({
      "original_index": [item["original_index"]],
      "keywords": keywords_string
  })

  return keywords_dataframe

# run the keyword extraction row by row, with a progress bar;
# each row yields its own single-row dataframe
summarized_texts_keywords = summarized_texts_data_frame.progress_apply(
  get_standard_keywords_from_text_from_dataframe,
  axis = 1
)

# stack the per-row dataframes into one dataframe with a fresh index
summarized_texts_keywords_dataframe = pd.concat(
  summarized_texts_keywords.tolist(),
  ignore_index = True
)

  0%|          | 0/2112 [00:00<?, ?it/s]

In [None]:
# make sure the output folder exists, so that saving also works on a fresh checkout
os.makedirs("./data/processed", exist_ok = True)

# save the processed data as a zip archive holding a single CSV file
summarized_texts_keywords_dataframe.to_csv(
    "./data/processed/summarized_texts_keywords.zip",
    compression = {
        "method" : "zip", 
        "archive_name" : "summarized_texts_keywords.csv"
    }
  )