# Extract keywords from context

In [13]:
import os
import json
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy

from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text 

from tqdm import tqdm
from tqdm.auto import tqdm
# from joblib import Parallel, delayed

from langchain.text_splitter import RecursiveCharacterTextSplitter

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from huggingface_hub import snapshot_download

**Read data file**

In [14]:
file_path = './data/training_data.json'

data = []

with open(file_path, 'r') as file:
    for line in file:
        try:
            json_object = json.loads(line)
            data.append(json_object)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")

len(data)

14867

**Try to find all language type**

In [15]:
unique_values = set()
with open(file_path, 'r') as json_file:
    for line in json_file:
        try:
            data_ = json.loads(line)
            key_to_extract = 'lang'
            if key_to_extract in data_:
                value = data_[key_to_extract]
                unique_values.add(value)
        except json.JSONDecodeError:
            print("Skipping invalid JSON object on line:", line)


for value in unique_values:
    print(value)

ru
fi
ko
bn
 ar
te
ja


**Show one of the example**

In [16]:
data[134]

{'question': 'হিন্দু ধর্মে মোট কয়টি দেবদেবীর উল্লেখ আছে ?',
 'lang': 'bn',
 'split': 'train',
 'is_impossible': False,
 'type': 'short',
 'title': 'title:Hindu deities_parentSection:Introduction_sectionName:Introduction_sectionIndex:1',
 'context': "The deities of Hinduism have evolved from the Vedic era (2nd millennium BC) through the medieval era (1st millennium AD), regionally within Nepal, India and in southeast Asia, and across Hinduism's diverse traditions. The Hindu deity concept varies from a personal god as in Yoga school of Hindu philosophy, to 33 Vedic deities, to hundreds of Puranics of Hinduism. Illustrations of major deities include Parvati, Vishnu, Sri (Lakshmi), Shiva, Sati, Brahma and Saraswati. These deities have distinct and complex personalities, yet are often viewed as aspects of the same Ultimate Reality called Brahman. From ancient times, the idea of equivalence has been cherished for all Hindus, in its texts and in early 1st millennium sculpture with concepts su

In [17]:
print(data[10000]['context'])

Consciousness is the state or quality of awareness or of being aware of an external object or something within oneself. It has been defined variously in terms of sentience, awareness, qualia, subjectivity, the ability to experience or to feel, wakefulness, having a sense of selfhood or soul, the fact that there is something "that it is like" to "have" or "be" it, and the executive control system of the mind. Despite the difficulty in definition, many philosophers believe that there is a broadly shared underlying intuition about what consciousness is. As Max Velmans and Susan Schneider wrote in "The Blackwell Companion to Consciousness": "Anything that we are aware of at a given moment forms part of our consciousness, making conscious experience at once the most familiar and most mysterious aspect of our lives."


## Keyword extraction based on **nouns & verbs**

In [18]:
# ! python -m spacy download en_core_web_sm

# Load English Model
nlp = spacy.load("en_core_web_sm")

In [19]:
doc = nlp(data[10]['context'])

# extract key words
keywords = set()
for chunk in doc.noun_chunks:
    final_chunk = ""
    for token in chunk:
        if token.is_stop or token.is_punct:
            continue
        final_chunk += token.lemma_ + " "
    if final_chunk:
        keywords.add(final_chunk.strip())

for token in doc:
    if (token.pos_ == "VERB" or token.pos_ == "ADJ") and token.text not in STOP_WORDS:
        keywords.add(token.lemma_)

print(keywords)

{'host', 'american', 'online non profit encyclopedia', 'profit', 'bear', 'know', 'Jimmy Donal Wales', 'online', 'Wikipedia', 'non', 'american Internet entrepreneur', 'founder', 'co', '-', 'online moniker Jimbo'}


## Keyword extraction based on **word frequency**

In [20]:
nlp = spacy.load("en_core_web_sm")

texts = data[10]['context']

doc = nlp(texts)

word_freq = Counter(token.text.lower() for token in doc if token.is_alpha)

common_words = word_freq.most_common(10)
print(common_words)

[('the', 4), ('known', 2), ('online', 2), ('profit', 2), ('jimmy', 1), ('donal', 1), ('wales', 1), ('born', 1), ('august', 1), ('also', 1)]


## Keyword extraction based on **TF-IDF**

In [21]:
texts = [data[10]['context'],data[11]['context'], data[12]['context']]

# Custom stopwords
my_stop_words = ['wiki', 'wikipedia']

# English stopwords
stop_words = list(text.ENGLISH_STOP_WORDS)
stop_words.extend(my_stop_words)

# TF-IDF vectoriser
vectorizer = TfidfVectorizer(stop_words=stop_words)

# Apply TF-IDF conversion
tfidf_matrix = vectorizer.fit_transform(texts)

# Feature names
feature_names = vectorizer.get_feature_names_out()

# Choose a doc
# e.g. 1st doc
doc = 1
feature_index = tfidf_matrix[doc,:].nonzero()[1]
tfidf_scores = zip(feature_index, [tfidf_matrix[doc, x] for x in feature_index])

# Each word and correspond TF-IDF score
for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
    print(w, s)


adam 0.140028008402801
noah 0.140028008402801
abraham 0.140028008402801
moses 0.140028008402801
david 0.140028008402801
jesus 0.140028008402801
prophets 0.140028008402801
regard 0.140028008402801
faith 0.140028008402801
original 0.140028008402801
muhammad 0.140028008402801
did 0.140028008402801
muslims 0.140028008402801
christianity 0.140028008402801
founding 0.140028008402801
years 0.140028008402801
600 0.140028008402801
approximately 0.140028008402801
century 0.140028008402801
7th 0.140028008402801
start 0.280056016805602
medina 0.140028008402801
mecca 0.140028008402801
originated 0.140028008402801
believe 0.280056016805602
historians 0.140028008402801
sources 0.140028008402801
early 0.140028008402801
reliability 0.140028008402801
despite 0.140028008402801
civilization 0.140028008402801
islamic 0.140028008402801
developments 0.140028008402801
economic 0.140028008402801
social 0.140028008402801
political 0.140028008402801
concerns 0.280056016805602
islam 0.280056016805602
history 0.14

## Use Huggingface pre-trained model - **BART**

**Save the model locally**

In [22]:
# snapshot_download(repo_id="Andyrasika/bart_tech_keywords", allow_patterns=["*.json", "model.safetensors", "training_args.bin", "*.txt"], local_dir="./keyword_model/")
# model = vision_encoder_decoder_model_name_or_path = "./keyword_model/"

**Set up BART**

In [25]:
os.environ["TOKENIZERS_PARALLELISM"] = "True"

tokenizer = AutoTokenizer.from_pretrained("Andyrasika/bart_tech_keywords")
model = AutoModelForSeq2SeqLM.from_pretrained("Andyrasika/bart_tech_keywords")

keyword_extractor = pipeline("text2text-generation", 
                             model=model, 
                             tokenizer=tokenizer,
                             max_new_tokens=50,
                             device=0) # GPU

**Handle long context**

In [26]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

**Avoid IndexError when input exceeds model limits(1024)**

In [27]:
keywords = []
for i in tqdm(range(len(data))):
    text = data[i]['context']
    num_tokens = len(tokenizer.encode(text))
    if num_tokens >= 1024:
        split_texts = text_splitter.create_documents([data[i]['context']])
        sub_keys = []
        for t in split_texts:
            sub_keys.append(keyword_extractor(t.page_content)[0]['generated_text'])
        keywords.append(sub_keys)
    else:
        keywords.append(keyword_extractor(text)[0]['generated_text'])

  0%|          | 0/14867 [00:00<?, ?it/s]

KeyboardInterrupt: 

**Save results**

In [131]:
with open('./data/keyWords.txt', 'w') as file:
    for item in keywords:
        file.write(f"{item}\n")

## Read back *keyWords.txt*

In [28]:
key_words = []
with open('./data/keyWords.txt', 'r', encoding='utf-8') as file:
    key_words = file.readlines()
    
key_words = [line.strip() for line in key_words]

len(key_words)

14867

In [29]:
key_words

['WikiLeaks, News Leaks, Sunshine Press',
 'World War II, Europe, Japan, Soviet Union, War Crimes Trials',
 'Same-sex Marriage, United States, Marriage Equality',
 'Arab War, Casualties, Henry Laurens',
 'Capitalism, European Transformation, Adam Smith, Max Weber, Fernand Braudel, Henri Pirenne, Paul Sweezy',
 'World War I, WWI, Great War, Genocides, Influenza Pandemic',
 'Evolutionary Ideas, Jean-Baptiste Lamarck, Charles Darwin, Natural Selection',
 'Darwinian Evolution, Natural Selection, Mendelian Genetic Variability, Adaptation',
 'Republican Party, Conservatism, Progressive Party, New Deal Democrats',
 'Arab War, Casualties, Henry Laurens',
 'Jimmy Donal Wales, Wikipedia, Wikia',
 'Islam, History',
 'Great Salt Lake, Utah, Salt Water Lake, Terminal Lake',
 'Wikipedia, Jimmy Wales, Larry Sanger',
 'Arnold Schwarzenegger, Republican Presidential Candidate, John McCain',
 'On the Origin of Species, Natural Selection, Evolutionary Biology',
 'Athens, Greece',
 "Content Theory, Human 

In [30]:
for i in range(0,5):
    print(f"Text:\n{data[i]['context']}\n\nKeywords:\n{key_words[i]}\n---------------------------------------------------------\n")

Text:
WikiLeaks () is an international non-profit organisation that publishes secret information, news leaks, and classified media provided by anonymous sources. Its website, initiated in 2006 in Iceland by the organisation Sunshine Press, claims a database of 10 million documents in 10 years since its launch. Julian Assange, an Australian Internet activist, is generally described as its founder and director. Kristinn Hrafnsson is its editor-in-chief.

Keywords:
WikiLeaks, News Leaks, Sunshine Press
---------------------------------------------------------

Text:
The war in Europe concluded with an invasion of Germany by the Western Allies and the Soviet Union, culminating in the capture of Berlin by Soviet troops, the suicide of Adolf Hitler and the German unconditional surrender on 8 May 1945. Following the Potsdam Declaration by the Allies on 26 July 1945 and the refusal of Japan to surrender under its terms, the United States dropped atomic bombs on the Japanese cities of Hiroshima