In [1]:
import pandas as pd

import progressbar
import os

import warnings
# Install a blanket 'ignore' filter: warnings of every category are hidden
# in all cells that run after this one.
warnings.filterwarnings('ignore')

In [2]:
# Shell escape: print the versions of the installed Jupyter components.
!jupyter --version

jupyter core     : 4.7.1
jupyter-notebook : 6.3.0
qtconsole        : not installed
ipython          : 7.16.1
ipykernel        : 5.3.4
jupyter client   : 7.0.2
jupyter lab      : 3.2.2
nbconvert        : 5.6.1
ipywidgets       : 7.6.5
nbformat         : 5.1.3
traitlets        : 4.3.3


# 1. Read processed dataset

In [4]:
# Load the processed records (the file is decoded as latin1), report the
# frame's dimensions, and preview the first two rows.
df = pd.read_csv(
    '../Data/Processed/Sent to Kamran/data.csv',
    encoding='latin1',
)
print(df.shape)
df.head(2)

(7925, 14)


Unnamed: 0,Year,Titles,Abstract,Authors,Authors with affiliations,Author Keywords,Indexed Keywords,EID,Funding Details,Funding Texts,Document Type,Open Access,text,lower_text
0,2022,Energy Efficient Layered Cluster Head Rotation...,Energy efficiency is of paramount concern in u...,Datta A.; Dasgupta M.,"Datta A., Department of Computer Applications,...",Cluster-head; Energy consumption; Routing prot...,Energy efficiency; Internet protocols; Network...,2-s2.0-85128393697,,,Article,,Energy Efficient Layered Cluster Head Rotation...,energy efficient layered cluster head rotation...
1,2022,Underwater object detection using collaborativ...,"Despite recent progress in deep learning, unde...",Cai S.; Li G.; Shan Y.,"Cai S., School of Applied Science, Beijing Inf...",Collaborative learning; Noisy samples; Underwa...,Benchmarking; Deep learning; Object recognitio...,2-s2.0-85132754967,"National Natural Science Foundation of China, ...",This work was supported by the National Natura...,Article,,Underwater object detection using collaborativ...,underwater object detection using collaborativ...


In [5]:
# Earliest and latest value of the Year column, shown as a (min, max) pair.
(df["Year"].min(), df["Year"].max())

(2011, 2022)

# 2. Create annual data chunks
* from 2011 to 2022, inclusive.

In [6]:
# Slice the full frame into one DataFrame per year and publish each slice as
# a notebook-level variable named df_<year> (df_2011 ... df_2022), printing
# the shape of every slice. At the top level of a cell, locals() is the
# notebook's global namespace, which is why the assignment creates variables.
for year in range(2011, 2023):
    variable_name = "df_{}".format(year)
    yearly_rows = df.loc[df["Year"] == year]
    locals()[variable_name] = yearly_rows
    print("%d size: %s" % (year, yearly_rows.shape))

2011 size: (340, 14)
2012 size: (424, 14)
2013 size: (428, 14)
2014 size: (482, 14)
2015 size: (601, 14)
2016 size: (607, 14)
2017 size: (618, 14)
2018 size: (758, 14)
2019 size: (950, 14)
2020 size: (896, 14)
2021 size: (1047, 14)
2022 size: (774, 14)


# 3. Keyword extraction using KeyBERT
* from: https://towardsdatascience.com/keyword-extraction-python-tf-idf-textrank-topicrank-yake-bert-7405d51cd839
* KeyBERT repo: https://github.com/MaartenGr/KeyBERT

BERT (Bidirectional Encoder Representations from Transformers) is a transformer-based model for natural language processing. Pretrained models can transform sentences or words into a language representation consisting of an array of numbers (an embedding). Sentences or words with similar latent representations (embeddings) should have similar semantic meanings. KeyBERT is an implementation that uses this approach to extract the keywords of a text.

## 3.1. Annual, up to 3-gram

In [7]:
# Settings consumed by the KeyBERT extraction cell that follows.
# Longest keyphrase, in words: upper bound of keyphrase_ngram_range.
max_ngram_size = 3
# Passed as top_n: how many keyphrases are requested for each year.
numOfKeywords = 500
# Passed as diversity together with use_mmr=True. Maximal Marginal Relevance
# (MMR), which is based on cosine similarity, diversifies the selected
# keywords / keyphrases.
diversity = 0.5

In [7]:
from keybert import KeyBERT  #!pip install keybert
import csv

# For every year, run KeyBERT on the concatenated, lower-cased `text` of that
# year's rows and save the (keyword, score) pairs it returns to
# ../Results/keywords_<diversity>div_<year>.csv with a "Keyword,Score" header.
years = range(2011, 2023)
results_dir = '../Results'

# Create the output folder up front so the open() call below cannot fail
# because the directory is missing.
os.makedirs(results_dir, exist_ok=True)

progress = progressbar.ProgressBar(max_value=len(years))

# create the keyword extractor once; the same instance serves every year
kw_model = KeyBERT()

for year in progress(years):
    # The file name is built from `diversity` so it always matches the value
    # actually used (with diversity = 0.5 this is still keywords_0.5div_<year>.csv).
    out_path = results_dir + '/keywords_' + str(diversity) + 'div_' + str(year) + '.csv'
    try:
        # Select the year's rows straight from `df`, so this cell does not
        # depend on dynamically created df_<year> variables.
        year_text = ' '.join(df.loc[df.Year == year, 'text']).lower()
        keywords_bert = kw_model.extract_keywords(
            year_text,
            keyphrase_ngram_range=(1, max_ngram_size),
            stop_words='english',
            use_mmr=True,
            diversity=diversity,
            top_n=numOfKeywords,
        )

        with open(out_path, 'w', encoding="utf-8") as out_file:
            # csv.writer quotes any field that contains a comma or a quote.
            # lineterminator='\n' with the default text-mode newline handling
            # keeps the line endings the same as the previous hand-written rows.
            writer = csv.writer(out_file, lineterminator='\n')
            writer.writerow(['Keyword', 'Score'])
            writer.writerows((keyword[0], keyword[1]) for keyword in keywords_bert)
    except UnicodeEncodeError as e:
        # Report which year failed and carry on with the remaining years
        # instead of silently abandoning the rest of the run.
        print("%d: failed to produce %s: %s" % (year, out_path, e))

 11% (2 of 17) |#              | Elapsed Time: 22:00:41 ETA:  12 days, 14:10:48

KeyboardInterrupt: 