# Keyword Extraction

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the preprocessed book table, keeping only the columns used in this notebook
fe_data = pd.read_csv(
    "/content/preprocessed.csv",
    usecols=["Id", "Name", "Language", "Description", "bow"],
)
fe_data.head()

Unnamed: 0,Id,Name,Language,Description,bow
0,1020396,the gospel of john,en,what sets this commentary on the fourth gospel...,francis_j._moloney michael_glazier en
1,1073868,hanslick on the musically beautiful: sixteen l...,en,the sixteen lectures by geoffrey payzant in th...,geoffrey_payzant 1-877275-49-2 en
2,1025976,microserfs,fre,génération x 1018 n° 2508 qui a connu un gros ...,douglas_coupland 10/18 fre
3,1045943,courir avec des ciseaux,fre,roman autobiographique choc courir avec des ci...,augusten_burroughs 10/18 fre
4,1027805,affinités,fre,pour tromper son ennui une demoiselle de la bo...,sarah_waters 10/18 fre


In [6]:
# Per-column count of repeated entries: total rows minus the number of distinct values.
# nunique() skips NaN by default, so missing values are included in these counts.
len(fe_data) - fe_data.nunique()

Unnamed: 0,0
Id,0
Name,85
Language,34435
Description,435
bow,1862


---
### Consider only English books

In [7]:
# Temporarily restrict processing to English-language books.
# NOTE(review): the head() preview above also shows the code "en", which this list
# does not match - confirm whether those rows are meant to be excluded.
fe_data = fe_data.loc[fe_data["Language"].isin(["eng", "en-US", "en-GB"])].copy()

---
### Extract keywords from descriptions using KeyBERT

In [8]:
pip install KeyBERT

Collecting KeyBERT
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->KeyBERT)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->KeyBERT)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->KeyBERT)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.3.8->KeyBERT)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.3.8->KeyBERT)
  Downloading nvi

In [9]:
from keybert import KeyBERT
# Build one shared KeyBERT extractor with the library's default settings; the cell
# output below shows its model files being fetched from the Hugging Face Hub.
kw_model = KeyBERT()

def get_keywords(text):
    """Return the keywords KeyBERT extracts from ``text`` as one space-separated string.

    Parameters
    ----------
    text : str
        Document to extract keywords from. Missing values (``None`` / ``NaN``),
        other non-string inputs and blank strings yield ``""`` without calling
        the model.

    Returns
    -------
    str
        Single-word keywords (unigram range, English stop words removed) joined
        by single spaces, in the order returned by ``kw_model.extract_keywords``.
    """
    # pandas represents a missing description as float NaN; passing that to the
    # model is an error, so treat anything that is not non-blank text as "no keywords".
    if not isinstance(text, str) or not text.strip():
        return ""

    extracted = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words="english")
    # Keep only the first element of each returned item (the keyword text).
    return " ".join(item[0] for item in extracted)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
import dask.dataframe as dd

# Convert the pandas DataFrame to a Dask DataFrame split into 4 partitions
dd_fe_data = dd.from_pandas(fe_data, npartitions=4)

# Lazily apply get_keywords to every description; `meta` declares the name and
# dtype of the resulting series so Dask does not have to infer them.
dd_fe_data["keywords"] = dd_fe_data["Description"].apply(get_keywords, meta=("keywords", "object"))

# Materialize only the keywords column (not the whole frame) and attach it to the
# pandas DataFrame; the assignment aligns on the shared index.
fe_data["keywords"] = dd_fe_data["keywords"].compute()


In [11]:
# Sequential fallback: run the model only when the Dask cell above has not already
# populated the column, so the extraction is not repeated over every description.
if "keywords" not in fe_data.columns:
    fe_data["keywords"] = fe_data["Description"].apply(get_keywords)

In [12]:
# Preview the first few extracted keyword strings
fe_data["keywords"].head()

Unnamed: 0,keywords
12,memphis egypt delta governor thebes
15,proverb picket bells christmas stories
21,emma paris french shes sullivan
25,moomintroll comet moominvalley adventures adve...
27,acheron greeks trojan troy helen


In [13]:
# Prepend the bag-of-words tokens to the extracted keywords, separated by one space;
# missing values in either column are treated as empty strings.
fe_data["keywords"] = fe_data["bow"].fillna("") + " " + fe_data["keywords"].fillna("")
# Both source columns are now folded into `keywords`; drop them by reassignment
# rather than mutating the frame in place.
fe_data = fe_data.drop(columns=["bow", "Description"])

---
### Remove duplicated book names

In [14]:
# Show every row whose Name already occurred earlier in the frame
fe_data[fe_data["Name"].duplicated(keep="first")]

Unnamed: 0,Id,Name,Language,keywords
3235,1061929,the moon is a harsh mistress,eng,robert_a._heinlein berkley_medallion eng lehr...
3616,1061923,the moon is a harsh mistress,eng,robert_a._heinlein blackstone_publishing eng ...
7574,1038823,blood and chocolate,eng,annette_curtis_klause delacorte_press eng wer...
9045,1098850,decline and fall,eng,evelyn_waugh everyman's_library eng librarian...
9139,1023491,the remains of the day,eng,kazuo_ishiguro faber_and_faber_ltd. eng steve...
11884,1037476,the great and secret show (book of the art #1),eng,book_of_the_art_#1 clive_barker harpercollins_...
11969,1072552,the tiger who came to tea,eng,judith_kerr harpercollinschildren’sbooks eng ...
12558,1086403,something for the weekend (leo street #1),eng,leo_street_#1 pauline_mclynn headline eng leo ...
20296,1004432,"playing with fire (inspector banks, #14)",eng,"inspector_banks,_#14 peter_robinson pan_macmil..."
21653,1050062,the anastasia syndrome and other stories,eng,mary_higgins_clark pocket_books eng anastasia...


In [15]:
# Keep only the first occurrence of each book name
fe_data = fe_data.drop_duplicates(subset="Name")

---
### Save final dataset

In [19]:
# Write the final table as comma-separated values, without the index column
fe_data.to_csv("/content/keywords.csv", index=False)