 # Keyword Extraction

In [1]:
pip install yake

Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl (60 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/60.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting segtok (from yake)
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Installing collected packages: segtok, yake
Successfully installed segtok-1.5.11 yake-0.4.8


In [2]:
import yake
import pandas as pd

In [3]:
prompts = pd.read_csv('PIMS_Sample_Prompts.csv')
prompts

Unnamed: 0,Prompt,Relevance,Unnamed: 2,Legend,Unnamed: 4
0,Vessel caught misreporting catch amount,Broad,,Broad,"Relevant, but will probably yield general resu..."
1,Vessel caught falsifying fishing logs,Broad,,Specific,"Relevant, but will generate results for a very..."
2,Vessel caught with incorrect catch reports,Broad,,,
3,Vessel caught underreporting catch in North At...,Broad,,* Note: This batch of queries is not represent...,
4,Vessel caught misreporting haddock catch,Specific,,,
...,...,...,...,...,...
95,Vessel caught underreporting catch for insuran...,Broad,,,
96,Vessel caught with mislabeled fish species,Broad,,,
97,Vessel caught falsifying catch during transport,Broad,,,
98,Vessel caught misreporting catch for fishing c...,Broad,,,


In [4]:
# YAKE keyword extraction
language = 'en'
max_ngram = 1
dup_threshold = 0.9
num_keywords = 20
extractor = yake.KeywordExtractor(lan=language, n=max_ngram, dedupLim=dup_threshold, top=num_keywords, features=None)

In [5]:
all_keywords = []
for prompt in prompts['Prompt']:
    keywords = [kw[0] for kw in extractor.extract_keywords(prompt)]
    all_keywords.extend(keywords)

In [6]:
all_keywords

['Vessel',
 'amount',
 'caught',
 'misreporting',
 'catch',
 'Vessel',
 'logs',
 'caught',
 'falsifying',
 'fishing',
 'Vessel',
 'reports',
 'caught',
 'incorrect',
 'catch',
 'Atlantic',
 'North',
 'Vessel',
 'caught',
 'underreporting',
 'catch',
 'Vessel',
 'catch',
 'caught',
 'misreporting',
 'haddock',
 'Vessel',
 'records',
 'caught',
 'inaccurate',
 'fish',
 'size',
 'Vessel',
 'area',
 'caught',
 'underreporting',
 'catch',
 'protected',
 'Vessel',
 'quantities',
 'caught',
 'misreporting',
 'tuna',
 'catch',
 'Vessel',
 'species',
 'caught',
 'unrecorded',
 'fish',
 'Vessel',
 'system',
 'caught',
 'bypassing',
 'quota',
 'Vessel',
 'documentation',
 'caught',
 'illegal',
 'catch',
 'Vessel',
 'inspection',
 'caught',
 'underreporting',
 'catch',
 'Vessel',
 'gear',
 'caught',
 'banned',
 'fishing',
 'Vessel',
 'fish',
 'caught',
 'hidden',
 'compartments',
 'Vessel',
 'taxes',
 'caught',
 'misreporting',
 'catch',
 'evade',
 'Vessel',
 'catch',
 'caught',
 'underreporting',

# TF-IDF Transformation

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(all_keywords)

In [9]:
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.to_csv('tfidf_matrix.csv', index=False)

# K-means Clustering for Initial Grouping

In [10]:
from sklearn.cluster import KMeans

In [11]:
# Apply K-means clustering to TF-IDF matrix
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(tfidf_matrix)



In [12]:
# Get cluster labels for each keyword
cluster_labels = kmeans.labels_

In [13]:
# Add cluster labels to the DataFrame with keywords
keyword_clusters_df = pd.DataFrame({'Keyword': all_keywords, 'Cluster': cluster_labels})

In [14]:
keyword_clusters_df.to_csv('keyword_clusters.csv', index=False)

# Embedding Keywords with SentenceTransformer

In [15]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m194.6/227.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [16]:
from sentence_transformers import SentenceTransformer

# Load SentenceTransformer model for embedding
model = SentenceTransformer('all-mpnet-base-v2')

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
# Encode keywords into embeddings
keyword_embeddings = model.encode(keyword_clusters_df['Keyword'].tolist())

In [18]:
# Add embeddings to the DataFrame
keyword_clusters_df['Embedding'] = list(keyword_embeddings)

In [19]:
#saving the file
embedding_df = pd.DataFrame(keyword_embeddings)
embedding_df.to_csv('keyword_embeddings.csv', index=False)

# Clustering with DBSCAN for Refinement

In [20]:
from sklearn.cluster import DBSCAN

In [21]:
# Extract embeddings for clustering
embeddings = keyword_clusters_df['Embedding'].tolist()

In [22]:
# Clustering with DBSCAN
dbscan = DBSCAN(eps=0.4, min_samples=5, metric='cosine')
dbscan.fit(embeddings)

In [23]:
dbscan_labels = dbscan.labels_

In [24]:
# Add DBSCAN labels to the DataFrame
keyword_clusters_df['DBSCAN_Label'] = dbscan_labels

In [25]:
# Save final clustered keywords to CSV
keyword_clusters_df.to_csv('clustered_keywordsnew.csv', index=False)

# filtering noise

In [26]:
# Filter out the noise points
cleaned_keyword_clusters_df = keyword_clusters_df[keyword_clusters_df['DBSCAN_Label'] != -1]

In [27]:
cleaned_keyword_clusters_df.to_csv('cleaned_clustered_keywords.csv', index=False)

# labeling the clusters

In [28]:
subject_clusters = [1]
consequence_clusters = [3]
subject_labels = [0]
consequence_labels = [1]
crime_clusters = [0]
crime_labels = [9, 3, 2]
specific_lables = [7, 4]
specific_clusters = [0]

# Generate prompts

In [29]:
import random

In [30]:
# Extract keywords for each part of the prompt
subjects = cleaned_keyword_clusters_df[
    (cleaned_keyword_clusters_df['Cluster'].isin(subject_clusters)) &
    (cleaned_keyword_clusters_df['DBSCAN_Label'].isin(subject_labels))
]['Keyword'].tolist()

In [31]:
# Extract keywords for each part of the prompt
consequences = cleaned_keyword_clusters_df[
    (cleaned_keyword_clusters_df['Cluster'].isin(consequence_clusters)) &
    (cleaned_keyword_clusters_df['DBSCAN_Label'].isin(consequence_labels))
]['Keyword'].tolist()

In [32]:
# Extract keywords for each part of the prompt
crimes = cleaned_keyword_clusters_df[
    (cleaned_keyword_clusters_df['Cluster'].isin(crime_clusters)) &
    (cleaned_keyword_clusters_df['DBSCAN_Label'].isin(crime_labels))
]['Keyword'].tolist()

In [34]:
# Extract keywords for each part of the prompt
specifics = cleaned_keyword_clusters_df[
    (cleaned_keyword_clusters_df['Cluster'].isin(specific_clusters)) &
    (cleaned_keyword_clusters_df['DBSCAN_Label'].isin(specific_lables))
]['Keyword'].tolist()

In [35]:
# Generate random prompts based on our format
num_prompts = 100
formatted_prompts = []

for _ in range(num_prompts):
    subject = random.choice(subjects)
    consequence = random.choice(consequences)
    crime = random.choice(crimes)
    specific = random.choice(specifics)

    prompt = f"{subject} {consequence} {crime} {specific}."
    formatted_prompts.append(prompt)

In [37]:
# Display the generated prompts
for i, prompt in enumerate(formatted_prompts, start=1):
    print(f"Prompt {i}: {prompt}")

Prompt 1: Vessel caught underreporting endangered.
Prompt 2: Vessel caught underreporting fishing.
Prompt 3: Vessel caught falsifying fish.
Prompt 4: Vessel caught underreporting fishing.
Prompt 5: Vessel caught underreporting fish.
Prompt 6: Vessel caught misreporting endangered.
Prompt 7: Vessel caught underreporting fishing.
Prompt 8: Vessel caught avoid fish.
Prompt 9: Vessel caught underreporting fisheries.
Prompt 10: Vessel caught misreporting fishing.
Prompt 11: Vessel caught falsifying fish.
Prompt 12: Vessel caught misreporting species.
Prompt 13: Vessel caught falsifying sardine.
Prompt 14: Vessel caught misreporting fish.
Prompt 15: Vessel caught reporting fishing.
Prompt 16: Vessel caught falsifying fishing.
Prompt 17: Vessel caught misreporting fishing.
Prompt 18: Vessel caught avoid species.
Prompt 19: Vessel caught fake species.
Prompt 20: Vessel caught avoid fishing.
Prompt 21: Vessel caught unreported tuna.
Prompt 22: Vessel caught underreporting species.
Prompt 23: Ve