##  count occurrences of verbs 

In [1]:
!pip install pyspark



In [2]:
# Create an RDD from a file
from pyspark import SparkContext

# Initialize a SparkContext
sc = SparkContext.getOrCreate()

# Load the csv file into an RDD
debates_rdd = sc.textFile("hdfs://namenode:9000/un-general-debates.csv").map(lambda line: line.split(',')[-1])

# Load the text file into an RDD
verbs_rdd = sc.textFile("hdfs://namenode:9000/all_verbs.txt")

# Load the text file into an RDD
verb_dict_rdd = sc.textFile("hdfs://namenode:9000/verb_dict.txt")


In [3]:
import re

# Filter out None values
debates_rdd = debates_rdd.filter(lambda line: line is not None)

# Remove empty lines
text_rdd_clean = debates_rdd.filter(lambda line: line.strip())


In [4]:
# Remove punctuation and handle lowercase
def tokenize_text(text):
    tokens = re.findall(r"\b(?:[A-Za-z]\.){2,}|\b[A-Za-z]+(?:['-][A-Za-z]+)*|\d+(?:-\d+)*\b", text.lower())
    return tokens

text_rdd_clean = debates_rdd.filter(lambda line: line.strip()).flatMap(tokenize_text)

In [5]:
# Load in verb list
verb_list = verbs_rdd.collect() 

# filter verb match in verb list
verbs_in_text_rdd = text_rdd_clean.filter(lambda word: word in verb_list)

In [6]:
# Split each line by commas
verb_dict_split = verb_dict_rdd.map(lambda line: line.split(','))

# Create (verb form, root verb) for each form and collect as a dictionary
verb_dict_flat = verb_dict_split.flatMap(lambda forms: [(form, forms[0]) for form in forms])

verb_dict = verb_dict_flat.collectAsMap()

In [7]:
# Transfer in different tenses 
def normalize_verb(verb, verb_dict_flat):
    if verb in verb_dict:
        return verb_dict[verb] # Return the infinitive form of the verb in the dictionary
    else:
        return verb # If the verb does not exist in the dictionary, return the original verb

# regular verb
verbs_normalized_rdd = verbs_in_text_rdd.map(lambda verb: normalize_verb(verb, verb_dict))
    
# Count the frequently used verbs
verb_counts_rdd = verbs_normalized_rdd.map(lambda verb: (verb, 1)).reduceByKey(lambda a, b: a + b)

# Count the top 10 frequently used verbs
top_10_verbs = verb_counts_rdd.sortBy(lambda x: x[1], ascending=False).take(10)

# Display the results in the format 
for verb, count in top_10_verbs:
    print(f"('{verb}', {count})")

('be', 350853)
('have', 132533)
('state', 41644)
('make', 26852)
('support', 25705)
('take', 22237)
('continue', 21837)
('develop', 21146)
('need', 19186)
('do', 18590)


In [8]:
!pip install --upgrade typing_extensions

Requirement already up-to-date: typing_extensions in /opt/conda/lib/python3.8/site-packages (4.12.2)


In [9]:
!pip install torch sentence-transformers



In [10]:
!pip show typing_extensions

Name: typing-extensions
Version: 4.12.2
Summary: Backported and Experimental Type Hints for Python 3.8+
Home-page: None
Author: None
Author-email: "Guido van Rossum, Jukka Lehtosalo, Łukasz Langa, Michael Lee" <levkivskyi@gmail.com>
License: None
Location: /opt/conda/lib/python3.8/site-packages
Requires: 
Required-by: torch, huggingface-hub, bokeh


In [11]:
!pip install numpy



In [12]:
!pip install tqdm



In [13]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from tqdm.notebook import tqdm

# Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')


  from tqdm.autonotebook import tqdm, trange


In [20]:
"""
The code snippet (How to make Faiss run faster) below has been sourced from
https://github.com/facebookresearch/faiss/wiki/How-to-make-Faiss-run-faster
The code snippet appears in its original form to speed up query
"""

# Function to encode complete debate texts and create a Faiss IVFPQ index
def encode_and_index(debate_texts, model, batch_size=512, nlist=100):
    all_vectors = []
    """
    The code snippet (tqdm) below has been sourced from
    https://github.com/tqdm/tqdm
    I have applied to check process status.
    """
    # Encode full debate texts in batches with a progress bar
    for i in tqdm(range(0, len(debate_texts), batch_size), desc="Encoding Loading"):
        batch = debate_texts[i:i+batch_size]
        all_vectors.append(model.encode(batch))
    
    """
    The code snippet (IVFPQ) below has been sourced from
    https://github.com/facebookresearch/faiss/blob/main/tutorial/python/3-IVFPQ.py
    I have applied to speed up query.
    """
    # Combine all encoded vectors
    debate_vectors = np.vstack(all_vectors)
    dimension = 384
    
    # Create a Faiss index with IVFPQ for faster searches
    quantizer = faiss.IndexFlatL2(dimension)
    indexIVF = faiss.IndexIVFPQ(quantizer, dimension, nlist, 4, 8)
    
    # Train the index on debate vectors
    indexIVF.train(debate_vectors)
    
    # Add the vectors to the index
    indexIVF.add(debate_vectors)
    
    # Enable precomputed lookup tables to speed up searches
    indexIVF.use_precomputed_table = True

    return indexIVF, debate_vectors


In [21]:
# Function to search for the most similar debate to the query
def search_similar_debate(query_sentence, model, indexIVF, debate_texts, nprobe=8):
    # Encode the query sentence into a vector
    query_vector = model.encode([query_sentence])
    
    # Set the number of clusters to search through
    indexIVF.nprobe = nprobe
    
    # Search for the most similar debate
    D, I = indexIVF.search(query_vector, k=1)
    
    # Return the most similar debate based on the search result
    most_similar_debate = debate_texts[I[0][0]]
    return most_similar_debate


In [22]:
import pandas as pd

# Read the CSV file and choose text row
df = pd.read_csv('un-general-debates.csv')
debates_list = df['text'].tolist() 


# Encode the debate texts and store them in a Faiss index
indexIVF, debate_vectors = encode_and_index(debates_list, model, batch_size=512, nlist=100)


HBox(children=(HTML(value='Encoding Loading'), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [23]:
# The query sentence
query_sentence = "Global climate change is both a serious threat to our planet and survival."

# Search for the most similar debate content to the query
most_similar_debate = search_similar_debate(query_sentence, model, indexIVF, debates_list, )

# Output the most similar debate text
print("The Most similar to the query is:")
print(most_similar_debate)



The Most similar to the query is:
At the outset of my 
statement, I wish to offer the President and our  
newly appointed Secretary-General my warmest 
congratulations and best wishes for the success of the 
sixty-second session of the General Assembly, which 
has as its main focus the impact of climate change on 
global peace and development. 
 As the recently concluded High-level Event 
revealed, there is now a greater awareness of that issue 
and of the need to address them with greater urgency. 
Like the legendary Rip Van Winkle, we appear to have 
finally awakened from a long and deep slumber, only 
to find that the world around us is in serious danger of 
degradation. Our environment has become frightening. 
As a result of the increased consumption of fossil fuels 
in past decades, the amount of carbon dioxide in the 
atmosphere has increased by more than 25 per cent, 
which will, if unmitigated, eventually lead to a rise in 
the Earth’s temperature of more than 5 degrees in the 