### Import Data

In [None]:
import json

def load_data(file):
    """Read the JSON document stored at ``file`` and return the parsed object.

    :param file: path of a UTF-8 encoded JSON file
    :return: the Python object produced by parsing the file's content
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.loads(handle.read())

## Keyword Extraction

### Named Entity Recognition

#### Import Libraries

In [None]:
import json
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter
import nltk 
# Download every NLTK resource that the keyword-extraction cells below rely on.
# 'punkt', 'maxent_ne_chunker' and 'words' were missing: word_tokenize and ne_chunk
# raise a LookupError without them.
# NOTE(review): recent NLTK releases may use differently named packages for some of
# these resources - confirm against the installed NLTK version.
for resource in (
    'stopwords',                   # stopwords.words('english')
    'averaged_perceptron_tagger',  # pos_tag
    'wordnet',                     # WordNetLemmatizer / wordnet.synsets
    'punkt',                       # word_tokenize
    'maxent_ne_chunker',           # ne_chunk
    'words',                       # word list used by the named-entity chunker
):
    nltk.download(resource)


In [None]:
from nltk.corpus import wordnet as wn

In [None]:
# Quick WordNet lookup of the synsets for "program".
# NOTE(review): `syns` is not read again in the part of the notebook visible here.
syns = wordnet.synsets("program")

#### Keyword Extractor

In [None]:

# Define a function to extract keywords
def extract_keywords(text):
    """Return up to ten keywords for ``text``.

    Candidates are the lemmas of all non-stopword tokens of the lower-cased text
    plus the named entities that NLTK's chunker finds in the original-cased text.
    The ten most frequent candidates are returned.

    :param text: raw document as a string
    :return: list of at most 10 keyword strings, most frequent first
    """
    def get_wordnet_pos(treebank_tag):
        """Map a Penn Treebank tag onto the matching WordNet POS constant."""
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        if treebank_tag.startswith('V'):
            return wordnet.VERB
        if treebank_tag.startswith('R'):
            return wordnet.ADV
        # Nouns and every other tag: the lemmatizer only accepts WordNet POS
        # values, so fall back to noun (its default) instead of returning ''.
        return wordnet.NOUN

    # Tokenize the lower-cased text and remove stopwords (set => fast membership tests)
    stopword_set = set(stopwords.words('english'))
    tokens = [token for token in word_tokenize(text.lower()) if token not in stopword_set]

    # Tag the remaining tokens with their part of speech and lemmatize them.
    # WordNetLemmatizer comes from nltk.stem; the wordnet corpus reader has no such attribute.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        for token, tag in pos_tag(tokens)
    ]

    # Identify named entities using NLTK's named entity recognition (NER) module
    named_entities = ne_chunk(pos_tag(word_tokenize(text)))

    # Only subtrees carry a label; join their leaves so each entity is a plain string
    # (chunk[0] would be a (word, tag) tuple and would drop the rest of the entity).
    entity_names = [
        ' '.join(word for word, _tag in chunk.leaves())
        for chunk in named_entities
        if hasattr(chunk, 'label')
    ]

    # Extract the most common lemmas and named entities
    candidate_counts = Counter(lemmatized_tokens + entity_names)
    keywords = [keyword for keyword, _count in candidate_counts.most_common(10)]

    return keywords

#### Implementation

In [None]:
#download using nltk.download('punkt') if you get an nltk error
# NOTE: `corpus` is assigned in the KeyBERT section further down (load_data(DATA)["File"]);
# that cell has to be executed before this one.
# Each element of `corpus` is passed to extract_keywords as a whole; the previous
# `story[index]` picked a single element out of the story instead.
all_keywords = [extract_keywords(story) for story in corpus]

### TF-IDF

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import nltk
nltk.download('stopwords')

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Get the stop words for English
spacy_stop_words = nlp.Defaults.stop_words

# choose the set of english stopwords
# NOTE(review): nltk_stop_words is not referenced by the code visible in this notebook;
# preprocess_text filters with spacy_stop_words only.
nltk_stop_words = set(stopwords.words('english'))

# Function to preprocess the input corpus
def preprocess_text(text):
    """Tokenize one story and keep only its informative words.

    The text is lower-cased and tokenized; a token survives when it is purely
    alphabetic and is not contained in ``spacy_stop_words``.

    :param text: one story of the corpus, as a string
    :return: list of the surviving tokens in their original order
    """
    kept = []
    for word in word_tokenize(text.lower()):
        if not word.isalpha():
            continue  # punctuation, numbers and mixed tokens are dropped
        if word in spacy_stop_words:
            continue
        kept.append(word)
    return kept

# One token list per document of the corpus, in corpus order.
cleaned_corpus = list(map(preprocess_text, corpus))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize using TfidfVectorizer, which combines counting and normalized weighting.
# The vectorizer is fitted on the preprocessed stories (cleaned_corpus) rather than on the
# raw corpus, so stop words and non-alphabetic tokens cannot surface as keywords.
# TfidfVectorizer expects strings, hence each token list is joined back into one string.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([" ".join(tokens) for tokens in cleaned_corpus])

feature_names = vectorizer.get_feature_names_out()

In [None]:
import numpy as np

def get_top_keywords(tfidf_scores, feature_names, top_n):
    """Pick the ``top_n`` features with the largest TF-IDF score.

    :param tfidf_scores: 1-D sequence of scores, one per feature
    :param feature_names: sequence of feature names indexed like ``tfidf_scores``
    :param top_n: number of keywords to return
    :return: list of feature names ordered from highest to lowest score
    """
    # argsort is ascending; reversing it puts the highest-scoring positions first
    descending = np.flip(np.argsort(tfidf_scores), axis=0)
    best = descending[:top_n]
    return [feature_names[position] for position in best]

# Report the ten highest-scoring TF-IDF terms of every document (numbered from 1).
for i in range(len(corpus)):
    tfidf_scores = tfidf_matrix[i].toarray().ravel()  # dense 1-D scores of document i
    top_keywords = get_top_keywords(tfidf_scores, feature_names, 10)
    print(f"Document {i+1} top keywords: {top_keywords}")


##### Experimental 

In [None]:
import csv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define the keywords and phrases that are likely to appear in relevant stories
relevant_keywords = ['low socio-economic background', 'higher education', 'difficulties', 'overcome']

# Define the path to the file containing the stories (placeholder - point it at the real CSV)
file_path = 'path/to/file.csv'

# Minimum cosine similarity for a story to be reported (placeholder value - tune for the data)
similarity_threshold = 0.1

# Read in the stories from the file; every row must provide a 'story' column
with open(file_path, 'r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    stories = [row['story'] for row in csv_reader]

# Define a vectorizer that will convert the stories into a matrix of TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)

# Convert the stories into a matrix of TF-IDF features (one row per story)
story_matrix = vectorizer.fit_transform(stories)

# Project the keyword list into the same TF-IDF space as a single pseudo-document.
# Looking the phrases up in vectorizer.vocabulary_ cannot work: the vocabulary holds
# single words, so multi-word phrases map to None and break the column selection.
keyword_vec = vectorizer.transform([' '.join(relevant_keywords)])

# Cosine similarity between every story (rows) and the keyword vector: one score per story.
# Both operands have one column per vocabulary term, as cosine_similarity requires.
similarities = cosine_similarity(story_matrix, keyword_vec).ravel()

# If the similarity is above the threshold, print the story
for i, story in enumerate(stories):
    similarity = similarities[i]
    if similarity > similarity_threshold:
        print(f"Story {i} (similarity {similarity:.3f}):\n{story}\n")
   


### Rake_NLTK

**Use "rake_nltk.txt" to install all the necessary packages** (```pip install -r rake_nltk.txt```).

In [None]:
from rake_nltk import Rake

# Rake keyword extractor, created with min_length=1
# (per the rake_nltk API this is the minimum number of words per phrase - verify)
r = Rake(min_length = 1)

# Only the first element of `corpus` is processed here
r.extract_keywords_from_text(corpus[0])
# Last expression of the cell: displays the ranked phrases together with their scores
r.get_ranked_phrases_with_scores()

### KeyBERT & KeyphraseVectorizers (best)

**Use "KeyBert_req.txt" to install all the necessary packages** (```pip install -r KeyBert_req.txt```).
However, before you install anything, make sure to fulfill the requirements below:
* Make sure you have installed the following: [CUDA (especially nvcc)](https://nvidia.github.io/cuda-python/install.html), [spaCy](https://spacy.io/usage#quickstart), and [Visual C++ > 2017 and the Windows SDK for C++](https://visualstudio.microsoft.com/visual-cpp-build-tools/). The links above lead to the installation instructions for each of these in case installing the requirements with pip doesn't work. Visual C++ and the Windows SDK for C++ need to be installed manually. <u><span style="background-color: #f70000">**Make sure to use Python <= 3.9.**</span></u>
    
      
    

In [None]:
from keyphrase_vectorizers import KeyphraseCountVectorizer
from keybert import KeyBERT
from paths import DATA 

#init model
kw_model = KeyBERT(model='all-MiniLM-L6-v2')
# `corpus` is the "File" entry of the JSON document at DATA (load_data is defined in the
# first code cell of the notebook).
# NOTE(review): the NER, TF-IDF and Rake cells above also read `corpus`, so on a fresh
# kernel this cell has to be executed before them.
corpus = load_data(DATA)["File"]

In [None]:
#vectorizer based
# Keyphrase extraction over the whole corpus; candidate phrases come from a
# KeyphraseCountVectorizer built on the spaCy 'en_core_web_sm' pipeline.
# NOTE(review): use_maxsum and use_mmr are both enabled; presumably KeyBERT applies only
# one of the two diversification strategies - confirm which one against the KeyBERT docs.
keywords = kw_model.extract_keywords(
        docs=corpus, 
        vectorizer = KeyphraseCountVectorizer(spacy_pipeline='en_core_web_sm'), #passing vectorizer in, don't use keyphrase_ngram_range
        use_maxsum=True, 
        use_mmr=True,
        nr_candidates = 20,
        top_n = 10,
        diversity=0.5
        )

# Last expression of the cell: displays the extraction result
keywords

In [None]:
#ngram_range
# Second extraction over the same corpus; here candidates are word n-grams of length 1 to 3
# with English stop words removed, instead of the keyphrase vectorizer used above.
# NOTE(review): use_maxsum and use_mmr are both enabled; presumably KeyBERT applies only
# one of the two diversification strategies - confirm which one against the KeyBERT docs.
keywords_2 = kw_model.extract_keywords(
        docs=corpus, 
        keyphrase_ngram_range = (1,3),
        use_maxsum=True, 
        use_mmr=True,
        stop_words ='english', 
        nr_candidates = 20,
        top_n = 10,
        diversity=0.5
        )

# Last expression of the cell: displays the extraction result
keywords_2

In [None]:
import numpy as np
import pandas as pd

# Flatten the per-document results into one array of phrases per strategy.
# assumes `keywords` / `keywords_2` are lists (one per document) of (keyphrase, score)
# pairs - only the phrase (element 0) is kept.
keyphrases = np.array([q[0] for x in keywords for q in x])
keyphrases_2 = np.array([q[0] for x in keywords_2 for q in x])

# Table of keyphrases, one column per extraction strategy. The two strategies need not
# return the same number of phrases, so the columns are built from Series: pandas pads
# the shorter column with NaN, whereas np.stack raises on arrays of unequal length.
# The column name 'verctorized_keyphrases' (sic) is kept because files written earlier
# and their consumers may rely on it.
df = pd.DataFrame({
    'verctorized_keyphrases': pd.Series(keyphrases),
    'ngram_range_keyphrases': pd.Series(keyphrases_2),
})

# 2-d array view of the same table (rows = position, columns = strategy)
keyphrase_arr = df.to_numpy()

df.to_csv('keyphrases.csv', index = False)

## Data Prep

Make sure to install Microsoft Visual C++ 14.0 or greater and the Windows SDK for your specific OS. Both are available through the "Microsoft C++ Build Tools" installer: https://visualstudio.microsoft.com/visual-cpp-build-tools/. In the installer, look under Individual Components > Compilers, build tools, and runtimes > MSVC … (latest).

### Clustering

In [None]:
# get the following packages if not already installed
!pip install -U scikit-learn
!pip install -U sentence-transformers

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

embedder = SentenceTransformer('all-MiniLM-L6-v2')

#keyphrases.csv in shared drive (not public data)
df = pd.read_csv('keyphrases.csv')
# Empty cells are read back as NaN; drop them so that only real phrases are embedded
keyphrases = df['ngram_range_keyphrases'].dropna().to_numpy()

#embed keyphrases
# NumPy output is requested because everything downstream (np.linalg.norm, scikit-learn
# clustering) works on NumPy arrays; a torch tensor, especially one living on a GPU,
# cannot be handed to those functions.
corpus_embeddings = embedder.encode(keyphrases, convert_to_numpy=True)

#normalization: scale every embedding (row) to unit L2 norm
corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

#### K-Means

In [None]:
from sklearn.cluster import KMeans

# Fixed seed so that the cluster assignments (and anything exported from them) are
# reproducible across kernel restarts; K-Means initialisation is random otherwise.
KMEANS_SEED = 42

# kmeans_arr[i] holds the label array of the run with (5 + i) clusters
kmeans_arr  = []

for n_clusters in range(5, 16):
    clustering_model = KMeans(n_clusters=n_clusters, random_state=KMEANS_SEED)
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_
    kmeans_arr.append(np.array(cluster_assignment))

In [None]:
# clustered_sentences[i] holds the clusters of the i-th K-Means run in kmeans_arr:
# a list with one sub-list of keyphrases per cluster label.
clustered_sentences = []

for labels in kmeans_arr:
    # Size each run from its own labels instead of repeating the range used for fitting,
    # so this cell cannot drift out of sync with the cell that fills kmeans_arr.
    clusters = [[] for _ in range(int(labels.max()) + 1)]
    for phrase_id, cluster_id in enumerate(labels):
        clusters[cluster_id].append(keyphrases[phrase_id])
    clustered_sentences.append(clusters)



In [None]:
#export as json
import json
# `index` selects one K-Means run by its position in clustered_sentences, not by its
# number of clusters.
# NOTE(review): with the range(5, 16) used when fitting, position 10 is the 15-cluster run
# although the file is named clustered_sentences_10.json - confirm which one is intended.
index = 10
with open(f'clustered_sentences_{index}.json', 'w') as f:

    json.dump(clustered_sentences[index], f, indent=4, sort_keys=True)

#### Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering


# No fixed number of clusters is requested (n_clusters=None); a distance_threshold of 1.5
# is passed instead. The model is fitted on corpus_embeddings.
clustering_model_AGC= AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
clustering_model_AGC.fit(corpus_embeddings)
cluster_assignment_AGC= clustering_model_AGC.labels_
# Last expression of the cell: displays the fitted labels
cluster_assignment_AGC

In [None]:
# Group the keyphrases by their agglomerative-clustering label: {label: [keyphrase, ...]}.
# The groups go into clustered_sentences_AGC; the earlier version wrote into the K-Means
# result `clustered_sentences` and left this dict empty.
clustered_sentences_AGC = {}
for phrase_id, cluster_id in enumerate(cluster_assignment_AGC):
    # int(): plain Python keys keep the dict JSON-serialisable
    clustered_sentences_AGC.setdefault(int(cluster_id), []).append(keyphrases[phrase_id])

#### Fast Clustering

### Search

In [None]:
# install the following if necessary
!pip install umap-learn altair datasets tqdm 
!pip install --use-pep517 annoy
!pip install ipywidgets
# if not in a virtual environment,
# !jupyter nbextension enable --py widgetsnbextension
# if in a virtual environment,
!jupyter nbextension enable --py --sys-prefix widgetsnbextension

In [None]:
#instantiate cohere and import libraries
# import umap
import json
import numpy as np
import pandas as pd 
import altair as alt
from tqdm import tqdm
from annoy import AnnoyIndex
# from annoy import AnnoyIndex
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

In [None]:
# Sentence-embedding model; the cells below use it to encode both the queries and the narratives.
model = SentenceTransformer("multi-qa-distilbert-cos-v1")

In [None]:
#load data
df = pd.read_csv('sample_narratives.csv')

# One text per post: "<title>. <selftext>", lower-cased.
# Missing values are replaced by empty strings first; otherwise a NaN in either column
# would turn the whole concatenated narrative into NaN.
narrative_df = pd.DataFrame({
    'id': df.id,
    'narratives': (df['title'].fillna('') + '. ' + df['selftext'].fillna('')).str.lower(),
})

# to use old narratives as queries
# (context manager so the file handle is closed once the JSON is parsed)
with open("local_data.json", encoding="utf-8") as query_file:
    oldnarrative_queries = list(json.load(query_file)["File"])

# # to use keywords as queries 
# queries = list(json.load(open("clustered_sentences_10.json"))) 
# queries_combined = [" ".join(q) for q in queries]

In [None]:
from functions.preprocessing import *
from multiprocessing import Pool, cpu_count, Manager

# lemmatize queries
tokenized_queries = list(map(lemmatize, oldnarrative_queries ))

# lemmatize the new narratives in parallel, one chunk per worker process
num_processes = cpu_count()

# get the series that contains the narratives
narratives_series = narrative_df["narratives"].to_numpy()

# np.array_split copes with lengths that are not a multiple of num_processes: it returns
# num_processes consecutive, nearly equal chunks, so no manual remainder handling is needed.
chunks = np.array_split(narratives_series, num_processes)

# Pool.imap yields the results in the order of `chunks`, so a plain list preserves the
# order (no Manager needed). The context manager shuts the workers down even when
# lemmatize_array raises. Wrapping the result iterator in tqdm reports finished chunks.
with Pool(processes=num_processes) as pool:
    result_arr = list(tqdm(pool.imap(lemmatize_array, chunks), total=len(chunks)))





# narratives_series =narrative_df["narratives"].to_numpy()
# ch = np.split(narratives_series[:-(narrative_df.shape[0] % 20)], 20)
# ch[-1] = np.append(ch[-1], narratives_series[-(narratives_series.shape[0] % 20):])

In [None]:
# result_arr holds one result per chunk; flatten it to one entry per narrative so that the
# rows line up with narrative_df (a frame built from result_arr directly would have one
# row per chunk).
# assumes lemmatize_array returns one item per input narrative, in input order - TODO confirm
tokenized_narratives = [tokens for chunk_result in result_arr for tokens in chunk_result]

if len(tokenized_narratives) != len(narrative_df):
    raise ValueError(
        f"expected {len(narrative_df)} tokenized narratives, got {len(tokenized_narratives)}"
    )

# dtype=object keeps every per-narrative result in a single cell
result_df = pd.Series(
    tokenized_narratives,
    index=narrative_df.index,
    name="tokenized_narratives",
    dtype=object,
).to_frame()

merged_narrative_df = pd.concat([narrative_df, result_df], axis=1)
merged_narrative_df.to_csv("Tokenized_narratives.csv")

##### Create Search Queries and Embeddings

In [None]:
# Embed the queries (the old narratives) and index the vectors with Annoy.
# NOTE(review): the vectors are L2-normalised (normalize_embeddings=True) while the index
# metric is 'manhattan' - confirm that this metric is intended.
query_embeddings = model.encode(
    sentences=oldnarrative_queries,
    convert_to_numpy=True,
    show_progress_bar=True,
    normalize_embeddings=True,
)

# One Annoy item per query; the item id is the query's position in query_embeddings
embedding_dim = query_embeddings.shape[1]
query_search_index = AnnoyIndex(embedding_dim, 'manhattan')
for item_id, vector in enumerate(query_embeddings):
    query_search_index.add_item(item_id, vector)

query_search_index.build(n_trees=20, n_jobs=3)

# Persist index and embeddings (old-narrative based queries)
query_search_index.save('old-narrative_queries_search-index.ann')
np.save('old-narrative_queries_embeddings.npy', query_embeddings)

## keywords
# query_search_index.save('old-narrative_query_search-index.ann')
# np.save('queries_embeddings.npy', query_embeddings)

In [None]:
# Embed every narrative in a single (non-parallelized) pass and store the vectors on disk.
narratives_embedding = model.encode(
    sentences=narrative_df['narratives'].to_numpy(),
    convert_to_numpy=True,
    show_progress_bar=True,
    normalize_embeddings=True,
)
np.save('narratives_embeddings_all.npy', narratives_embedding)

# Build the Annoy index; the item id is the narrative's position in narratives_embedding
vector_length = np.array(narratives_embedding).shape[1]
narratives_search_index = AnnoyIndex(vector_length, 'manhattan')
for position, vector in enumerate(tqdm(narratives_embedding)):
    narratives_search_index.add_item(position, vector)

narratives_search_index.build(n_trees=20, n_jobs=-1)
narratives_search_index.save('narratives_search_index_all.ann')

##### Load embeddings and search indexes

In [None]:
# load embeddings and search indexes for new narratives
# The file names match what the embedding cell of this notebook writes (suffix '_all'),
# so the loaded vectors correspond to the rows of narrative_df. The names without the
# suffix are not produced anywhere in the visible notebook.
narratives_embedding = np.load('narratives_embeddings_all.npy')
narratives_search_index = AnnoyIndex(narratives_embedding.shape[1], 'manhattan')
narratives_search_index.load('narratives_search_index_all.ann')

# oldnarrative based: load embeddings and search indexes queries
query_embeddings = np.load('old-narrative_queries_embeddings.npy')
query_search_index = AnnoyIndex(query_embeddings.shape[1], 'manhattan')
query_search_index.load('old-narrative_queries_search-index.ann')

# # keyword based: load embeddings and search indexes queries
# query_embeddings = np.load('queries_embeddings.npy')
# query_search_index = AnnoyIndex(np.array(query_embeddings).shape[1], 'manhattan')
# query_search_index.load('query_search_index.ann')

##### Simple Concatenating

In [None]:
#retrieve nearest neighbors

# For every query, fetch its 1000 nearest narratives and collect them as one frame per
# query; the frames are concatenated once at the end instead of growing `results` inside
# the loop.
neighbour_frames = []

for qe in tqdm(query_embeddings):
    neighbour_ids, distances = narratives_search_index.get_nns_by_vector(qe, n=1000, include_distances=True)

    # Annoy item ids are row positions (the index-building cell adds items via enumerate),
    # hence positional .iloc rather than label-based lookup.
    hits = narrative_df.iloc[neighbour_ids]
    neighbour_frames.append(pd.DataFrame(data={'texts': hits['narratives'],
                                               'ids': hits['id'],
                                               'distance': distances}))

if neighbour_frames:
    results = pd.concat(neighbour_frames)
else:
    # no queries: keep the expected columns so later cells still work
    results = pd.DataFrame(columns=['texts', 'ids', 'distance'])

# A narrative can be a neighbour of several queries. Sort ascending first so that
# drop_duplicates keeps the row with the smallest distance for each text (dropping
# duplicates before sorting kept whichever query happened to come first).
# NOTE(review): the final order is largest distance first, as before - confirm that the
# least similar narratives are really meant to be listed first.
results = (
    results
    .sort_values(by=['distance'])
    .drop_duplicates(subset=['texts'], keep='first')
    .sort_values(by=['distance'], ascending=False)
)

results

##### Comparative Concatenating

In [None]:
def compare(id_to_distance_dictionary: dict, threshold: float = 50.0) -> list:
    """
    Filters for narratives that are shared by more than ``threshold`` percent of the queries.

    For every narrative id the function counts in how many queries' neighbour lists it
    occurs and keeps the id when that share, expressed as a percentage of all queries,
    is strictly greater than ``threshold``.

    :param id_to_distance_dictionary: maps a query key to a sequence whose first element
        is the list of neighbour ids of that query (e.g. an ``(ids, distances)`` pair)
    :param threshold: minimum percentage (0-100) of queries that must contain an id;
        the default of 50.0 is a placeholder and should be tuned for the data

    return:
        list of the ids that pass the filter, in order of first appearance
    """
    query_count = len(id_to_distance_dictionary)
    if query_count == 0:
        return []

    # id -> number of queries whose neighbour list contains it
    # (dict insertion order preserves the order of first appearance)
    occurrences = {}
    for entry in id_to_distance_dictionary.values():
        # dict.fromkeys de-duplicates within one query while keeping the order, so an id
        # is counted at most once per query
        for narrative_id in dict.fromkeys(entry[0]):
            occurrences[narrative_id] = occurrences.get(narrative_id, 0) + 1

    final_array = [
        narrative_id
        for narrative_id, hits in occurrences.items()
        if hits / query_count * 100 > threshold
    ]

    return final_array

In [None]:
# For each query embedding, store what Annoy returns for its 1000 nearest narratives
# (requested with include_distances=True) under the key "query_<position>".
similar_dict = {}
for index, vector in tqdm(enumerate(query_embeddings)):
    similar_dict[f"query_{index}"] = narratives_search_index.get_nns_by_vector(
        vector, n=1000, include_distances=True
    )

In [None]:
# Sanity check: number of entries in similar_dict (one per query embedding processed above)
len(similar_dict)

In [None]:
# compare() takes the percentage threshold as its second argument; pass it explicitly.
# 50.0 is a placeholder value - tune for the data.
filterd_arr= compare(similar_dict, threshold=50.0)

##### Results

In [None]:
# add empty row to the bottom of the dataframe
# NOTE(review): `results` is rebound to itself here, so every re-run of this cell appends
# one more empty row.
empty_row = pd.DataFrame({'texts': [''], 'ids': [''], 'distance': [0]})
results = pd.concat([results, empty_row])

In [None]:
# Renumber the rows 0..n-1 and persist the table.
# NOTE(review): to_csv also writes the row index, so reading results.csv back without
# index_col yields an extra unnamed column - confirm that downstream code expects it.
results = results.reset_index(drop=True)
results.to_csv('results.csv')

In [None]:
import pandas as pd

# Reload the table written to results.csv
# (presumably so this section can be run without repeating the search - verify)
results = pd.read_csv('results.csv')

In [None]:
from paths import HOME
from functions.document_writer import docx_writer

# Hand the results table to the project's docx_writer helper.
# presumably distributes the narratives across num_people documents written under HOME -
# verify against functions/document_writer
docx_writer(num_people=5, data = results, path = HOME)