### Import Data

In [None]:
import json

def load_data(file):
    with open (file, "r", encoding="utf-8") as f:
        data = json.load(f) 
    return (data)

## Keyword Extraction

### Named entity Recognition

#### Import Libraries

In [None]:
import json
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter
import nltk 
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


In [None]:
from nltk.corpus import wordnet as wn

In [None]:
syns = wordnet.synsets("program")

#### Key Word extractor 

In [None]:

# Define a function to extract keywords
def extract_keywords(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stopwords
    stopwords_list = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwords_list]

    # Tag the tokens with their part of speech
    tagged_tokens = pos_tag(tokens)

    # Define a function to convert part of speech tags to WordNet compatible tags
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    # Lemmatize the tokens using WordNet
    lemmatizer = wordnet.WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag)) for token, tag in tagged_tokens]

    # Identify named entities using NLTK's named entity recognition (NER) module
    named_entities = ne_chunk(pos_tag(word_tokenize(text)))

    # Extract the most common lemmas and named entities
    keywords = [token for token, count in Counter(lemmatized_tokens + [chunk[0] for chunk in named_entities if hasattr(chunk, 'label')]).most_common(10)]

    return keywords

#### Implementaion

In [None]:
#download using nltk.download('punkt') if you get an nltk error
all_keywords = []
for index, story in enumerate(corpus):
    keywords = extract_keywords(story[index])
    all_keywords.append(keywords)

### TF-IDF

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import nltk
nltk.download('stopwords')

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Get the stop words for English
spacy_stop_words = nlp.Defaults.stop_words

# choose the set of english stopwords
nltk_stop_words = set(stopwords.words('english'))

# Function to preprocess the input corpus
def preprocess_text(text):
    # tokenize a story within a corpus 
    tokens = word_tokenize(text.lower())

    # remove alphanumeric characters and stop words
    cleaned_tokens = [token for token in tokens if token.isalpha() and token not in spacy_stop_words]
    return cleaned_tokens

cleaned_corpus = [preprocess_text(doc) for doc in corpus]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# vectorize using TfidfVectorizer which combines counting and noralized weighting
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

feature_names = vectorizer.get_feature_names_out()

In [None]:
import numpy as np

def get_top_keywords(tfidf_scores, feature_names, top_n):
    sorted_scores = np.argsort(tfidf_scores)[::-1]
    top_keywords = [feature_names[i] for i in sorted_scores[:top_n]]
    return top_keywords

for i, doc in enumerate(corpus):
    tfidf_scores = tfidf_matrix[i].toarray()[0]
    top_keywords = get_top_keywords(tfidf_scores, feature_names, 10)
    print(f"Document {i+1} top keywords: {top_keywords}")


##### Experimental 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define the keywords and phrases that are likely to appear in relevant stories
relevant_keywords = ['low socio-economic background', 'higher education', 'difficulties', 'overcome']

# Define the path to the file containing the stories
file_path = 'path/to/file.csv'

# Read in the stories from the file
with open(file_path, 'r', encoding='utf-8') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    stories = [row['story'] for row in csv_reader]

# Define a vectorizer that will convert the stories into a matrix of TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)

# Convert the stories into a matrix of TF-IDF features
story_matrix = vectorizer.fit_transform(stories)

# Loop through the stories and calculate the cosine similarity between each story and the relevant keywords
for i, story in enumerate(stories):
    # Convert the story into a matrix of TF-IDF features
    story_vec = vectorizer.transform([story])

    # Calculate the cosine similarity between the story and the relevant keywords
    similarity = cosine_similarity(story_vec, story_matrix[:, [vectorizer.vocabulary_.get(word) for word in relevant_keywords]])

    # If the similarity is above a certain threshold, print the story
   


### Rake_NLTK

**Use "rake_nltk.txt" to install all the necessary packages** (```pip install -r requirements.txt```).

In [None]:
from rake_nltk import Rake

r = Rake(min_length = 1)

r.extract_keywords_from_text(corpus[0])
r.get_ranked_phrases_with_scores()

### KeyBERT & KeyphraseVectorizers (best)

**Use "KeyBert_req.txt" to install all the necessary packages** (```pip install -r requirements.txt```).
However, before you install anything, make sure to fulfill the requirements below:
* Make sure you installed the following [cuda(especially nvcc)](https://nvidia.github.io/cuda-python/install.html), [spacy](https://spacy.io/usage#quickstart), [visual c++ >2017 and the windows SDK for C++](https://visualstudio.microsoft.com/visual-cpp-build-tools/). The links above should lead you to the installation instruction of each of these libraries in case the pip install of the requirements doesn't work. Visual c++ and the Windows SDK for C++ needs to be installed manually. <u><span style="background-color: #f70000">**Make sure to use Python <= 3.9.**</span><u>
    
      
    

In [None]:
from keyphrase_vectorizers import KeyphraseCountVectorizer
from keybert import KeyBERT
from paths import DATA 

#init model
kw_model = KeyBERT(model='all-MiniLM-L6-v2')
corpus = load_data(DATA)["File"]

In [None]:
#vectorizer based
keywords = kw_model.extract_keywords(
        docs=corpus, 
        vectorizer = KeyphraseCountVectorizer(spacy_pipeline='en_core_web_sm'), #passing vectorizer in, don't use keyphrase_ngram_range
        use_maxsum=True, 
        use_mmr=True,
        nr_candidates = 20,
        top_n = 10,
        diversity=0.5
        )

keywords

In [None]:
#ngram_range
keywords_2 = kw_model.extract_keywords(
        docs=corpus, 
        keyphrase_ngram_range = (1,3),
        use_maxsum=True, 
        use_mmr=True,
        stop_words ='english', 
        nr_candidates = 20,
        top_n = 10,
        diversity=0.5
        )

keywords_2

In [None]:
keyphrases = [q[0] for x in keywords for q in x]
keyphrases_2 = [q[0] for x in keywords_2 for q in x]

#change to numpy array 
import numpy as np

keyphrases = np.array(keyphrases)
keyphrases_2 = np.array(keyphrases_2)

#combine the two arrays into 2-d array
keyphrase_arr = np.stack((keyphrases, keyphrases_2), axis = 1)

#table of keyphrases
import pandas as pd
df = pd.DataFrame(keyphrase_arr, columns = ['verctorized_keyphrases', 'ngram_range_keyphrases'])
df.to_csv('keyphrases.csv', index = False)

## Data Prep

### Clustering (only for keyword based search, else skip)

In [None]:
# get the following packages if not already installed
!pip install -U scikit-learn
!pip install -U sentence-transformers

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

embedder = SentenceTransformer('all-MiniLM-L6-v2')

#keyphrases.csv in shared drive (not public data)
df = pd.read_csv('keyphrases.csv')
keyphrases = df['ngram_range_keyphrases'].to_numpy()

#embed keyphrases
corpus_embeddings = embedder.encode(keyphrases, convert_to_tensor=True)

#normalization
corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

#### K-Means

In [None]:
from sklearn.cluster import KMeans

kmeans_arr  = []

for n_clusters in range(5, 16): 
    clustering_model = KMeans(n_clusters=n_clusters)
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_
    kmeans_arr.append(np.array(cluster_assignment))

In [None]:
clustered_sentences = [[[] for _ in range(n_clusters)] for n_clusters in range(5, 16)]

for n_clusters, arr in enumerate(kmeans_arr):
    for phrase_id, cluster_id in enumerate(arr):
        clustered_sentences[n_clusters][cluster_id].append(keyphrases[phrase_id])



In [None]:
#export as json
import json
index = 10
with open(f'clustered_sentences_{index}.json', 'w') as f:

    json.dump(clustered_sentences[index], f, indent=4, sort_keys=True)

#### Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering


clustering_model_AGC= AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
clustering_model_AGC.fit(corpus_embeddings)
cluster_assignment_AGC= clustering_model_AGC.labels_
cluster_assignment_AGC

In [None]:
clustered_sentences_AGC = {}
for phrase_id, cluster_id in enumerate(cluster_assignment_AGC):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(keyphrases[phrase_id])

### Search

In [None]:
# install the following if necessary
!pip install umap-learn altair datasets tqdm 
!pip install --use-pep517 annoy
!pip install ipywidgets
# if not in a virtual environment,
# !jupyter nbextension enable --py widgetsnbextension
# if in a virtual environment,
!jupyter nbextension enable --py --sys-prefix widgetsnbextension

In [2]:
import umap
import json
import numpy as np
import pandas as pd 
import altair as alt
from tqdm import tqdm
from annoy import AnnoyIndex
# from annoy import AnnoyIndex
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from functions.preprocessing import *
from multiprocessing import Pool, cpu_count

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nlplab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
model = SentenceTransformer("multi-qa-distilbert-cos-v1")

In [None]:
# if not already filtered, load data and clean it 
narrative_df = pd.read_csv('unfiltered_sample_narratives.csv')

# make lowercase 
narrative_df['selftext'] = narrative_df['selftext'].str.lower()

# remove if there are any null values and narratives have less than 50 words
narrative_df = narrative_df.dropna()
narrative_df = narrative_df[narrative_df['selftext'].str.split().str.len().gt(50)]

narrative_df.to_csv("filtered_sample_narratives.csv")

In [4]:
# if already filtered, use the filtered (already cleaned) dataset
narrative_df = pd.read_csv("filtered_sample_narratives.csv")

# to use old narratives as queries
oldnarrative_queries = list(json.load(open("local_data.json", encoding="utf-8"))["File"]) 

# # to use keywords as queries 
# queries = list(json.load(open("clustered_sentences_10.json"))) 
# queries_combined = [" ".join(q) for q in queries]

##### Lemmatize

In [9]:
tokenized_queries = list(map(lemmatize, oldnarrative_queries))

In [6]:
#lemmatize narratives  
num_processes = 11
pool = Pool(processes=num_processes)

results = pool.imap(func=lemmatize,  
                    iterable= narrative_df['selftext'] ,
                    chunksize= 24370) 

pool.close()
pool.join()
 
narrative_df["tokenized_selftext"] = np.array([result for result in results])
narrative_df.to_csv("Tokenized_filtered_2018-01.csv", index=False)

##### Create Search Queries and Embeddings

In [7]:
#load lemmatized narratives
tokenized_narratives_df = pd.read_csv("Tokenized_filtered_2018-01.csv")

In [10]:
query_embeddings = model.encode(sentences= tokenized_queries,
                                convert_to_numpy=True,
                                show_progress_bar=True, 
                                normalize_embeddings=True)

query_search_index = AnnoyIndex(query_embeddings.shape[1], 'manhattan')

for index, embed_value in enumerate(query_embeddings):
    query_search_index.add_item(index, embed_value)

query_search_index.build(n_trees = 20, n_jobs = 3)


query_search_index.save('old-narrative-queries_search_index.ann')
np.save('old-narrative-queries_embeddings.npy', query_embeddings)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# CPU version
narratives_embedding = model.encode(sentences=tokenized_narratives_df["tokenized_selftext"], 
                                    convert_to_numpy=True, 
                                    show_progress_bar=True)


narratives_search_index = AnnoyIndex(np.array(narratives_embedding).shape[1], 'manhattan')

for index_embedding, embed_value in enumerate(tqdm(narratives_embedding)):
    narratives_search_index.add_item(index_embedding, embed_value)

narratives_search_index.build(n_trees = 20, n_jobs = -1)
np.save('2018-01_narratives_embeddings.npy', narratives_embedding)
narratives_search_index.save(f'2018-01_narratives_search_index.ann')

In [11]:
# GPU version

pool = model.start_multi_process_pool()

narratives_embedding = model.encode_multi_process(sentences=tokenized_narratives_df["tokenized_selftext"], 
                                                pool=pool)
 
narratives_search_index = AnnoyIndex(np.array(narratives_embedding).shape[1], 'manhattan')

for index_embedding, embed_value in enumerate(tqdm(narratives_embedding)):
    narratives_search_index.add_item(index_embedding, embed_value)

narratives_search_index.build(n_trees = 20, n_jobs = -1)

np.save('2018-01_narratives_embeddings.npy', narratives_embedding)
narratives_search_index.save(f'2018-01_narratives_search_index.ann')

100%|██████████| 268070/268070 [00:10<00:00, 26503.86it/s]


True

##### Load embeddings and search indexes

In [None]:
# load embeddings and search indexes for new narratives 
narratives_embedding = np.load('2018-01_narratives_embeddings.npy')
narratives_search_index = AnnoyIndex(np.array(narratives_embedding).shape[1], 'manhattan')
narratives_search_index.load('2018-01_narratives_search_index.ann')

# oldnarrative based: load embeddings and search indexes queries
query_embeddings = np.load('old-narrative-queries_embeddings.npy')
query_search_index = AnnoyIndex(np.array(query_embeddings).shape[1], 'manhattan')
query_search_index.load('old-narrative-queries_search_index.ann')

##### Simple Concatenating (for a single query)

In [None]:
#retrive nearest neighbors
results_list = []

for index, query_embedding in tqdm(enumerate(query_embeddings)): 
    similar_item_ids = narratives_search_index.get_nns_by_vector(query_embedding, n=100, include_distances=True)

    result_df = pd.DataFrame(data={
        'selftext': narrative_df['selftext'][similar_item_ids[0]],
        'title': narrative_df['title'][similar_item_ids[0]],
        'ids': narrative_df['id'][similar_item_ids[0]],
        'distance': similar_item_ids[1]
    })

    results_list.append(result_df)

results = pd.concat(results_list)
results = results.drop_duplicates(subset=['selftext'])
results = results.sort_values(by=['distance'], ascending=True)

results.to_csv("results_tokenized_full.csv", index=False)

##### Comparative Concatenating

In [31]:
def compare(id_to_distance_dictionary: dict, threshold: float) -> float:
    """
    Filters for stories which contain intersecting topics specified by the threshold. 

    :param id_to_distance_dictionary
    :param

    return: 
        tuple containing all the ids for relevant narratives and their respective distances
    """
   
    filtered_id_distance_array = []

    for comparable_key in id_to_distance_dictionary:
        # array we are currently trying to filter
        comparable_id_array = id_to_distance_dictionary[comparable_key][0]
        comparable_distance_array = id_to_distance_dictionary[comparable_key][1]

        for comparable_id_index, comparable_id in enumerate(comparable_id_array):
            percentage = 0
            for tocompare_key in id_to_distance_dictionary:
                if tocompare_key != comparable_key:
                    tocompare_id_array = id_to_distance_dictionary[tocompare_key][0]

                    if tocompare_id_array.__contains__(comparable_id): 
                        percentage += (1 / (len(id_to_distance_dictionary)-1))

            if percentage >= threshold: 
                filtered_id_distance_array.append((comparable_id, comparable_distance_array[comparable_id_index],))

    return filtered_id_distance_array

In [34]:
similar_dict = {}
for index, query_embedding in tqdm(enumerate(query_embeddings)): 
    similar_item_ids = narratives_search_index.get_nns_by_vector(
                                    query_embedding,
                                    n=1000, 
                                    include_distances=True)

    similar_dict[f"query_{index}"] = similar_item_ids


filterd_arr= compare(similar_dict, threshold= 0.5) 

16it [00:00, 21.95it/s]


In [29]:
for x in similar_dict['query_0'][0]: 
    print(x)

36
53
58
60
83
86
113
117
147
149
165
167
186
215
250
273
278
299
311
325
362
364
380
387
429
449
462
478
488
490
495
502
513
514
520
525
564
570
575
584
586
587
588
598
617
629
654
656
691
705
730
757
762
775
780
788
790
800
802
815
828
842
845
846
853
861
878
880
887
900
903
920
923
969
977
993
995
1001
1008
1033
1047
1060
1067
1068
1073
1079
1081
1093
1096
1102
1137
1139
1145
1165
1172
1181
1191
1205
1248
1249
1256
1263
1292
1293
1306
1308
1320
1321
1343
1353
1373
1374
1379
1388
1431
1456
1496
1515
1518
1532
1569
1588
1593
1595
1604
1610
1618
1626
1642
1681
1682
1684
1687
1708
1710
1724
1728
1736
1778
1782
1788
1791
1798
1799
1833
1842
1849
1851
1873
1903
1912
1923
1933
1936
1962
1964
1972
2015
2085
2088
2130
2155
2158
2160
2165
2176
2184
2188
2193
2214
2223
2230
2255
2266
2268
2307
2319
2327
2328
2347
2359
2361
2364
2369
2383
2387
2388
2397
2403
2409
2456
2490
2520
2527
2536
2543
2544
2550
2554
2562
2582
2592
2600
2643
2685
2704
2714
2732
2738
2763
2814
2827
2836
2855
2869
2875
288

In [None]:
for index, similar_item_ids in enumerate(filterd_arr): 

    new_row = {
        'selftext': narrative_df['selftext'][similar_item_ids[0]],
        'title': narrative_df['title'][similar_item_ids[0]],
        'ids': narrative_df['id'][similar_item_ids[0]],
        'distance': similar_item_ids[1]
    }
    
    if index != 0:
        filtered_results_df.append(new_row, ignore_index=True)
    else: 
        filtered_results_df = pd.DataFrame([new_row])

filtered_results_df.to_csv("Filtered_Results_2018-01.csv", index=False)

##### Results

In [None]:
# add empty row to the bottom of the dataframe
import pandas as pd 
results_df = pd.read_csv("Filtered_Results_2018-01.csv.csv")
results_df = results_df[1:]
results_df.reset_index(drop=True)

In [None]:
from functions.document_writer import docx_writer
from paths import HOME

docx_writer(num_people=5, data = results_df, paths=HOME)