### Import Data

In [None]:
import json

def load_data(file):
    """Parse the JSON document stored at ``file`` and return its content.

    The file is read as UTF-8 text and the object produced by ``json.load``
    (dict, list, ...) is handed back unchanged.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

## Keyword Extraction

### Named Entity Recognition

#### Import Libraries

In [None]:
import json
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter
import nltk 
# Resources needed by the NER-based keyword extractor in this section:
#   stopwords                   - stop-word filtering
#   punkt                       - models behind word_tokenize
#   averaged_perceptron_tagger  - pos_tag
#   wordnet                     - WordNetLemmatizer
#   maxent_ne_chunker, words    - ne_chunk
# NOTE(review): recent NLTK releases may ask for differently named packages
# (the LookupError message names the exact one) - confirm for the pinned version.
for resource in (
    'stopwords',
    'punkt',
    'averaged_perceptron_tagger',
    'wordnet',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(resource)


In [None]:
from nltk.corpus import wordnet as wn

In [None]:
# Look up the WordNet synsets for "program".
# NOTE(review): `syns` is not read by any other cell visible here - looks like a leftover smoke test.
syns = wordnet.synsets("program")

#### Keyword Extractor

In [None]:

# Define a function to extract keywords
def extract_keywords(text):
    """Return up to ten keywords for ``text``.

    Two sources of candidates are counted together and the ten most frequent
    are returned (most frequent first):

    * lemmas of the lower-cased, stop-word-free word tokens, and
    * named entities found by NLTK's ``ne_chunk`` on the original-cased text,
      each entity joined into a single space-separated string.

    Parameters
    ----------
    text : str
        The document to analyse.

    Returns
    -------
    list of str
        At most ten keywords; lemmas are lower-case, entity names keep the
        casing they have in ``text``.
    """
    # A set makes the per-token stop-word test O(1) instead of scanning a list.
    stopword_set = set(stopwords.words('english'))

    # Tokenize, then drop stop words and tokens that are pure punctuation
    # (otherwise "," and "." dominate the frequency count).
    tokens = [
        token
        for token in word_tokenize(text.lower())
        if token not in stopword_set and any(ch.isalnum() for ch in token)
    ]

    # Tag the tokens with their part of speech
    tagged_tokens = pos_tag(tokens)

    def get_wordnet_pos(treebank_tag):
        """Map a Penn Treebank tag to the corresponding WordNet POS constant."""
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        if treebank_tag.startswith('V'):
            return wordnet.VERB
        if treebank_tag.startswith('R'):
            return wordnet.ADV
        # Nouns and every other tag: fall back to NOUN (the lemmatizer's own
        # default). An empty string is not a WordNet POS and must not be passed on.
        return wordnet.NOUN

    # Lemmatize with the class imported from nltk.stem; the `wordnet` corpus
    # reader has no WordNetLemmatizer attribute.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    ]

    # Named entity recognition runs on the original casing of the text.
    entity_tree = ne_chunk(pos_tag(word_tokenize(text)))

    # Entity chunks are subtrees (they have a label); plain tokens are tuples.
    # Join each entity's words so the counter sees strings, not (word, tag) tuples.
    entity_names = [
        ' '.join(word for word, _tag in chunk.leaves())
        for chunk in entity_tree
        if hasattr(chunk, 'label')
    ]

    # Extract the most common lemmas and named entities
    frequencies = Counter(lemmatized_tokens + entity_names)
    keywords = [term for term, _count in frequencies.most_common(10)]

    return keywords

#### Implementation

In [None]:
#download using nltk.download('punkt') if you get an nltk error
# Each story is passed to the extractor as a whole string. Indexing the story
# with its position in the corpus would hand over a single character (or raise
# IndexError for short stories).
all_keywords = [extract_keywords(story) for story in corpus]

### TF-IDF

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import nltk
# Fetch NLTK's stop-word corpus (needed for stopwords.words below).
nltk.download('stopwords')

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Get the stop words for English (this is the set preprocess_text filters with)
spacy_stop_words = nlp.Defaults.stop_words

# NLTK's English stop words as a set.
# NOTE(review): not referenced by any cell visible here - preprocess_text uses spacy_stop_words only.
nltk_stop_words = set(stopwords.words('english'))

# Function to preprocess the input corpus
def preprocess_text(text):
    """Tokenize one document and keep only its content-bearing word tokens.

    The text is lower-cased and split with NLTK's ``word_tokenize``. A token
    survives only if it is purely alphabetic (so punctuation, numbers and
    mixed tokens are dropped) and is not in ``spacy_stop_words``.

    Returns the surviving tokens as a list, in document order.
    """
    kept = []
    for word in word_tokenize(text.lower()):
        if not word.isalpha():
            continue  # non-alphabetic token
        if word in spacy_stop_words:
            continue  # stop word
        kept.append(word)
    return kept

# One token list per document, in the same order as `corpus`.
cleaned_corpus = [preprocess_text(doc) for doc in corpus]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize using TfidfVectorizer, which combines counting and normalized weighting.
# The input is `cleaned_corpus` (already tokenized, lower-cased and stripped of
# stop words by preprocess_text), so the analyzer simply passes each token list
# through. Fitting on the raw `corpus` would throw that preprocessing away.
vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
tfidf_matrix = vectorizer.fit_transform(cleaned_corpus)

# Vocabulary terms, positioned to match the columns of tfidf_matrix.
feature_names = vectorizer.get_feature_names_out()

In [None]:
import numpy as np

def get_top_keywords(tfidf_scores, feature_names, top_n):
    """Return the ``top_n`` feature names with the highest scores.

    ``tfidf_scores`` holds one score per entry of ``feature_names`` (same
    order). The result is ordered from highest to lowest score; if fewer than
    ``top_n`` features exist, all of them are returned.
    """
    # argsort is ascending, so flip it to walk from the best score downwards.
    descending = np.flip(np.argsort(tfidf_scores), axis=0)
    best_indices = descending[:top_n]
    return [feature_names[position] for position in best_indices]

# Report the ten highest-weighted terms of every document (numbered from 1).
for i in range(len(corpus)):
    # Row i of the sparse matrix, densified into a flat score vector.
    tfidf_scores = tfidf_matrix[i].toarray()[0]
    top_keywords = get_top_keywords(tfidf_scores, feature_names, top_n=10)
    print(f"Document {i+1} top keywords: {top_keywords}")


##### Experimental 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import csv

# Stories whose similarity to the keyword query exceeds this value are printed.
# NOTE(review): 0.1 is a placeholder starting point - tune it on real data.
SIMILARITY_THRESHOLD = 0.1

# Define the keywords and phrases that are likely to appear in relevant stories
relevant_keywords = ['low socio-economic background', 'higher education', 'difficulties', 'overcome']

# Define the path to the file containing the stories
file_path = 'path/to/file.csv'

# Read in the stories from the file (newline='' as recommended for the csv module)
with open(file_path, 'r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    stories = [row['story'] for row in csv_reader]

# Define a vectorizer that will convert the stories into a matrix of TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)

# Convert the stories into a matrix of TF-IDF features
story_matrix = vectorizer.fit_transform(stories)

# Project the keywords into the same TF-IDF space as the stories. Joining them
# into one query document lets the vectorizer tokenize multi-word phrases;
# looking phrases up in vocabulary_ directly returns None for them.
keyword_vec = vectorizer.transform([' '.join(relevant_keywords)])

# One cosine similarity per story: (n_stories, n_terms) against (1, n_terms).
similarities = cosine_similarity(story_matrix, keyword_vec).ravel()

# Loop through the stories and print those similar enough to the keyword query
for i, story in enumerate(stories):
    similarity = similarities[i]

    # If the similarity is above the threshold, print the story
    if similarity > SIMILARITY_THRESHOLD:
        print(story)


### Rake_NLTK

**Use "rake_nltk.txt" to install all the necessary packages** (```pip install -r rake_nltk.txt```).

In [None]:
from rake_nltk import Rake

# RAKE keyword extractor; min_length=1 is passed explicitly.
r = Rake(min_length = 1)

# Only the first document of the corpus is analysed here.
r.extract_keywords_from_text(corpus[0])
# Last expression of the cell: displays the ranked phrases together with their scores.
r.get_ranked_phrases_with_scores()

### KeyBERT & KeyphraseVectorizers (best)

**Use "KeyBert_req.txt" to install all the necessary packages** (```pip install -r KeyBert_req.txt```).
However, before you install anything, make sure to fulfill the requirements below:
* Make sure you have installed the following: [CUDA (especially nvcc)](https://nvidia.github.io/cuda-python/install.html), [spaCy](https://spacy.io/usage#quickstart), and [Visual C++ > 2017 and the Windows SDK for C++](https://visualstudio.microsoft.com/visual-cpp-build-tools/). The links above lead to the installation instructions for each of these, in case installing the requirements with pip doesn't work. Visual C++ and the Windows SDK for C++ need to be installed manually. <u><span style="background-color: #f70000">**Make sure to use Python <= 3.9.**</span></u>
    
      
    

In [None]:
from keyphrase_vectorizers import KeyphraseCountVectorizer
from keybert import KeyBERT
from paths import DATA 

#init model
kw_model = KeyBERT(model='all-MiniLM-L6-v2')
# The corpus is the "File" entry of the JSON document at DATA.
# NOTE(review): earlier cells (NER, TF-IDF, Rake) already read `corpus`, but this
# is the first assignment visible in the notebook - on a fresh kernel this load
# has to run before them.
corpus = load_data(DATA)["File"]

In [None]:
#vectorizer based
# Candidate phrases come from KeyphraseCountVectorizer; up to 10 results per
# document are requested.
# NOTE(review): use_maxsum and use_mmr are both enabled - confirm in the KeyBERT
# docs which of the two diversification strategies actually takes effect.
keywords = kw_model.extract_keywords(
        docs=corpus, 
        vectorizer = KeyphraseCountVectorizer(spacy_pipeline='en_core_web_sm'), #passing vectorizer in, don't use keyphrase_ngram_range
        use_maxsum=True, 
        use_mmr=True,
        nr_candidates = 20,
        top_n = 10,
        diversity=0.5
        )

# Display the result as the cell output.
keywords

In [None]:
#ngram_range
# Same extraction as above, but candidates are 1- to 3-word n-grams with
# English stop words removed instead of vectorizer-derived keyphrases.
# NOTE(review): use_maxsum and use_mmr are both enabled here as well - confirm
# which one KeyBERT applies.
keywords_2 = kw_model.extract_keywords(
        docs=corpus, 
        keyphrase_ngram_range = (1,3),
        use_maxsum=True, 
        use_mmr=True,
        stop_words ='english', 
        nr_candidates = 20,
        top_n = 10,
        diversity=0.5
        )

# Display the result as the cell output.
keywords_2

In [None]:
import numpy as np
import pandas as pd

# Flatten the per-document results into flat arrays holding only the first
# element of every result tuple (the phrase itself).
keyphrases = np.array([pair[0] for doc_result in keywords for pair in doc_result])
keyphrases_2 = np.array([pair[0] for doc_result in keywords_2 for pair in doc_result])

# Put the two arrays side by side as the columns of one 2-d array.
# NOTE(review): this requires both extraction runs to have produced the same
# number of phrases in total; otherwise a ValueError is raised.
keyphrase_arr = np.column_stack((keyphrases, keyphrases_2))

# Table of keyphrases, persisted for the clustering step.
df = pd.DataFrame(keyphrase_arr, columns = ['verctorized_keyphrases', 'ngram_range_keyphrases'])
df.to_csv('keyphrases.csv', index = False)

## Data Prep

### Clustering

In [None]:
from sentence_transformers import SentenceTransformer
import warnings
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Reload the keyphrases persisted by the extraction step; only the n-gram column is clustered.
df = pd.read_csv('keyphrases.csv')
keyphrases = df['ngram_range_keyphrases'].to_numpy()

#embed keyphrases
# Ask for a NumPy array rather than a torch tensor: the normalization below and
# the scikit-learn clustering models operate on NumPy data, and a tensor living
# on the GPU cannot be converted implicitly.
corpus_embeddings = embedder.encode(keyphrases, convert_to_numpy=True)

#normalization (unit length per row)
corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

#### K-Means

In [None]:
from sklearn.cluster import KMeans

# Fixed seed so the cluster assignments (and everything exported from them)
# are reproducible across kernel restarts.
KMEANS_SEED = 42

# One label array per cluster count, ordered like the sweep below (5, 6, ..., 15).
kmeans_arr  = []

for n_clusters in range(5, 16): 
    clustering_model = KMeans(n_clusters=n_clusters, random_state=KMEANS_SEED)
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_
    kmeans_arr.append(np.array(cluster_assignment))

In [None]:
# One list of empty buckets per K-Means run: run 0 has 5 buckets, ..., run 10
# has 15, mirroring the range(5, 16) sweep that filled `kmeans_arr`.
clustered_sentences = [[[] for _ in range(k)] for k in range(5, 16)]

# `run_index` is the position of the run in kmeans_arr, not a cluster count, so
# it gets its own name instead of overwriting the notebook-level `n_clusters`.
for run_index, labels in enumerate(kmeans_arr):
    for phrase_id, cluster_id in enumerate(labels):
        clustered_sentences[run_index][cluster_id].append(keyphrases[phrase_id])



In [None]:
#export as json
import json
# clustered_sentences is ordered like the range(5, 16) sweep, so position 10 is
# the run with 15 clusters. The Search section reads 'clustered_sentences_10.json'.
index = 10
with open(f'clustered_sentences_{index}.json', 'w') as f:

    # Written as a list of clusters, each cluster a list of keyphrases.
    json.dump(clustered_sentences[index], f, indent=4, sort_keys=True)

#### Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering


# With n_clusters=None the number of clusters is not fixed up front; it follows
# from distance_threshold.
# NOTE(review): 1.5 looks hand-picked - confirm how it was chosen.
clustering_model_AGC= AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
clustering_model_AGC.fit(corpus_embeddings)
# One cluster label per keyphrase, in the order of `keyphrases`.
cluster_assignment_AGC= clustering_model_AGC.labels_
# Display the labels as the cell output.
cluster_assignment_AGC

In [None]:
# Group the keyphrases by their agglomerative cluster id.
# The grouping is written into clustered_sentences_AGC; writing into
# `clustered_sentences` would overwrite the K-Means results instead.
clustered_sentences_AGC = {}
for phrase_id, cluster_id in enumerate(cluster_assignment_AGC):
    # Plain int keys (rather than NumPy integers) keep the dict JSON-serializable.
    clustered_sentences_AGC.setdefault(int(cluster_id), []).append(keyphrases[phrase_id])

#### Fast Clustering

### Search

In [None]:
!pip install cohere umap-learn altair annoy datasets tqdm

In [None]:
#instantiate cohere and import libraries
import re
import umap
import json
import cohere
import pandas as pd 
import altair as alt
from tqdm import tqdm
from annoy import AnnoyIndex
from keys import COHERE_APIKEY
from sklearn.metrics.pairwise import cosine_similarity

# Cohere client; the API key comes from the local `keys` module rather than being hardcoded here.
co = cohere.Client(COHERE_APIKEY)

import warnings
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Show full cell text when displaying DataFrames (no truncation of long columns).
pd.set_option('display.max_colwidth', None)

In [None]:
# Narratives to search through; narrative_dict maps each id to its text.
df = pd.read_csv('sample_narratives.csv')
narrative_dict = dict(zip(df['id'], df['selftext']))

# Clusters exported by the K-Means section: a list of clusters, each a list of
# keyphrases. The context manager closes the file handle deterministically.
with open("clustered_sentences_10.json") as clusters_file:
    queries = list(json.load(clusters_file))

In [None]:
# Number of complete groups of 100 rows in df (shown as the cell output).
# NOTE(review): presumably a sizing check for batched processing - confirm or remove.
df.shape[0] // 100

In [None]:
import os

# Embed every narrative in the `selftext` column (one vector per row of df).
embeds = co.embed(texts=df['selftext'].tolist(), model='embed-english-v2.0' ).embeddings

# build index
# The dimensionality is read from the first vector; no NumPy conversion of the
# whole embedding list is needed for that.
search_index = AnnoyIndex(len(embeds[0]), 'angular')

# Item ids are the row positions in df.
for index, embed_value in enumerate(embeds):
    search_index.add_item(index, embed_value)

search_index.build(10)

# Make sure the target directory exists before saving into it.
os.makedirs('search indexes', exist_ok=True)
search_index.save('search indexes/data_search_index.ann')

In [None]:
import os

# The embeddings depend only on df['selftext'], not on the query, so they are
# requested once instead of once per subquery (each request is an API call).
embeds = co.embed(texts=df['selftext'].tolist(), model='embed-english-v2.0' ).embeddings
embedding_dim = len(embeds[0])

# NOTE(review): `subquery` is never used - every saved index is built from the
# same narrative embeddings. If the intent was to search the narratives for
# each keyphrase, the subquery itself needs to be embedded and looked up in the
# index; confirm the intended behavior.
for index_q, query in enumerate(queries): 
    # Make sure the per-query output directory exists before saving into it.
    os.makedirs(f'searches/index_{index_q}', exist_ok=True)

    for index_sub_q, subquery in enumerate(query):
        # build index 
        search_index = AnnoyIndex(embedding_dim, 'angular')

        for index, embed_value in enumerate(embeds):
            search_index.add_item(index, embed_value)
        
        search_index.build(10)
        search_index.save(f'searches/index_{index_q}/search_index_{index_sub_q}.ann')
    

## Filtering

### Heuristic Filtering

In [None]:

# Stories that pass the keyword heuristic.
g_story = []
# Define the keywords and phrases that are likely to appear in relevant stories
relevant_keywords = ['low socio-economic background', 'higher education', 'difficulties', 'overcome']

# Define the keywords and phrases that are likely to appear in irrelevant stories
irrelevant_keywords = ['high-achieving', 'wealthy', 'privileged']


for story in corpus: 
    # Lower-case once per story instead of once per keyword.
    story_lower = story.lower()

    # Count how many of the relevant keywords occur in the story (substring match)
    relevant_count = sum(1 for keyword in relevant_keywords if keyword in story_lower)

    # Count how many of the irrelevant keywords occur in the story (substring match)
    irrelevant_count = sum(1 for keyword in irrelevant_keywords if keyword in story_lower)

    # Keep the story if it mentions a relevant keyword and no irrelevant one
    if relevant_count > 0 and irrelevant_count == 0:
        g_story.append(story)


### Text Classification

In [None]:
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score



In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): every item is expected to be a dict with 'text' and 'label'
# keys - confirm that `corpus` has this shape when this cell runs.
labeled_data = corpus

texts = [data['text'] for data in labeled_data]
labels = [data['label'] for data in labeled_data]

# Split the data into training and testing sets.
# Both sets must contain every label: splitting *by* label (all 'relevant'
# items for training, everything else for testing) trains a one-class model
# that can never predict the test labels. The split is stratified and seeded so
# class proportions are kept and the result is reproducible.
train_data, test_data, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Convert the text data into a bag-of-words representation
# (the vocabulary is learned from the training set only)
vectorizer = CountVectorizer(stop_words='english')
train_data_counts = vectorizer.fit_transform(train_data)
test_data_counts = vectorizer.transform(test_data)

In [None]:
# Fit a multinomial Naive Bayes model on the bag-of-words training matrix.
clf = MultinomialNB()
clf.fit(X=train_data_counts, y=train_labels)

# Score the model on the held-out test matrix and report the accuracy.
predictions = clf.predict(test_data_counts)
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
print(f"Accuracy: {accuracy}")

# Filter the relevant stories from the corpus
# (read as UTF-8, like load_data, instead of the platform default encoding)
with open('corpus.json', 'r', encoding='utf-8') as f:
    corpus = json.load(f)

relevant_stories = []
if corpus:
    # Vectorize and classify all stories in one batch, then compare each
    # predicted label individually rather than comparing a whole prediction
    # array with a string.
    corpus_counts = vectorizer.transform([story['text'] for story in corpus])
    corpus_predictions = clf.predict(corpus_counts)
    relevant_stories = [
        story
        for story, predicted_label in zip(corpus, corpus_predictions)
        if predicted_label == 'relevant'
    ]

print(f"Found {len(relevant_stories)} relevant stories.")
