### Import Data

In [7]:
import json

# importing data if in json format
def load_json_data(file: str) -> dict:
    """Parse a UTF-8 encoded JSON file and return its contents.

    A message is printed and an empty dict is returned when the file does not
    exist or when its contents are not valid JSON. Any other error propagates.
    """
    try:
        with open(file, "r", encoding="utf-8") as handle:
            return json.loads(handle.read())
    except (FileNotFoundError, json.JSONDecodeError) as err:
        # The two failure modes share the fallback value but not the message.
        if isinstance(err, FileNotFoundError):
            print(f"File '{file}' not found.")
        else:
            print(f"Error decoding JSON data from file '{file}'.")
        return {}

# get all txt files from Data Dump folder #TODO: move this to a util file
import os
import numpy as np
import subprocess

def load_txt_data(dir: str) -> np.ndarray:
    """Read every ``.txt`` file located directly inside ``dir``.

    :param dir: path of the folder to scan (sub-folders are not visited).

    :return: array with the full text of each ``.txt`` file, ordered by file
        name. An empty array is returned when ``dir`` does not exist, is not a
        directory, or contains no ``.txt`` files.
    """
    # Return an array on every path: the original fell through and returned
    # None for a missing folder, which breaks callers that iterate the result.
    if not os.path.isdir(dir):
        return np.array([])

    data_points = []
    # os.listdir gives no ordering guarantee; sorting keeps runs reproducible.
    for file_name in sorted(os.listdir(dir)):
        if file_name.endswith(".txt"):
            with open(os.path.join(dir, file_name), "r", encoding="utf-8") as f:
                data_points.append(f.read())
    return np.array(data_points)

def get_recent_query_data(): # todo: erica
    """Clone the query repository and move ``file.txt`` out of it.

    NOTE: ``repo_url`` and ``file_path`` are still placeholder values.

    :raises subprocess.CalledProcessError: if ``git clone`` exits with a
        non-zero status (previously the failure was ignored and the move
        step ran against a missing clone).
    """
    # Local imports keep this block self-contained within the notebook.
    import shutil
    import tempfile

    # Define the repository URL and the file path
    repo_url = "https://github.com/username/repo"
    file_path = "/path/to/local/file.txt"

    # Clone into a fresh temporary directory: it cannot clash with a leftover
    # clone, and it is removed on exit even when a step below fails.
    with tempfile.TemporaryDirectory() as temp_root:
        clone_dir = os.path.join(temp_root, "repo")
        subprocess.run(["git", "clone", repo_url, clone_dir], check=True)

        # Move the file to the desired location (portable, unlike `mv`).
        shutil.move(os.path.join(clone_dir, "file.txt"), file_path)

## Data Prep

### Search

In [8]:
# Install the search dependencies if they are missing from the current environment.
!pip install --use-pep517 annoy
!pip install ipywidgets
# Outside a virtual environment, enable the widgets extension with:
#jupyter nbextension enable --py widgetsnbextension
# Inside a virtual environment, use the --sys-prefix form below instead.
!jupyter nbextension enable --py --sys-prefix widgetsnbextension

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (pyproject.toml) ... [?25ldone
[?25h  Created wheel for annoy: filename=annoy-1.17.3-cp39-cp39-linux_x86_64.whl size=75070 sha256=5f63e7a64ff7cef577f23c2cde7b3acfca43e7280fadca74a2a6603119557a46
  Stored in directory: /home/mkelbessa/.cache/pip/wheels/09/a9/54/37478e65995fe712f7da465749da9ddb21db6b1a599d591ac7
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3
Collecting ipywidgets
  Downloading ipywidgets-8.1.1

In [1]:
import json
import numpy as np
import pandas as pd 
from tqdm import tqdm
from annoy import AnnoyIndex
from functions.preprocessing import *
from multiprocessing import Pool, cpu_count
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

QA_MODEL_NAME = "multi-qa-distilbert-cos-v1"

# load model 
def load_model():
    """Create and return the SentenceTransformer named by ``QA_MODEL_NAME``."""
    return SentenceTransformer(QA_MODEL_NAME)

# there seems to be a bug with the model, so we need to reload it in case the initial encoding fails
model = load_model()

def test_encode(model):
    sentence = """
    Amidst the verdant tapestry of the ancient forest, where sunlight dapples the forest floor and the rustling leaves whisper secrets of generations past, a curious menagerie 
    of creatures, from the tiniest iridescent insects to the towering, majestic oak trees,coexists in a harmonious symphony of life, while the distant echoes of a 
    bubbling brook weave a soothing melody through the very heart of this enchanting wilderness.
    """
    assert np.all(model.encode(sentences=sentence) != 0)

# Check that the freshly loaded model can encode; on any failure report the
# cause (previously discarded) and reload the model once.
try:
    test_encode(model)
except Exception as e:
    print(f"Failed to encode sentence ({e!r}), reloading model")
    model = load_model()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MoRevolution\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
sentence = """
Amidst the verdant tapestry of the ancient forest, where sunlight dapples the forest floor and the rustling leaves whisper secrets of generations past, a curious menagerie 
of creatures, from the tiniest iridescent insects to the towering, majestic oak trees,coexists in a harmonious symphony of life, while the distant echoes of a 
bubbling brook weave a soothing melody through the very heart of this enchanting wilderness.
"""

# Sanity check: display the shape of the embedding produced for one sentence.
model.encode(sentences=sentence).shape

(768,)

In [23]:
# With `chunksize` set, pd.read_json returns a lazy chunk reader (an iterator
# of DataFrames) rather than a DataFrame; nothing is parsed until it is iterated.
temp_df = pd.read_json('F:/RS_2020-01.json', lines=True, chunksize=1000)

In [35]:
# NOTE(review): `df` is not assigned in any cell visible here — presumably one
# chunk pulled from `temp_df`; confirm, since this fails on a fresh kernel.
df["subreddit"]

3000             memes
3001     u_yeettime123
3002            europe
3003        AllenBWest
3004         blackcats
             ...      
3995            faputa
3996      WayOfTheBern
3997              Dabs
3998    gaypornhunters
3999          onewheel
Name: subreddit, Length: 1000, dtype: object

In [44]:
# Subreddits to concentrate on (rows with missing values are dropped).
# To download the list first, uncomment:
# !wget https://raw.githubusercontent.com/username/repo/main/subreddits.csv -O subreddits.csv

subreddits = pd.read_csv("subreddits.csv").dropna() 

# Load query data: one entry per .txt file found in the "Data Dump" folder.
query_data = load_txt_data("Data Dump")

# load bulk data
import pandas as pd

def process_large_json_pd(file_path, chunksize=100000, min_words=50, allowed_subreddits=None):
    """Stream a JSON-lines file in chunks and keep only the rows of interest.

    A row is kept when its ``selftext`` is not null, its ``subreddit`` is one
    of the allowed subreddits, and its ``selftext`` has more than
    ``min_words`` whitespace-separated words. ``selftext`` is lower-cased in
    the returned frame.

    :param file_path: path of the JSON-lines file to read.
    :param chunksize: number of lines parsed per chunk.
    :param min_words: rows need strictly more words than this to be kept.
    :param allowed_subreddits: iterable of subreddit names to keep. When
        omitted, the notebook-level ``subreddits['subreddit']`` column is used.

    :return: DataFrame with the surviving rows and a fresh 0..n-1 index.
    """
    if allowed_subreddits is None:
        # Fall back to the notebook-level frame, as the original always did.
        allowed_subreddits = subreddits['subreddit'].values
    else:
        allowed_subreddits = list(allowed_subreddits)

    kept_chunks = []
    columns_only = pd.DataFrame()  # returned when no row survives the filters

    for chunk in pd.read_json(file_path, lines=True, chunksize=chunksize):
        # drop rows without a narrative
        chunk = chunk.dropna(subset=['selftext'])

        # Skip the string filters on an empty chunk: an all-null `selftext`
        # column is not string-typed and would make the `.str` accessor raise.
        if not chunk.empty:
            chunk = chunk[chunk['subreddit'].isin(allowed_subreddits)]
            # keep narratives with more than `min_words` words
            chunk = chunk[chunk['selftext'].str.split().str.len().gt(min_words)]

        if chunk.empty:
            columns_only = chunk
            continue

        # make lowercase; `assign` returns a new frame, so the filtered slice
        # is never written to in place
        kept_chunks.append(chunk.assign(selftext=chunk['selftext'].str.lower()))

    if not kept_chunks:
        return columns_only.reset_index(drop=True)

    # One concat at the end instead of re-copying the accumulated frame for
    # every chunk.
    return pd.concat(kept_chunks, ignore_index=True)

from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, split, length
from pyspark import pandas as ps
import time

# NOTE(review): `process_large_json_spark` is not defined in any cell visible
# here — confirm it exists before running these two lines.
filtered_bulk_data = process_large_json_spark('F:/RS_2020-01.json')
filtered_bulk_data.write.csv('filtered_bulk_data_RS_2020_01.csv')

# Raw string: `\R` is not a valid escape sequence, so the plain literal only
# worked by accident and triggers a warning on current Python versions.
filtered_bulk_data = process_large_json_pd(r'F:\RS_2020-01.json')


KeyboardInterrupt: 

In [45]:
direct_df = pd.read_json('F:/RS_2020-01.json', lines=True)

ValueError: Unmatched ''"' when when decoding 'string'

##### Lemmatize

In [None]:
# Lemmatize each query text, preserving order.
tokenized_queries = [lemmatize(query) for query in oldnarrative_queries]

In [None]:
# Lemmatize every narrative in parallel worker processes.
# NOTE(review): `narrative_df` is not assigned in any cell visible here —
# confirm where it is loaded.

num_processes = 11

# The context manager tears the workers down even if lemmatization raises or
# the cell is interrupted; `map` blocks until every result is available and
# returns them in input order.
with Pool(processes=num_processes) as pool:
    results = pool.map(func=lemmatize,
                       iterable=narrative_df['selftext'],
                       chunksize=24370)

narrative_df["tokenized_selftext"] = np.array(results)
narrative_df.to_csv("Tokenized_filtered_2018-01.csv", index=False)

##### Create Search Queries and Embeddings

In [None]:
# Load the lemmatized narratives.
# Raw string: the backslashes in this Windows path would otherwise be read as
# (invalid) escape sequences such as `\R` and `\S`.
tokenized_narratives_df = pd.read_csv(r"D:\Reddit Dataset\Submissions\Decompressed\csv\RS_2018-01_lemmatized.csv")

In [None]:
tokenized_narratives_df.shape

In [None]:
# Keep only the narratives from the subreddits of interest and renumber the
# rows from 0.
in_scope = tokenized_narratives_df["subreddit"].isin(subreddits['subreddit'])
tokenized_narratives_df = tokenized_narratives_df[in_scope].reset_index(drop=True)

In [None]:
tokenized_narratives_df.shape

In [None]:
# Encode the lemmatized queries (normalized embeddings) and cache them on disk.
query_embeddings = model.encode(
    sentences=tokenized_queries,
    convert_to_numpy=True,
    show_progress_bar=True,
    normalize_embeddings=True,
)
np.save('old-narrative-queries_embeddings.npy', query_embeddings)

# Build a Manhattan-distance Annoy index holding one item per query embedding;
# the item id is the query's position in `query_embeddings`.
embedding_dim = query_embeddings.shape[1]
query_search_index = AnnoyIndex(embedding_dim, 'manhattan')
for item_id, vector in enumerate(query_embeddings):
    query_search_index.add_item(item_id, vector)

query_search_index.build(n_trees = 20, n_jobs = 3)
query_search_index.save('old-narrative-queries_search_index.ann')

Note: You can encode on either the CPU or the GPU (prefer the GPU if one is available, as it's faster), but run only one of the two cells below so the computation isn't done twice.

In [None]:
# embedding narratives: CPU version
# Pass a plain list of strings: `encode` indexes its input by integer
# position, which on a pandas Series only works while the index happens to be
# the default 0..n-1 range.
narratives_embedding = model.encode(sentences=tokenized_narratives_df["lemmatized_selftext"].tolist(),
                                    convert_to_numpy=True,
                                    show_progress_bar=True)

np.save('RS_2018-01_narrative_embeddings.npy', narratives_embedding)

In [None]:
# embedding narratives: GPU version
pool = model.start_multi_process_pool()

try:
    # A plain list of strings is passed rather than a pandas Series.
    narratives_embedding = model.encode_multi_process(
        sentences=tokenized_narratives_df["lemmatized_selftext"].tolist(),
        pool=pool)
finally:
    # Always stop the worker processes, even when encoding fails, so they do
    # not linger and hold on to their resources.
    model.stop_multi_process_pool(pool=pool)

np.save('RS_2018-01_narrative_embeddings.npy', narratives_embedding)

In [None]:
# Build a Manhattan-distance Annoy index over the narrative embeddings; each
# item id is the embedding's position in `narratives_embedding`.
embedding_dim = np.array(narratives_embedding).shape[1]
narratives_search_index = AnnoyIndex(embedding_dim, 'manhattan')

for position, vector in enumerate(tqdm(narratives_embedding)):
    narratives_search_index.add_item(position, vector)

narratives_search_index.build(n_trees = 20, n_jobs = -1)
narratives_search_index.save(f'RS_2018-01_narrative_search_index.ann')

##### Load embeddings and search indexes

In [None]:
# Narratives: reload the saved embeddings, then open the saved Annoy index
# with the matching vector size and metric.
narratives_embedding = np.load('RS_2018-01_narrative_embeddings.npy')
narratives_dim = np.array(narratives_embedding).shape[1]
narratives_search_index = AnnoyIndex(narratives_dim, 'manhattan')
narratives_search_index.load('RS_2018-01_narrative_search_index.ann')

# Old-narrative queries: same procedure for the query embeddings and index.
query_embeddings = np.load('old-narrative-queries_embeddings.npy')
query_dim = np.array(query_embeddings).shape[1]
query_search_index = AnnoyIndex(query_dim, 'manhattan')
query_search_index.load('old-narrative-queries_search_index.ann')

##### Simple Concatenating (for a single query)

In [None]:
# Retrieve the 100 nearest narratives for every query and merge them into one
# table, de-duplicated by text and ordered by distance.
#
# Annoy item ids are row positions of the frame the narrative embeddings were
# computed from. In this notebook that is `tokenized_narratives_df`, so rows
# are looked up there by position (the original read from `narrative_df`).
# NOTE(review): confirm the loaded index was built from this same frame.
results_list = []

# Wrapping the array (not the enumerate object) lets tqdm show a total.
for index, query_embedding in enumerate(tqdm(query_embeddings)):
    neighbor_ids, neighbor_distances = narratives_search_index.get_nns_by_vector(
        query_embedding, n=100, include_distances=True)

    neighbors = tokenized_narratives_df.iloc[neighbor_ids]

    result_df = pd.DataFrame(data={
        'selftext': neighbors['selftext'],
        'title': neighbors['title'],
        'ids': neighbors['id'],
        'distance': neighbor_distances
    })

    results_list.append(result_df)

results = pd.concat(results_list)
results = results.drop_duplicates(subset=['selftext'])
results = results.sort_values(by=['distance'], ascending=True)

results.to_csv("results_tokenized_full.csv", index=False)

##### Comparative Concatenating

In [None]:
def compare(id_to_distance_dictionary: dict, threshold: float) -> list:
    """
    Filters for stories that are shared by enough of the other queries.

    For every id in a query's result list, the fraction of the *other*
    queries whose result lists also contain that id is computed. The id is
    kept when that fraction is at least ``threshold``.

    :param id_to_distance_dictionary: maps a query key to a pair whose first
        element is a sequence of narrative ids and whose second element is the
        sequence of matching distances (same order, same length).
    :param threshold: minimum fraction (0.0 to 1.0) of the other queries that
        must also contain an id for it to be kept.

    return:
        list of ``(id, distance)`` tuples. An id that qualifies under several
        queries appears once per query, each time with that query's distance.
    """
    other_query_count = len(id_to_distance_dictionary) - 1

    # Pre-compute one membership set per query so each lookup is O(1) instead
    # of a linear scan of the id list.
    id_sets = {
        key: set(value[0]) for key, value in id_to_distance_dictionary.items()
    }

    filtered_id_distance_array = []

    for comparable_key, comparable_value in id_to_distance_dictionary.items():
        # array we are currently trying to filter
        comparable_id_array = comparable_value[0]
        comparable_distance_array = comparable_value[1]

        for comparable_id_index, comparable_id in enumerate(comparable_id_array):
            matches = sum(
                1
                for tocompare_key, tocompare_ids in id_sets.items()
                if tocompare_key != comparable_key and comparable_id in tocompare_ids
            )

            # A single division avoids the rounding drift of adding
            # 1/(n-1) repeatedly (ten additions of 0.1 give 0.9999999999999999,
            # which wrongly fails a threshold of 1.0). With no other query to
            # compare against the fraction is 0.
            percentage = matches / other_query_count if other_query_count > 0 else 0

            if percentage >= threshold:
                filtered_id_distance_array.append(
                    (comparable_id, comparable_distance_array[comparable_id_index],)
                )

    return filtered_id_distance_array

In [None]:
# Collect the 1000 nearest narratives (ids and distances) for every query,
# keyed as "query_<position>".
similar_dict = {}
for query_number, embedding in tqdm(enumerate(query_embeddings)):
    similar_dict[f"query_{query_number}"] = narratives_search_index.get_nns_by_vector(
        embedding,
        n=1000,
        include_distances=True,
    )

# Keep the (id, distance) pairs that pass `compare` with a threshold of 0.7.
filterd_arr = compare(similar_dict, threshold=0.7)  # filtered ids

In [None]:
# NOTE(review): `distance` is not assigned in any cell visible here — this
# looks like a leftover scratch expression; confirm before running.
(99999, distance)

In [None]:
tokenized_narratives_df.columns

In [None]:
# Look up every surviving (id, distance) pair in the narratives table and
# export the result. #todo: erika
#
# The frame is built in one step instead of row by row, so it is always
# defined: an empty `filterd_arr` yields an empty table with the expected
# columns rather than a NameError (or a stale frame from an earlier run).
result_ids = [pair[0] for pair in filterd_arr]
result_distances = [pair[1] for pair in filterd_arr]

filtered_results_df = (
    tokenized_narratives_df
    .loc[result_ids, ['selftext', 'title', 'subreddit', 'permalink', 'id']]
    .rename(columns={'id': 'ids'})
    .reset_index(drop=True)
)
# Distances are in the same order as `result_ids`, one per row.
filtered_results_df['distance'] = result_distances

filtered_results_df.to_csv("Filtered_Results_RS_2018-01.csv", index=False)

##### Results

In [None]:
# Reload the exported results and display them with a fresh 0..n-1 index.
# NOTE: reset_index returns a new frame that is only displayed here;
# `results_df` itself is left unchanged.
import pandas as pd 
results_df = pd.read_csv("Filtered_Results_RS_2018-01.csv")
results_df.reset_index(drop=True)

In [None]:
# Keep only the rows whose distance is below 20.
results_df = results_df[results_df["distance"] < 20.00]

In [None]:
# Display the rows ordered by distance. The sorted frame is not assigned, so
# `results_df` keeps its current order.
results_df.sort_values(by=["distance"])

In [None]:
from functions.document_writer import docx_writer
# NOTE(review): `HOME` is imported but not used in this cell — confirm it is
# still needed.
from paths import HOME

# Hand the filtered results to the project's docx writer.
docx_writer(num_people=5, dataframe= results_df)