In [2]:
import json
import os

def extract_keys_and_elements(json_data):
    """Collect every dict key and every leaf value of a nested JSON structure.

    The structure is walked depth-first. Dicts contribute their keys and are
    descended into, lists are descended into, and anything else is treated as
    a leaf value.

    Args:
        json_data: A value built from dicts, lists and leaf values, such as the
            result of ``json.load``.

    Returns:
        tuple: ``(keys, elements)`` where ``keys`` lists every dict key and
        ``elements`` lists every leaf value, both in depth-first order.
    """
    def walk(node):
        # Yields (is_key, payload) pairs in depth-first order.
        if isinstance(node, dict):
            for name, child in node.items():
                yield True, name
                yield from walk(child)
        elif isinstance(node, list):
            for child in node:
                yield from walk(child)
        else:
            yield False, node

    keys, elements = [], []
    for is_key, payload in walk(json_data):
        (keys if is_key else elements).append(payload)
    return keys, elements

# Each topic name is interpolated into the path TOPICS/TOPIC_<name>.json.
# NOTE(review): 'bookeeping' is left exactly as written because it is part of
# the file path that gets opened.
prefix = 'TOPIC_'
topic_list = ['luck', 'downtime', 'leader', 'bookeeping', 'complicated', 'complex', 'negative', 'extras']
word_list = []

for topic in topic_list:
    filename = os.path.join('TOPICS', prefix + topic + '.json')

    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    keys, elements = extract_keys_and_elements(json_data)
    # Extend in place instead of rebuilding the whole list on every topic.
    word_list.extend(keys)
    word_list.extend(elements)

In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

import numpy as np
# Seed NumPy's global random number generator.
RANDOM_SEED = 400
np.random.seed(RANDOM_SEED)

# Module-level NLTK lemmatizer and English Snowball stemmer instances.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then stem the lemma.

    Args:
        text: A single token (string) to normalise.

    Returns:
        The result of applying the module-level Snowball stemmer to the
        verb lemma of ``text``.
    """
    # Reuse the module-level lemmatizer rather than constructing a new
    # WordNetLemmatizer for every token.
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize ``text`` and return its normalised tokens.

    Tokens are produced by gensim's ``simple_preprocess``. Stopwords and
    tokens of two characters or fewer are discarded; each remaining token is
    passed through ``lemmatize_stemming``.

    Args:
        text: The raw text to tokenize.

    Returns:
        list: The normalised tokens, in the order they appear in ``text``.
    """
    return [
        lemmatize_stemming(tok)
        for tok in simple_preprocess(text)
        if len(tok) > 2 and tok not in STOPWORDS
    ]

# processed_docs: one token list per entry of word_list.
# reference_sheet: processed token -> the original entries that produced it.
processed_docs = []
reference_sheet = {}
for doc in word_list:
    processed_words = preprocess(doc)
    processed_docs.append(processed_words)
    for word in processed_words:
        reference_sheet.setdefault(word, []).append(doc)

# De-duplicate each entry list. dict.fromkeys keeps first-seen order, so the
# result is identical from run to run (list(set(...)) ordering depends on
# string hash randomisation).
reference_sheet = {key: list(dict.fromkeys(values)) for key, values in reference_sheet.items()}

print(len(processed_docs))
print(len(reference_sheet))

2308
1420


In [4]:
from gensim.corpora import Dictionary

# Build the gensim Dictionary from the preprocessed documents, write it to
# disk, and continue with the copy loaded back from that file.
DICTIONARY_PATH = 'dictionary'
dictionary = Dictionary(processed_docs)
dictionary.save(DICTIONARY_PATH)
dictionary = Dictionary.load(DICTIONARY_PATH)
# To inspect the mapping: for token, idx in dictionary.token2id.items(): print(token, idx)

In [5]:
def read_words_from_file(filename):
    """Read one word per line from a UTF-8 text file.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: The whitespace-stripped content of every non-blank line,
        in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding keeps the result independent of the platform locale.
    with open(filename, 'r', encoding='utf-8') as file:
        stripped = (line.strip() for line in file)
        # Blank lines (e.g. a trailing newline) would otherwise produce ''
        # entries, which are not words.
        return [word for word in stripped if word]

from gensim.corpora import Dictionary

# Load the list of words that should be dropped from the dictionary.
filename = 'words_to_remove.txt'
words_to_remove = read_words_from_file(filename)

print(words_to_remove, '\n')
print(len(dictionary))

# Map the words to their token ids, ignoring any word the dictionary does not
# contain, then remove those ids.
token_ids = dictionary.token2id
word_ids = [token_ids[word] for word in words_to_remove if word in token_ids]
dictionary.filter_tokens(bad_ids=word_ids)
dictionary.compactify()

print(len(dictionary), '\n')

# Write the trimmed dictionary to disk and continue with the reloaded copy.
TRIMMED_DICTIONARY_PATH = 'trimmed_dictionary'
dictionary.save(TRIMMED_DICTIONARY_PATH)
dictionary = Dictionary.load(TRIMMED_DICTIONARY_PATH)

['game', 'player', 'play', 'card', 'like'] 

1420
1416 



In [6]:
import json
# Serialise the token -> source-entries mapping and write it out as JSON.
REFERENCE_SHEET_PATH = 'reference_sheet_manual.json'
json_data = json.dumps(reference_sheet)
with open(REFERENCE_SHEET_PATH, 'w') as sheet_file:
    sheet_file.write(json_data)