In [1]:
import nltk
from nltk.corpus import wordnet

def generate_synonyms(word, num_synonyms):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
    synonyms = list(set(synonyms))  # Remove duplicates
    if len(synonyms) <= num_synonyms:
        return synonyms
    else:
        return synonyms[:num_synonyms]

In [2]:
from gensim.corpora import Dictionary

dictionary = Dictionary.load('corpus_dictionary')
print('Dictonary from corpus length: ', len(dictionary))
dictionary_manual = Dictionary.load('trimmed_dictionary')
print('Dictonary from bot length: ', len(dictionary_manual))

import json

with open('reference_sheet.json', 'r') as file: json_data = file.read()
reference_sheet = json.loads(json_data)
with open('reference_sheet_manual.json', 'r') as file: json_data = file.read()
reference_sheet_manual = json.loads(json_data)

Dictonary from corpus length:  105
Dictonary from bot length:  1416


In [4]:
n = 20

old_words = []
for token, idx in dictionary.token2id.items(): old_words = old_words + reference_sheet[token]
print(old_words[:10])
new_words = []
for word in old_words: new_words = new_words + generate_synonyms(word, n)
print(new_words[:10])

['absolutely', 'absolute', 'absolut', 'absolutly', 'absolution', 'action', 'actions', 'actually', 'actual', 'actuall']
['utterly', 'absolutely', 'dead', 'perfectly', 'downright', 'right-down', 'sheer', 'infrangible', 'absolute', 'rank']


In [5]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
import nltk
#nltk.download('wordnet')

new_reference_sheet = {}

stemmer = SnowballStemmer("english")

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            word = lemmatize_stemming(token)
            if word in new_reference_sheet:
                if token not in new_reference_sheet[word]: new_reference_sheet[word].append(token)
            else: new_reference_sheet[word] = [token]
            result.append(word)
    return result

gensim_words = [preprocess(word) for word in new_words]

from gensim.corpora import Dictionary

new_dictionary = gensim.corpora.Dictionary(gensim_words)  
new_dictionary.save('addition_dictionary')
new_dictionary = Dictionary.load('addition_dictionary')

In [6]:
import gensim.corpora as corpora
from collections import defaultdict

merged_dict = corpora.Dictionary()

merged_dict.merge_with(dictionary)
merged_dict.merge_with(new_dictionary)
merged_dict.merge_with(dictionary_manual)

merged_ref_dict = defaultdict(list)

for key, value in reference_sheet.items():
    merged_ref_dict[key].extend(value)

for key, value in new_reference_sheet.items():
    merged_ref_dict[key].extend(value)

for key, value in reference_sheet_manual.items():
    merged_ref_dict[key].extend(value)
    
for key, value in merged_ref_dict.items(): merged_ref_dict[key] = list(set(value))
merged_ref_dict = dict(merged_ref_dict)

In [8]:
def read_words_from_file(filename):
    word_list = []
    with open(filename, 'r') as file:
        for line in file:
            word = line.strip()
            word_list.append(word)
    return word_list

filename = 'words_to_remove.txt'
words_to_remove = read_words_from_file(filename)

print(words_to_remove, '\n')
print(len(merged_dict))

word_ids = [merged_dict.token2id[word] for word in words_to_remove if word in merged_dict.token2id]
merged_dict.filter_tokens(bad_ids=word_ids)
merged_dict.compactify()

print(len(merged_dict), '\n')

import json
json_data = json.dumps(merged_ref_dict)
with open('merged_reference_sheet.json', 'w') as file: file.write(json_data)
    
from gensim.corpora import Dictionary
merged_dict.save('merged_dictionary')   

['game', 'player', 'play', 'card', 'like', 'fun', 'time', 'mechan'] 

2371
2371 

