In [1]:
import os
import json
import re
import nltk
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
# nltk.download('averaged_perceptron_tagger')

# loading corpus and WordNet dictionary
aliceTXTfile = 'C:\\Users\\apoll\\Deep Learning\\data\\alice_in_wonderland.txt'
WordNetJSONfile = 'C:\\Users\\apoll\\OneDrive\\Documents\\GitHub\\dict-debias\\data\\dict_wn.json'

with open(aliceTXTfile, encoding='utf-8') as f:
    lines = f.readlines()
    alice_text = ''.join(lines[23:])
with open(WordNetJSONfile, 'r', encoding='utf-8') as file:
    wordnet_dict = json.load(file)

In [2]:
def rank_words_by_frequency(text, agg):
    # split text into clean words
    text = re.sub(r'[^a-z]', ' ', text.lower())
    words = text.split()
    # count and sort words by frequency and
    word_counts = Counter(words)
    sorted_words = sorted(word_counts.items(), key=lambda x: x[1])  
    total_frequency = sum(word_counts.values())
    selected_words = []
    cumulative_frequency = 0
    # add the least frequent words to the selected list until their cumulative frequency reaches 30%
    for word, frequency in sorted_words:
        selected_words.append(word)
        cumulative_frequency += frequency
        if cumulative_frequency >= agg * total_frequency:
            break
    print(f"Words marked as uncommon: {len(set(selected_words))} ({round(len(set(selected_words))/len(set(words))*100,1)}% of total {len(set(words))} words)")
    return selected_words

# marking words from alice book with cummulative frequence of 30% as uncommon and adding to list
uncommon_words = rank_words_by_frequency(alice_text, 0.3)

Words marked as uncommon: 2395 (93.2% of total 2569 words)


In [3]:
stop_words = set(nltk.corpus.stopwords.words('english'))
def is_content_word(word):
    pos_tags = nltk.tag.pos_tag([word])
    if pos_tags:
        tag = pos_tags[0][1]
        return tag.startswith('NN') or tag.startswith('VB') or tag.startswith('JJ') or tag.startswith('RB')
    return False
filteredWordNet = {}

# filtering WordNet dictionary to only keep words that are nouns, adjectives, verbs or adverbs and are not stopwords
# and are more than 2 letters long and only contain letters for illustrative simplicity
for word, gloss in wordnet_dict.items():
    if all(c.isalpha() for c in word) and len(word) > 2 and word not in stop_words and is_content_word(word):
        #removing parentheses in glosses
        cg = [[s for s in inner_list if '(' not in s and ')' not in s] for inner_list in gloss] 
        filteredWordNet[word] = cg

print(f'{len(filteredWordNet)} ({round(len(filteredWordNet) / len(wordnet_dict) * 100, 1)}% of original dictionary entries kept)')

77103 (52.3% of original dictionary entries kept)


In [4]:
# furter filtering dictinary into its intersection with the uncommon words from our small corpus
filteredUncommonWordNet = {key: value for key, value in filteredWordNet.items() if key in uncommon_words}
print(f'{len(filteredUncommonWordNet)} ({round(len(filteredUncommonWordNet) / len(filteredWordNet) * 100, 1)}% of filtered dictionary entries)')

1749 (2.3% of filtered dictionary entries)


In [5]:
def process_text(text, maxLen):
    # convert text to lowercase and replace non-letter characters including newlines with a space
    text = re.sub(r'\n+|[^a-z\s.,?!:;\(\)]', ' ', text.lower())
    # tokenize text into sentences
    sentences = nltk.sent_tokenize(text)
    # remove empty, too long or too short sentences for demonstration simplicity
    sentences = [sentence.strip() for sentence in sentences if len(sentence.strip().split()) < maxLen and len(sentence.strip()) > 3]
    return sentences

maxGlossLen = max(len(value) for values in filteredUncommonWordNet.values() for value in values)                                                                                                                                      
sentences = process_text(alice_text, maxGlossLen)
print("Total sentences:", len(sentences))

Total sentences: 1616


In [6]:
%%time
def find_most_similar_sentence(main_sentence, sentences_to_compare):
    # load a pre-trained BERT model for sentence embeddings
    model = SentenceTransformer('stsb-roberta-large')
    # compute embeddings for the main sentence and sentences to compare
    main_sentence_embedding = model.encode([main_sentence])[0]
    sentences_to_compare_embeddings = model.encode(sentences_to_compare)
    # calculate cosine similarity between the main sentence embedding and each sentence to compare embedding
    similarity_scores = cosine_similarity([main_sentence_embedding], sentences_to_compare_embeddings)[0]
    # find the index of the most similar sentence
    most_similar_index = similarity_scores.argmax()
    # return the most similar sentence and its similarity score
    most_similar_sentence = sentences_to_compare[most_similar_index]
    similarity_score = similarity_scores[most_similar_index]
    
    return most_similar_sentence, similarity_score

# matching corpus sentences to dictionary definitions by similarity 
results = []
for sentence in sentences:
    words = sentence.split()
    for word in set(words):
        word = ''.join(filter(str.isalpha, word))
        if word in filteredUncommonWordNet and any(len(words) == len(gloss) for gloss in filteredUncommonWordNet[word]):
            glosses = [' '.join(sen) for sen in filteredUncommonWordNet[word]]
            bestGloss, bestScore = find_most_similar_sentence(sentence, glosses)
            results.append((word, sentence, bestScore, bestGloss))
print("Total uncommon words in corpus that would be enhanced with dictionary definitions:", len(results))

  return self.fget.__get__(instance, owner)()


Total uncommon words in corpus that would be enhanced with dictionary definitions: 633
CPU times: total: 40min 57s
Wall time: 23min 4s


In [7]:
# sort results by similarity score (in descending order)
sortedResults = sorted(set(results), key=lambda x: x[2], reverse=True)

# print results by most similar sentence correspondence
for n, result in enumerate(sortedResults):
    print(n+1, " Word:", result[0], "in sentence:", result[1])
    print("   Similarity score:", result[2], "with most similar gloss:", result[3])

1  Word: riddle in sentence: have you guessed the riddle yet?
   Similarity score: 0.8892132 with most similar gloss: explain a riddle
2  Word: mad in sentence: you re mad.
   Similarity score: 0.8269786 with most similar gloss: roused to anger
3  Word: silence in sentence: silence all round, if you please!
   Similarity score: 0.8001489 with most similar gloss: the state of being silent as when no one is speaking
4  Word: mean in sentence: don t you mean  purpose ?
   Similarity score: 0.79305077 with most similar gloss: have in mind as a purpose
5  Word: purple in sentence: said the queen, turning purple.
   Similarity score: 0.7746612 with most similar gloss: become purple
6  Word: mad in sentence: i m mad.
   Similarity score: 0.7684982 with most similar gloss: roused to anger
7  Word: business in sentence: yes, it  is  his business!
   Similarity score: 0.7675899 with most similar gloss: business concerns collectively
8  Word: cards in sentence: you re nothing but a pack of cards!