In [22]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name())
print(torch.cuda.current_device())
import pandas as pd
import random, requests
from utils.preprocess import DataPreProcessor
import gensim.downloader as api
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

# Load data
eng_emails_raw = pd.read_pickle('Data/en_emails_raw.pkl')
# ch_emails_raw = pd.read_pickle('Data/ch_emails_raw.pkl')

True
NVIDIA GeForce RTX 2060
0


In [29]:
token = 'dasdasdda'
request_url = f"http://api.conceptnet.io/c/en/{token}"
retrieved_edges = requests.get(request_url).json()['edges']
retrieved_words = [node['end']['label'] for node in retrieved_edges]
retrieved_words = list(set(word for word in retrieved_words if word != token))
print(random.choice(retrieved_words))

IndexError: list index out of range

In [None]:
class ModelPreprocessor(DataPreProcessor):
    def __init__(self):
        self.word2vec_corpus = api.load('word2vec-google-news-300')
        super().__init__() # base class doesn't tokenize

    def tokenize(self,doc):
        """
        function to tokenize text into constituent words, using NLTK API.
        return list of tokens from input text: List[str]
        """
        text = doc["pipe_text"]
        tokens = word_tokenize(text)
        return tokens

    #function to perform search against Word2Vec
    def get_word2vec_embedding(self,token):
        try:
            token_embedding = self.word2vec_corpus[token]
            return token_embedding
        except KeyError:
            return None
    
    #function to perform search against WordNet
    def wordnet_replacement(self,token):
        """
        function to generate a synonym of input token using Wordnet
        input: token
        conditions: token must not have a Word2Vec embedding
        output: replacement token -> str, otherwise None
        """
        # token should not be found in word2vec
        NoneType = type(None)
        assert not isinstance(self.get_word2vec_embedding(token),NoneType)
        synonyms = []
        synsets = wn.synsets(token)
        if len(synsets) == 0: return []
        for s in synsets:
            lemma_names = s.lemma_names()
            for ln in lemma_names:
                ln = ln.lower().replace('_',' ')
                if (ln != token) and (ln not in synonyms):
                    synonyms.append(ln)
        return random.choice(synonyms) if synonyms else None
        
    #function to perform search against ConceptNet (requires API call using requests library)
    def conceptnet_replacement(self,token):
        """
        function to generate a replacement token from ConceptNet
        input: token
        conditions: token must not have a Word2Vec embedding, and has no synonym from WordNet
        output: replacement token -> str, otherwise None
        """
        # checks to ensure token does not have word2vec embedding and has no WordNet replacement
        NoneType = type(None)
        assert not isinstance(self.get_word2vec_embedding(token),NoneType), "token already has a Word2Vec embedding"
        assert not isinstance(self.wordnet_replacement(token),NoneType), "token can be replaced by WordNet"
        request_url = f"http://api.conceptnet.io/c/en/{token}" # API call to ConceptNet
        retrieved_edges = requests.get(request_url).json()['edges']
        retrieved_words = [node['end']['label'] for node in retrieved_edges] # collect similar words
        retrieved_words = list(set(word for word in retrieved_words if word != token)) # remove duplicates and existing token
        return random.choice(retrieved_words) if retrieved_words else None
        
    #TODO: function to generate random embedding element for missing word, refer to research paper
    def random_replacement(self,token):
        pass