### Import

In [None]:
import gzip
import json
import random
import time
from nltk import word_tokenize
from tqdm import tqdm
import contractions

# Imports for the synonym dictionary 
import requests
from bs4 import BeautifulSoup
from nltk.wsd import lesk

#wordnet imports
#importing NLTK corpus synonym dictionary, pos-tagging and checking the outputs
import nltk
from nltk.corpus import wordnet
from nltk import pos_tag
import random


#Word2Vec imports
import gensim.models

In [None]:
# Fetch the NLTK data packages this notebook uses. The recorded cell output
# shows download() reporting already-present packages as up-to-date.
nltk.download('wordnet')  # WordNet corpus (imported above as nltk.corpus.wordnet)
nltk.download('omw-1.4')  # NOTE(review): presumably the multilingual WordNet data the WordNet reader expects - confirm
nltk.download('averaged_perceptron_tagger')  # NOTE(review): presumably the model behind pos_tag - confirm
nltk.download('punkt')  # NOTE(review): presumably the tokenizer model behind word_tokenize - confirm

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
random.seed(42)

### Read in the data

In [None]:
#here
def reading_in(path):
    """Load a gzip-compressed JSON-lines file.

    Parameters
    ----------
    path : str or path-like
        Location of a ``.json.gz`` file holding one JSON document per line.

    Returns
    -------
    list
        The decoded JSON documents in file order (dicts for the review data
        read in this notebook).

    Raises
    ------
    OSError
        If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    # The context manager closes the gzip handle even if a line fails to parse.
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank/trailing lines instead of crashing
            # json.loads already returns a fresh object, so no copy is needed.
            records.append(json.loads(line))
    return records

In [None]:
train = reading_in('../data/classification/music_reviews_train.json.gz')
#dev = reading_in('../data/classification/music_reviews_dev.json.gz') # do not use!
#test = reading_in('../data/classification/music_reviews_test_masked.json.gz') #do not use!

In [None]:
# Keep only reviews that carry both a text and a label, filtering once so the
# two lists below are aligned index by index.
labelled_reviews = [
    review for review in train
    if "reviewText" in review and "sentiment" in review
]
train_x = [review['reviewText'] for review in labelled_reviews]
train_y = [review['sentiment'] for review in labelled_reviews]

### Preparing the data

In [None]:
#old tokenizer
#train_x_back = [' '.join(sen) for sen in train_x_split]  # may not be needed: converts each tokenised sentence back into a single string

In [None]:
#Tokenizer
def tokenize_corpus(corpus):
    """Tokenise every text in ``corpus``.

    Each text is first passed through ``contractions.fix`` and the result is
    split with NLTK's ``word_tokenize``.

    Returns a list with one token list per input text, in input order.
    """
    tokenised = []
    for text in corpus:
        expanded = contractions.fix(text)
        tokenised.append(word_tokenize(expanded))
    return tokenised

In [None]:
train_x_split = tokenize_corpus(train_x)

# Webscrape synonyms

In [None]:
##Alternative to PyDictionary
##THIS IS COPIED FROM SLACK
# https://stackoverflow.com/questions/52910297/pydictionary-word-has-no-synonyms-in-the-api

def synonyms(term, timeout=10):
    """Scrape thesaurus.com for synonyms of ``term``.

    Parameters
    ----------
    term : str
        Word to look up. It is percent-encoded before being placed in the URL
        path, so tokens such as '?', '#' or '/' cannot alter the request.
    timeout : float, optional
        Seconds handed to ``requests.get`` so a stalled connection cannot hang
        a long scraping run.

    Returns
    -------
    list of str
        Text of every matching synonym link, with surrounding whitespace
        removed. Empty when ``term`` is blank or the page has no matching
        links.

    Notes
    -----
    Network errors raised by ``requests.get`` are not caught here.
    NOTE(review): the CSS class names look auto-generated by the site and may
    change, in which case this silently returns an empty list - confirm.
    """
    # Local import keeps this cell self-contained; only the stdlib is involved.
    from urllib.parse import quote

    text = str(term)
    if not text.strip():
        return []  # nothing to look up; avoid a pointless request

    url = 'https://www.thesaurus.com/browse/{}'.format(quote(text, safe=''))
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    # 'css-1gyuw4i eh475bn0' for less relevant synonyms
    links = soup.find_all('a', {'class': 'css-1kg1yv8 eh475bn0'})
    # The link text carries trailing spaces (e.g. 'he '); strip them so they do
    # not leak into the generated sentences.
    return [link.get_text().strip() for link in links]

word = "I"
print(synonyms(word))

['he ', 'it ', 'she ', 'sie ', 'they ', 've ', 'xe ', 'you ', 'ze ', 'zie ']


In [None]:
def gen_synth_web(sentence, prob=0.5):
    """Build a synthetic copy of ``sentence`` using scraped synonyms.

    For every token one random number is drawn; when it is below ``prob`` the
    token is looked up with ``synonyms`` and, if any synonyms come back, is
    replaced by a randomly chosen one. All other tokens are kept unchanged.

    Returns a new list of the same length as ``sentence``.
    """
    augmented = []
    for token in sentence:
        # The draw happens before any lookup, exactly once per token.
        if random.random() >= prob:
            augmented.append(token)
            continue
        candidates = synonyms(token)
        replacement = random.choice(candidates) if len(candidates) != 0 else token
        augmented.append(replacement)
    return augmented

In [None]:
#for i in tqdm(train_x_split[:1000]):  # was run on the first 1000 sentences; do not uncomment - the file is opened in append mode, so re-running adds duplicate lines
#    with open ("../data/synthetic1.txt","a") as f:
#        f.write(' '.join(gen_synth_web(i))+"\n")
#        time.sleep(5)

#### Example of webscrape synthetic sentence 

In [None]:
print(train_x_split[6])

['I', 'love', 'all', 'of', 'his', 'music', '!', '!']


In [None]:
print(gen_synth_web(train_x_split[6]))

['I', 'passion ', 'fully ', 'of', 'his', 'music', '!', '!']


# Wordnet

In [None]:
relevant = ["NN","NNS","RB","RBR","RBS","VB","VBG","VBD","VBN","VBP","VBZ","JJ","JJR","JJS"]#The wordclasses that can be exchanged with synonyms (not functional words) #maybe not RB
wordnet_tag_map = {'NN': ['n'],'NNS': ['n'],'JJ': ['a'],'JJR': ['a'],'JJS': ['a'],'RB': ['r'],'RBR': ['r'],'RBS': ['r'], 'VB': ['v'], 'VBG': ['v'], 'VBD': ['v'], 'VBN': ['v'], 'VBP': ['v'], 'VBZ': ['v']} #Making pos-tags coincide
aux_verbs = ['am','are','be','been','being','did','do','does','had','has','have','having','is','was','were','will'] #Used to avoid changing function words
def change_wordnet(sentence):
    """Replace content words of a tokenised sentence with WordNet lemmas.

    The sentence is POS-tagged with ``nltk.pos_tag``. Every token whose tag is
    in ``relevant`` and which is not listed in ``aux_verbs`` is disambiguated
    with ``lesk`` (restricted to the WordNet POS from ``wordnet_tag_map``) and
    replaced by a randomly chosen lemma name of the selected synset. Tokens
    for which ``lesk`` returns ``None`` and all other tokens are kept as-is.

    Parameters
    ----------
    sentence : iterable of str
        The tokens of one sentence.

    Returns
    -------
    list of str
        A new token list of the same length. Multi-word lemma names keep
        WordNet's underscores (e.g. 'forever_and_a_day'), and the random
        choice may return the original word.
    """
    tokens = list(sentence)
    if not tokens:
        return []  # nothing to tag; skips loading the tagger for empty input
    tagged = nltk.pos_tag(tokens)
    new_sentence = []  # to append to and then return as final sentence
    for token, tag in tagged:
        if tag in relevant and token not in aux_verbs:  # not changing functional words
            wn_pos = wordnet_tag_map[tag][0]
            # lesk scores senses by overlap between the context words and each
            # gloss, so the context must be the plain tokens. Passing the
            # (word, tag) tuples from pos_tag makes every overlap zero.
            chosen = lesk(tokens, token, wn_pos)
            if chosen is None:
                # sometimes no synset is chosen, so keep the original word
                new_sentence.append(token)
            else:
                new_sentence.append(random.choice(chosen.lemma_names()))
        else:
            new_sentence.append(token)
    return new_sentence

#### Example of wordnet synthetic sentence 

In [None]:
print(train_x_split[3])

['I', 'love', 'Dallas', 'Holms', 'music', 'and', 'voice', '!', 'Thank', 'You', '!', 'I', 'will', 'be', 'attending', 'all', 'his', 'concerts', 'in', 'heaven', ',', 'forever', '!']


In [None]:
print(change_wordnet(train_x_split[3]))

['I', 'know', 'Dallas', 'Holms', 'medicine', 'and', 'voice', '!', 'Thank', 'You', '!', 'I', 'will', 'be', 'attend', 'all', 'his', 'concert', 'in', 'Heaven', ',', 'forever_and_a_day', '!']


#### write to file

In [None]:
wn_x_train = [change_wordnet(i) for i in train_x_split]

In [None]:
# One sentence per line, tokens separated by tabs. The encoding is pinned to
# UTF-8 so non-ASCII characters in the reviews are written the same way on
# every platform instead of depending on the locale default.
with open("../data/synthetic/wordnet.csv",'w', encoding='utf-8') as f:
    f.writelines('\t'.join(s) + '\n' for s in wn_x_train)

# Word2vec

In [None]:
gooEmbs = gensim.models.KeyedVectors.load_word2vec_format('../data/GoogleNews-50k.bin', binary=True)
print('loading finished')

loading finished


In [None]:
gooEmbs.most_similar('run')

[('runs', 0.6569936275482178),
 ('running', 0.6062965989112854),
 ('drive', 0.4834049642086029),
 ('ran', 0.4764978289604187),
 ('scamper', 0.46932122111320496),
 ('go', 0.4631645083427429),
 ('walk', 0.45697975158691406),
 ('homerun', 0.45180249214172363),
 ('roundtripper', 0.43272683024406433),
 ('sacrifice_fly', 0.42050617933273315)]

In [None]:
relevant = ["NN","NNS","RB","RBR","RBS","VB","VBG","VBD","VBN","VBP","VBZ","JJ","JJR","JJS"]#The wordclasses that can be exchanged with synonyms (not functional words) #maybe not RB
wordnet_tag_map = {'NN': ['n'],'NNS': ['n'],'JJ': ['a'],'JJR': ['a'],'JJS': ['a'],'RB': ['r'],'RBR': ['r'],'RBS': ['r'], 'VB': ['v'], 'VBG': ['v'], 'VBD': ['v'], 'VBN': ['v'], 'VBP': ['v'], 'VBZ': ['v']} #Making pos-tags coincide
aux_verbs = ['am','are','be','been','being','did','do','does','had','has','have','having','is','was','were','will'] #Used to avoid changing function words
def change_word2vec(original_sentence, prob = 0.5):
    """Replace content words with their nearest word2vec neighbour.

    The sentence is POS-tagged with ``nltk.pos_tag``. Every token whose tag is
    in ``relevant`` and which is not listed in ``aux_verbs`` is replaced, with
    probability ``prob``, by the most similar word according to ``gooEmbs``.
    Tokens missing from the embedding vocabulary and all other tokens are kept
    unchanged.

    Parameters
    ----------
    original_sentence : str or iterable of str
        Either a raw sentence (tokenised here with ``word_tokenize``) or an
        already tokenised sentence.
    prob : float, optional
        Probability of replacing each eligible token; same meaning as the
        ``prob`` argument of ``gen_synth_web``.

    Returns
    -------
    list of str
        A new token list with one entry per input token.
    """
    # The notebook calls this with raw review strings, so tokenise those here;
    # tagging a bare string would tag it character by character.
    if isinstance(original_sentence, str):
        tokens = word_tokenize(original_sentence)
    else:
        tokens = list(original_sentence)
    if not tokens:
        return []  # nothing to tag; skips loading the tagger for empty input

    tagged = nltk.pos_tag(tokens)
    new_sentence = []  # to append to and then return as final sentence
    for token, tag in tagged:
        # The random draw is only made for eligible tokens (short-circuit).
        if tag in relevant and token not in aux_verbs and random.random() < prob:
            try:
                # Only the single best neighbour is used, so ask for just one.
                neighbour, _score = gooEmbs.most_similar(token, topn=1)[0]
            except KeyError:
                # token is not in the embedding vocabulary: keep it
                new_sentence.append(token)
            else:
                new_sentence.append(neighbour)
        else:
            new_sentence.append(token)
    return new_sentence

In [None]:
print(change_word2vec(train_x[1]))
len(train_x)

['This', 'tapes', 'can', 'scarcely', 'be', 'understood', 'and', 'it', 'was', 'listing', 'for', 'sold', 'as', '``', 'very', 'good', "''", '.', 'It', "'s", 'VERY', 'BAD', '.']


99946

In [None]:
#new_x = []
#for  i in train_x[:1000]:
    #new_x.append(change_word2vec(i))

#new_x

[['And',
  'creative',
  '!',
  'Love',
  'his',
  'music',
  '-',
  'the',
  'words',
  ',',
  'the',
  'message',
  '!',
  'Some',
  'of',
  'my',
  'favorite',
  'songs',
  'on',
  'this',
  'CD',
  '.',
  'I',
  'should',
  "'ve",
  'bought',
  'it',
  'years',
  'last',
  '!'],
 ['This',
  'tapes',
  'can',
  'hardly',
  'be',
  'understood',
  'and',
  'it',
  'is',
  'listing',
  'for',
  'sold',
  'as',
  '``',
  'extremely',
  'great',
  "''",
  '.',
  'It',
  "'s",
  'VERY',
  'BAD',
  '.'],
 ['Sell',
  'the',
  'CD',
  '.',
  'Do',
  'not',
  'sell',
  'the',
  'MP3',
  'albums',
  '.',
  'Download',
  'was',
  'no',
  'shorter',
  'availabe',
  '.',
  'But',
  'you',
  'do',
  "n't",
  'find',
  'that',
  'out',
  'until',
  'after',
  'you',
  "'ve",
  'bought',
  'it',
  '.'],
 ['I',
  'love',
  'Dallas',
  'Holms',
  'music',
  'and',
  'voices',
  '!',
  'Thank',
  'You',
  '!',
  'I',
  'will',
  'be',
  'attend',
  'all',
  'his',
  'concert',
  'in',
  'heaven',
  ',

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b88dfe01-c7e1-473c-bcfd-798313fc6522' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>