### Import

In [1]:
import gzip
import json
import random
import time
from nltk import word_tokenize

# Imports for the synonym dictionary 
import requests
from bs4 import BeautifulSoup
from nltk.wsd import lesk


In [2]:
random.seed(42)

### Read in the data

In [3]:
#here
def reading_in(path):
    thing=[]
    for line in gzip.open(path):
        review_data = json.loads(line)
        subthing = dict()
        for key in review_data:
            subthing[key]= review_data[key]
        thing.append(subthing)
    return thing

In [4]:
train = reading_in('../data/classification/music_reviews_train.json.gz')
#dev = reading_in('../data/classification/music_reviews_dev.json.gz') # do not use!
#test = reading_in('../data/classification/music_reviews_test_masked.json.gz') #do not use!

In [5]:
train_x = [i['reviewText'] for i in train if "reviewText" in i.keys() and "sentiment" in i.keys()]
train_y =  [i['sentiment'] for i in train if "reviewText" in i.keys() and "sentiment" in i.keys()]

### Preparing the data

In [6]:
train_x_split = [sen.split(' ') for sen in train_x]

"""
Here we want a loop for replacing the words
"""

train_x_back = [' '.join(sen) for sen in train_x_split]

# Webscrape synonyms

In [7]:
##Alternative to PyDictionary
##THIS IS COPIED FROM SLACK
# https://stackoverflow.com/questions/52910297/pydictionary-word-has-no-synonyms-in-the-api

def synonyms(term):
    response = requests.get('https://www.thesaurus.com/browse/{}'.format(term))
    soup = BeautifulSoup(response.text, 'lxml')
    soup.find('section', {'class': 'css-17ofzyv e1ccqdb60'})
    return [span.text for span in soup.findAll('a', {'class': 'css-1kg1yv8 eh475bn0'})] # 'css-1gyuw4i eh475bn0' for less relevant synonyms

word = "I"
print(synonyms(word))

['he ', 'it ', 'she ', 'sie ', 'they ', 've ', 'xe ', 'you ', 'ze ', 'zie ']


In [8]:
def gen_synth(sentence,prob = 0.5):
    new_sen = []
    for word in sentence:
        threshold = random.random()
        if prob > threshold:
            #print(threshold) can remove later kept for debugging
            syn = synonyms(word)
            if len(syn) != 0:
                #print("selected", word) can remove later kept for debugging
                new_sen.append(random.choice(syn))
            else:new_sen.append(word)
        else:
            new_sen.append(word)
    return new_sen

In [9]:
len(train_x)

99946

In [10]:
from tqdm import tqdm

In [11]:
print([0,1,2,3,4,5][2:4])#indexing sanity check can delecte later

[2, 3]


In [12]:
#for i in tqdm(train_x_split[:1000]):#running with 0:1000 dont uncomment as will mess up file 
#    with open ("../data/synthetic1.txt","a") as f:
#        f.write(' '.join(gen_synth(i))+"\n")
#        time.sleep(5)

In [13]:
print(train_x_split[6])

['I', 'love', 'all', 'of', 'his', 'music!!']


In [14]:
print(gen_synth(train_x_split[6]))

['I', 'passion ', 'fully ', 'of', 'his', 'music!!']


# Wordnet

In [37]:
#importing NLTK corpus synonym dictionary, pos-tagging and checking the outputs
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
from nltk.corpus import wordnet
from nltk import pos_tag
import random
syns = wordnet.synsets("bad")
syns

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[Synset('bad.n.01'),
 Synset('bad.a.01'),
 Synset('bad.s.02'),
 Synset('bad.s.03'),
 Synset('bad.s.04'),
 Synset('regretful.a.01'),
 Synset('bad.s.06'),
 Synset('bad.s.07'),
 Synset('bad.s.08'),
 Synset('bad.s.09'),
 Synset('bad.s.10'),
 Synset('bad.s.11'),
 Synset('bad.s.12'),
 Synset('bad.s.13'),
 Synset('bad.s.14'),
 Synset('badly.r.05'),
 Synset('badly.r.06')]

In [46]:
relevant = ["NN","NNS","RB","RBR","RBS","VB","VBG","VBD","VBN","VBP","VBZ","JJ","JJR","JJS"]#The wordclasses that can be exchanged with synonyms (not functional words) #maybe not RB
wordnet_tag_map = {'NN': ['n'],'NNS': ['n'],'JJ': ['a'],'JJR': ['a'],'JJS': ['a'],'RB': ['r'],'RBR': ['r'],'RBS': ['r'], 'VB': ['v'], 'VBG': ['v'], 'VBD': ['v'], 'VBN': ['v'], 'VBP': ['v'], 'VBZ': ['v']} #Making pos-tags coincide
def change(original_sentence):
    sentence = word_tokenize(original_sentence)    #tokenizing
    sentence = nltk.pos_tag(sentence)    #pos-tagging sentence here
    new_sentence = [] #to append to and then return as final sentence
    for word in sentence:
        if word[1] in relevant:
            tag = wordnet_tag_map[word[1]]
            chosen = lesk(original_sentence, word[0], tag[0]) #from all synsets cchoose the most likely in a smart way, thanks to library!
            if chosen is None:
                new_sentence.append(word[0]) #sometimes no synset is chosen so we just give the new sentence the original word
            else:
                chosen_w = random.choice(chosen.lemma_names())
                new_sentence.append(chosen_w)
        else:
            new_sentence.append(word[0])
    return new_sentence

In [33]:
print(train_x[1])

This tape can hardly be understood and it was listed for sale as "very good".  It's VERY BAD.


In [47]:
print(change(train_x[1]))

['This', 'tape', 'can', 'hardly', 'personify', 'understood', 'and', 'it', 'embody', 'list', 'for', 'sales_agreement', 'as', '``', 'very', 'good', "''", '.', 'It', "'s", 'very', 'BAD', '.']


In [19]:
wn_x_train = [change(i) for i in train_x_split]


TypeError: expected string or bytes-like object

In [None]:
v = ['I', 'love', 'all', 'of', 'his', 'music!!']
for word in v:
    print(wordnet.synsets(word))

[Synset('iodine.n.01'), Synset('one.n.01'), Synset('i.n.03'), Synset('one.s.01')]
[Synset('love.n.01'), Synset('love.n.02'), Synset('beloved.n.01'), Synset('love.n.04'), Synset('love.n.05'), Synset('sexual_love.n.02'), Synset('love.v.01'), Synset('love.v.02'), Synset('love.v.03'), Synset('sleep_together.v.01')]
[Synset('all.a.01'), Synset('all.s.02'), Synset('wholly.r.01')]
[]
[]
[]


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b88dfe01-c7e1-473c-bcfd-798313fc6522' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>