# D-AT-GRU
## Imports

In [1]:
import itertools
import math
import time
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data.dataloader as dataloader

import torchvision
import torchvision.datasets as dsets
import torchvision.transforms as transforms

import matplotlib.pyplot as plt

from semeval2014.semeval_base import *

from nltk.tokenize import TweetTokenizer

## Hyperparameters

## Load Dataset

In [3]:
def _read_corpus(xml_path):
    """Parse the XML file at ``xml_path`` and wrap the root's 'sentence' elements in a Corpus."""
    sentence_nodes = ET.parse(xml_path).getroot().findall('sentence')
    return Corpus(sentence_nodes)


# Input files: the data used for training and the unlabelled test sentences.
trainfile = "semeval2014/restaurants-trial.xml"
testfile = "semeval2014/Restaurants_Test_Data_PhaseA.xml"

corpus = _read_corpus(trainfile)
unseen = _read_corpus(testfile)

# Build the baseline extractor from the training corpus, tag the unseen
# sentences with it, and write the tagged sentences out as XML.
b1 = BaselineAspectExtractor(corpus)
predicted = b1.tag(unseen.corpus)
corpus.write_out('test.predicted-aspect.xml', predicted, short=False)

## Show 10 example sentences and their aspect categories from the training set

In [4]:
# Preview the first 10 training sentences, each followed by its aspect
# categories (term and polarity) and a blank separator line.
# islice caps the iteration at 10 items without introducing the unused
# counter variable that zip(range(10), ...) required.
for sentence in itertools.islice(corpus.corpus, 10):
    print(sentence.text)
    for categorie in sentence.aspect_categories:
        print(categorie.term, categorie.polarity)
    print("")

All the appetizers and salads were fabulous, the steak was mouth watering and the pasta was delicious!!!
food positive

And really large portions.
food positive

Go inside and you won't want to leave.
anecdotes/miscellaneous positive

Save yourself the time and trouble and skip this one!
anecdotes/miscellaneous negative

The sweet lassi was excellent as was the lamb chettinad and the garlic naan but the rasamalai was forgettable.
food conflict

Service was quick.
service positive

Oh, don't even let me start with how expensive the bills were!
price negative

Service is top notch.
service positive

The best thing I tasted were the lambc hops.
food positive

Overall I would recommend it and go back again.
anecdotes/miscellaneous positive



## Load pre-trained embedding vectors from GloVe (small dataset)

In [22]:
# Dimensionality of the GloVe vectors; it also selects which file is read.
embedding_dim = 50
embeddings_dict = {}

# Each line of the file is "<token> <v1> <v2> ...": the first field is the
# token, the remaining fields are its vector components.
with open(f"embeddings/glove.6B.{embedding_dim}d.txt", 'r', encoding="utf-8") as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_dict[fields[0]] = np.asarray(fields[1:], "float32")

## Embedding example

In [8]:
# Sanity check: print the GloVe vector stored under the key 'cat'.
print(embeddings_dict['cat'])

[ 0.45281  -0.50108  -0.53714  -0.015697  0.22191   0.54602  -0.67301
 -0.6891    0.63493  -0.19726   0.33685   0.7735    0.90094   0.38488
  0.38367   0.2657   -0.08057   0.61089  -1.2894   -0.22313  -0.61578
  0.21697   0.35614   0.44499   0.60885  -1.1633   -1.1579    0.36118
  0.10466  -0.78325   1.4352    0.18629  -0.26112   0.83275  -0.23123
  0.32481   0.14485  -0.44552   0.33497  -0.95946  -0.097479  0.48138
 -0.43352   0.69455   0.91043  -0.28173   0.41637  -1.2609    0.71278
  0.23782 ]


### Prepare GloVe embeddings for PyTorch

In [24]:
# Count how often each token occurs in the training sentences.
tknzr = TweetTokenizer()  # Tweet tokenizer because of the informal internet-review style
vocab = {}
for sentence in corpus.corpus:
    for word in tknzr.tokenize(sentence.text):
        vocab[word] = vocab.get(word, 0) + 1

# Build the embedding matrix: row i holds the vector of the i-th vocab key
# (dict iteration order), one column per embedding dimension.
pretrained_embeds = np.zeros((len(vocab), embedding_dim))

# Dedicated seeded generator so the rows drawn for unknown tokens are identical
# on every run and the global NumPy random state is left untouched.
oov_rng = np.random.RandomState(0)

for i, word in enumerate(vocab):
    # Try the token as written first, then its lower-cased form: the glove.6B
    # vocabulary is lower-cased, so "The" or "Service" would otherwise miss
    # and receive a random vector even though "the" / "service" are available.
    vector = embeddings_dict.get(word)
    if vector is None:
        vector = embeddings_dict.get(word.lower())
    if vector is None:
        # Token unknown to GloVe in either casing: fall back to random noise.
        vector = oov_rng.normal(scale=0.5, size=(embedding_dim, ))
    pretrained_embeds[i] = vector

All
And
Go
won't
Save
The
rasamalai
Service
Oh
don't
I
lambc
Overall
I've
Even
In
Rao's
Wed
Anyway
But
Tom
Kha
Try
Reasonable
Everything
Sala
Thai
well-portioned
Add
Unfortunately
NOT
Their
vomit-inducing
YUCK
A
didn't
wouldn't
While
Our
My
Good
Consistently
Japanese
Tapas
Ruby
Foo's
Now
it's
can't
Don't
One
Night
Tokyo
Definately
You
As
Tristate
With
Indian
couldn't
We
dissappointed
This
we'll
OU
MUST
TRY
THIS
RESTAURANT
Have
Ginger
House
haven't
virgnin
Drawbacks
It
Brooklyn
Joya
Great
If
they'd
Was
I'm
What
Excellent
Definitely
Italian
New
York
City
Love
YUKA
we're
we've
guaranteeed
isn't
After
HUGE
you'll
NY
Went
We've
375


## D-AT-GRU Model

In [19]:
class D_AT_GRU(nn.Module):
    """Word-embedding + GRU model initialised from pre-trained embeddings.

    Args:
        pretrained_embeds: 2-D matrix of shape (vocab_size, embed_dim) holding
            the initial word vectors. Accepts a ``torch.Tensor`` or anything
            ``torch.as_tensor`` can convert (e.g. a NumPy array); values are
            stored as float32.
        hidden_size: number of features in the GRU hidden state.
        num_layers: number of stacked GRU layers.
        bias: whether the GRU uses bias weights.
        batch_first: whether GRU inputs/outputs put the batch dimension first.
        aspect_size: number of rows in the aspect embedding table.

    Raises:
        ValueError: if ``pretrained_embeds`` is not 2-dimensional.
    """

    def __init__(self, pretrained_embeds, hidden_size=300, num_layers=1,
                 bias=True, batch_first=True, aspect_size=300):
        super().__init__()
        # Convert up front: a NumPy matrix has no callable ``.size()``, so
        # passing one used to fail before any layer was built.
        weights = torch.as_tensor(pretrained_embeds, dtype=torch.float32)
        if weights.dim() != 2:
            raise ValueError(
                "pretrained_embeds must be 2-D (vocab_size, embed_dim), got "
                "{} dimension(s)".format(weights.dim())
            )
        vocab_size, embed_dim = weights.size()

        self.word_embeddings = nn.Embedding(vocab_size, embed_dim)
        # Load the GloVe embeddings (copied into the layer's weight).
        self.word_embeddings.load_state_dict({'weight': weights})

        self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_size,
                          num_layers=num_layers, bias=bias,
                          batch_first=batch_first, bidirectional=False)
        # NOTE(review): this table is created but not used by forward() yet.
        self.aspect_embeddings = nn.Embedding(aspect_size, embed_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Embed the token indices ``x``, run the GRU and apply the softmax.

        Returns the per-step GRU outputs normalised by a softmax over
        dimension 1.
        """
        embeds = self.word_embeddings(x)
        output, _ = self.gru(embeds)
        # NOTE(review): with batch_first=True, dim 1 of the GRU output is the
        # sequence axis, so this normalises across time steps rather than
        # across features - confirm that this is intended.
        return self.softmax(output)