In [71]:
from gensim.models import KeyedVectors
from os import path
from nltk.tokenize import RegexpTokenizer
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
import spacy
import nltk

In [59]:
def make_token(review):

  # drop points like . . . . . . . . . ., and not useful characters. 
  tokenizer = RegexpTokenizer("\w+\'?\w+|\w+") 
  
  return tokenizer.tokenize(str(review))

def remove_stopwords(review):

    #load stop words 
    stop_words = stopwords.words('english')
    exceptionStopWords = {
      'again',
      'against',
      'ain',
      'almost',
      'among',
      'amongst',
      'amount',
      'anyhow',
      'anyway',
      'aren',
      "aren't",
      'below',
      'bottom',
      'but',
      'cannot',
      'couldn',
      "couldn't",
      'didn',
      "didn't",
      'doesn',
      "doesn't",
      'don',
      "don't",
      'done',
      'down',
      'except',
      'few',
      'hadn',
      "hadn't",
      'hasn',
      "hasn't",
      'haven',
      "haven't",
      'however',
      'isn',
      "isn't",
      'least',
      'mightn',
      "mightn't",
      'move',
      'much',
      'must',
      'mustn',
      "mustn't",
      'needn',
      "needn't",
      'neither',
      'never',
      'nevertheless',
      'no',
      'nobody',
      'none',
      'noone',
      'nor',
      'not',
      'nothing',
      'should',
      "should've",
      'shouldn',
      "shouldn't",
      'too',
      'top',
      'up',
      'very'
      'wasn',
      "wasn't",
      'well',
      'weren',
      "weren't",
      'won',
      "won't",
      'wouldn',
      "wouldn't"
}

    # union and clean basic stop words
    stop_words = set(stop_words).union(STOP_WORDS)
    final_stop_words = stop_words-exceptionStopWords

    return [token for token in review if token not in final_stop_words]

def lemmatization(review):
    
    # Part-of-speech tagging. When you switch off -  disable - parser, tagger, ner it could work more faster 
    nlp = spacy.load("en",disable=['parser', 'tagger', 'ner']) 
    lemma_result = []
    
    for words in review:
        doc = nlp(words)
        for token in doc:
            lemma_result.append(token.lemma_)
    return lemma_result


def pipeline(review):
    review = make_token(review)
    review = remove_stopwords(review)
    return lemmatization(review)

In [60]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout,embedding_weights):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_weights)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        # если было бы три леира тогда умножили на 3,  так у нас два (N_LAYERS = 2)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, text_lengths):
        #x [sent length , batch size]
        embedded = self.embedding(x) #[sentect len,batch size,embedding dim]
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths) # это сделано для того, что бы выровнять матрицу ввиду того, что длина банчей у каждого разная. https://stackoverflow.com/questions/51030782/why-do-we-pack-the-sequences-in-pytorch
        packed_output, (hidden, cell) = self.rnn(packed_embedded)#output[sent length,batch size,hiddendin*num of directions],[numberlayers*num of dir,batch size,hid dim]
        #[f0,b0,f1,b1,.......fn,bn]
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)) # кантакатинация выхода от двух hidden N_LAYERS 
        return self.fc(hidden.squeeze(0))

In [61]:
def word2idx(embedding_model,review):
    index_review = []
    for word in review:
        try:
            index_review.append(embedding_model.vocab[word].index)
        except: 
             pass
    return torch.tensor(index_review)

In [72]:
def predict_sentiment(sentence, word_vectors):
    tokenized = pipeline(sentence)
    indexed = word2idx(word_vectors,tokenized)
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = torch.sigmoid(the_model(tensor,torch.LongTensor([len(indexed)]).to(device)))
    return prediction.item()

In [73]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/artem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [74]:
PATH_W2V = '/home/artem/pr_sissi_sentiment_model_google-cloud-functions/models'
PATH_model = '/home/artem/pr_sissi_sentiment_model_google-cloud-functions/models'

In [75]:
word_vectors = KeyedVectors.load(path.join(PATH_W2V,'word2vec.model'))
embedding_weights = torch.Tensor(word_vectors.vectors)
padding_value = len(word_vectors.index2word)

In [76]:
INPUT_DIM = padding_value
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

In [77]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
the_model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, embedding_weights).to(device)
the_model.load_state_dict(torch.load(path.join(PATH_model,'sissi_sentiment_12122019.pth'),  map_location='cpu'))

In [78]:
predict_sentiment("comedy movie", word_vectors)

0.7459672093391418