In [None]:

from google.colab import drive 
# Mount Google Drive so its files are reachable under /content/gdrive
# for the rest of the notebook.
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [None]:
!pip install git+https://github.com/BoseCorp/py-googletrans.git --upgrade
!pip install vaderSentiment
!pip install transformers

!pip3 install pycld3
!pip3 install regex

import nltk
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from googletrans import Translator
from transformers import pipeline
import numpy as np 
import IPython
import operator
import re
import time

Collecting git+https://github.com/BoseCorp/py-googletrans.git
  Cloning https://github.com/BoseCorp/py-googletrans.git to /tmp/pip-req-build-ahetwzdt
  Running command git clone -q https://github.com/BoseCorp/py-googletrans.git /tmp/pip-req-build-ahetwzdt
Building wheels for collected packages: googletrans
  Building wheel for googletrans (setup.py) ... [?25l[?25hdone
  Created wheel for googletrans: filename=googletrans-2.3.0-cp36-none-any.whl size=16447 sha256=8ef378f4320d307b986c49d47a449df754b2c66ec58523525e1aed30d7ad3ebb
  Stored in directory: /tmp/pip-ephem-wheel-cache-negur4hu/wheels/6a/fc/9e/2d31d95d9e97da5166afd8225a6f3b6850dc2c6e84accefbfc
Successfully built googletrans
Installing collected packages: googletrans
Successfully installed googletrans-2.3.0
Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl (125kB)
[K     |█████████████

# installations

In [None]:


# Taken from: https://github.com/sagorbrur/bendeep/blob/master/bendeep/sentiment.py

# from pathlib import Path

import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook
import json


# Model and vocabulary files collected from: https://github.com/sagorbrur/bendeep/tree/master/models/sentiment

# Location of the saved model state dict (loaded with torch.load in analyze)
# and of the vocabulary file used to rebuild the token -> index mapping.
# NOTE(review): both paths are relative, so they resolve against the current
# working directory -- presumably /content on Colab, where Drive is mounted
# at /content/gdrive; confirm before running elsewhere.
model_path = "./gdrive/My Drive/banglaSentiment/sentiment/senti_trained.pt"
vocab_path = "./gdrive/My Drive/banglaSentiment/sentiment/vocab.txt"

def save_dict_to_file(dic):
    """Write the ``str()`` representation of ``dic`` to ``vocab.txt``.

    The file is created in (or overwritten in) the current working directory.

    Args:
        dic: Mapping to persist; it is serialised with ``str(dic)``.
    """
    # Explicit UTF-8 so non-ASCII (e.g. Bangla) tokens are written the same
    # way on every platform; ``with`` closes the handle even if write() fails.
    with open('vocab.txt', 'w', encoding='utf-8') as vocab_file:
        vocab_file.write(str(dic))


class Sequences_train(Dataset):
    """Training dataset of fixed-length token-index sequences with labels.

    Reads a CSV with ``review`` and ``sentiment`` columns, fits a
    ``CountVectorizer`` on the reviews, saves the fitted vocabulary through
    ``save_dict_to_file`` and encodes every review as a list of vocabulary
    indices truncated / padded to ``max_seq_len``. Reviews that contain no
    vocabulary token are dropped together with their label.

    Args:
        path: CSV file passed to ``pd.read_csv``.
        max_seq_len: Length of every stored sequence.
    """

    def __init__(self, path, max_seq_len):
        self.max_seq_len = max_seq_len

        frame = pd.read_csv(path)
        reviews = frame.review.tolist()

        vectorizer = CountVectorizer(min_df=0.015)
        vectorizer.fit(reviews)

        # The vocabulary is saved before '<PAD>' is added to it.
        save_dict_to_file(vectorizer.vocabulary_)
        self.token2idx = vectorizer.vocabulary_
        self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

        analyzer = vectorizer.build_analyzer()

        def encode(text):
            # Keep only tokens that are present in the vocabulary.
            return [self.token2idx[tok] for tok in analyzer(text)
                    if tok in self.token2idx]

        def pad(ids):
            # Right-pad with the '<PAD>' index up to max_seq_len.
            return ids + (max_seq_len - len(ids)) * [self.token2idx['<PAD>']]

        self.encode = encode
        self.pad = pad

        kept = []
        for review, label in zip(reviews, frame.sentiment.tolist()):
            ids = self.encode(review)[:max_seq_len]
            if ids:
                kept.append((ids, label))

        sequences, self.labels = zip(*kept)
        self.sequences = [self.pad(ids) for ids in sequences]

    def __getitem__(self, i):
        """Return the ``(sequence, label)`` pair stored at position ``i``."""
        sequence = self.sequences[i]
        assert len(sequence) == self.max_seq_len
        return sequence, self.labels[i]

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.sequences)


class Sequences_infer(Dataset):
    """Inference-time encoder built from a previously saved vocabulary file.

    Exposes ``encode`` (text -> list of vocabulary indices) and ``pad``
    (right-pad a list with the ``'<PAD>'`` index up to ``max_seq_len``).
    No ``__getitem__`` / ``__len__`` is defined here; the object is used for
    its ``encode`` / ``pad`` helpers and its ``token2idx`` mapping.

    Args:
        vocab_path: Text file containing the vocabulary mapping. It is
            expected to hold a dict written with single-quoted keys (the
            format ``save_dict_to_file`` produces).
        max_seq_len: Target length used by ``pad``.
    """

    def __init__(self, vocab_path, max_seq_len):
        self.max_seq_len = max_seq_len
        # Never fitted: only needed to obtain the default analyzer below.
        vectorizer = CountVectorizer(min_df=0.015)

        # ``with`` guarantees the handle is closed; explicit UTF-8 keeps
        # non-ASCII (e.g. Bangla) tokens readable regardless of the locale.
        with open(vocab_path, 'r', encoding='utf-8') as vocab_file:
            raw_vocab = vocab_file.read()

        # Swap single for double quotes so the JSON parser accepts the text.
        self.token2idx = json.loads(raw_vocab.replace("'", "\""))
        self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

        tokenizer = vectorizer.build_analyzer()
        # Tokens missing from the vocabulary are silently dropped.
        self.encode = lambda x: [self.token2idx[token] for token in tokenizer(x)
                                 if token in self.token2idx]
        # NOTE: pad never truncates; inputs longer than max_seq_len are
        # returned unchanged because the repeat count becomes negative.
        self.pad = lambda x: x + (max_seq_len - len(x)) * [self.token2idx['<PAD>']]


class RNN(nn.Module):
    """Embedding -> GRU -> linear model producing one logit per input row.

    Args:
        vocab_size: Number of rows in the embedding table.
        batch_size: Initial batch size used to shape the hidden state; it is
            overwritten in ``forward`` when a batch of another size arrives.
        embedding_dimension: Width of each token embedding.
        hidden_size: Width of the GRU hidden state and input width of the
            final linear layer.
        n_layers: Number of stacked GRU layers.
        device: Device on which the initial hidden state is placed.
    """

    def __init__(
        self,
        vocab_size,
        batch_size,
        embedding_dimension=100,
        hidden_size=128,
        n_layers=1,
        device='cpu',
    ):
        super(RNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.device = device
        self.batch_size = batch_size

        self.encoder = nn.Embedding(vocab_size, embedding_dimension)
        self.rnn = nn.GRU(
            embedding_dimension,
            hidden_size,
            num_layers=n_layers,
            batch_first=True,
        )
        self.decoder = nn.Linear(hidden_size, 1)

    def init_hidden(self):
        """Return a fresh hidden state of shape (n_layers, batch_size, hidden_size).

        NOTE(review): the state is drawn with ``torch.randn`` on every call,
        so repeated forward passes over the same input can give different
        logits, including in eval mode.
        """
        return torch.randn(self.n_layers, self.batch_size, self.hidden_size).to(self.device)

    def forward(self, inputs):
        """Compute logits for a batch of index sequences.

        Args:
            inputs: Integer tensor whose first dimension is the batch.

        Returns:
            Tensor of shape ``(batch,)`` holding one logit per row.
        """
        # Avoid breaking if the last batch has a different size
        batch_size = inputs.size(0)
        if batch_size != self.batch_size:
            self.batch_size = batch_size

        encoded = self.encoder(inputs)
        output, _ = self.rnn(encoded, self.init_hidden())
        # NOTE(review): with batch_first=True the GRU output is
        # (batch, seq, hidden), so [:, :, -1] keeps the last hidden unit of
        # every timestep rather than the last timestep. The linear decoder
        # therefore only accepts sequences whose length equals hidden_size.
        # Left as is: presumably existing saved weights were fit this way.
        logits = self.decoder(output[:, :, -1])
        # Squeeze only the trailing feature axis: a bare squeeze() would also
        # drop the batch axis for a batch of one and yield a 0-d tensor whose
        # shape no longer matches a (1,) target in the loss.
        return logits.squeeze(-1)

def collate(batch):
    """Stack a list of ``(sequence, label)`` samples into two tensors.

    Args:
        batch: Iterable of samples; element 0 is the index sequence and
            element 1 is the label.

    Returns:
        ``(inputs, target)`` where ``inputs`` is a ``LongTensor`` of the
        sequences and ``target`` is a ``FloatTensor`` of the labels.
    """
    token_rows = []
    label_values = []
    for sample in batch:
        token_rows.append(sample[0])
        label_values.append(sample[1])
    return torch.LongTensor(token_rows), torch.FloatTensor(label_values)

def train(data_path, batch_size = 64, epochs=100, model_name="trained.pt"):
  """Fit the RNN classifier on a CSV dataset and save its state dict.

  Building ``Sequences_train`` also writes the fitted vocabulary to
  ``vocab.txt`` as a side effect.

  Args:
      data_path: CSV file handed to ``Sequences_train``.
      batch_size: Number of samples per optimisation step.
      epochs: Number of passes over the dataset.
      model_name: Path the trained ``state_dict`` is saved to.
  """
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  dataset = Sequences_train(data_path, max_seq_len=128)
  train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate)

  model = RNN(
    vocab_size=len(dataset.token2idx),
    batch_size=batch_size,
    hidden_size=128,
    device=device,
  ).to(device)

  criterion = nn.BCEWithLogitsLoss()
  trainable_params = [p for p in model.parameters() if p.requires_grad]
  optimizer = optim.Adam(trainable_params, lr=0.001)

  model.train()
  train_losses = []
  for epoch in range(epochs):
    progress_bar = tqdm_notebook(train_loader, leave=False)
    batch_losses = []
    for inputs, target in progress_bar:
      inputs = inputs.to(device)
      target = target.to(device)

      model.zero_grad()
      loss = criterion(model(inputs), target)
      loss.backward()

      # Cap the gradient norm at 3 before the parameter update.
      nn.utils.clip_grad_norm_(model.parameters(), 3)
      optimizer.step()

      progress_bar.set_description(f'Loss: {loss.item():.3f}')
      batch_losses.append(loss.item())

    # Mean of the per-batch losses seen in this epoch.
    epoch_loss = sum(batch_losses) / len(batch_losses)
    train_losses.append(epoch_loss)

    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')

  # Only the weights are persisted, not the whole module object.
  torch.save(model.state_dict(), model_name)


def analyze(model_path, vocab_path, text, batch_size=64):
    """Classify each sentence and return the positive-minus-negative count.

    Every element of ``text`` is encoded with the saved vocabulary, truncated
    and padded to 128 indices, and scored by the model one sentence at a
    time. A sigmoid output above 0.5 counts as positive, anything else as
    negative. Totals are printed before returning.

    Args:
        model_path: File containing the model ``state_dict``.
        vocab_path: Vocabulary file understood by ``Sequences_infer``.
        text: Sized iterable of sentences; each item is passed through ``str``.
        batch_size: Initial batch size handed to the ``RNN`` constructor.

    Returns:
        Number of positive sentences minus number of negative sentences.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    dataset = Sequences_infer(vocab_path, max_seq_len=128)
    model = RNN(
        hidden_size=128,
        vocab_size=len(dataset.token2idx),
        device=device,
        batch_size=batch_size,
    )
    model = model.to(device)
    # map_location lets weights saved on a GPU load on a CPU-only runtime.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    pos = 0
    neg = 0
    with torch.no_grad():
        for sentence in text:
            # Truncate like the training dataset does: pad() never shortens
            # its input, and the model's decoder only accepts sequences of
            # exactly max_seq_len, so longer sentences used to crash here.
            token_ids = dataset.encode(str(sentence))[:dataset.max_seq_len]
            test_vector = torch.LongTensor([dataset.pad(token_ids)]).to(device)

            output = model(test_vector)
            prediction = torch.sigmoid(output).item()

            if prediction > 0.5:
                pos += 1
            else:
                neg += 1

    print("total sentences = ",len(text))
    print("total positive sentences = ",pos)
    print("total negative sentences = ",neg)

    return pos - neg

In [None]:
%%time
#https://github.com/Markopolo-ai/DatasetCollection/blob/master/data/prothom-alo.csv
df=pd.read_csv('./gdrive/My Drive/banglaSentiment/prothom-alo.csv')

for i in range(len(df)):
  print("\n\n-> Making Prediction for Document = ",i+1)
  # Split the document into sentences on the Bangla full stop (danda).
  parsed_text = df.Text[i].split('।')
  # For English we can do: parsed_text = df.Text[i].split('.')
  # and then send parsed_text to TextBlob, VADER or other models that
  # understand English well for sentiment.

  # Not named `sum`: a module-level `sum` would replace the builtin for the
  # whole kernel and break later calls such as sum(losses) inside train().
  doc_score = analyze(model_path, vocab_path, parsed_text)
  if doc_score >= 0:
    print("-------------------------Positive Document-------------------------")
  else:
    print("-------------------------Negative Document-------------------------")




-> Making Prediction for Document =  1
total sentences =  23
total positive sentences =  9
total negative sentences =  14
-------------------------Negative Document-------------------------


-> Making Prediction for Document =  2
total sentences =  36
total positive sentences =  11
total negative sentences =  25
-------------------------Negative Document-------------------------


-> Making Prediction for Document =  3
total sentences =  34
total positive sentences =  13
total negative sentences =  21
-------------------------Negative Document-------------------------


-> Making Prediction for Document =  4
total sentences =  23
total positive sentences =  10
total negative sentences =  13
-------------------------Negative Document-------------------------


-> Making Prediction for Document =  5
total sentences =  33
total positive sentences =  7
total negative sentences =  26
-------------------------Negative Document-------------------------


-> Making Prediction for Document =