In [1]:
!pip install sentence_transformers
!pip install hazm
!pip install transformers



In [2]:
%cd sample_data

/content/sample_data


In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from hazm import *


In [4]:
# Hugging Face checkpoint shared by the ParsBERT tokenizer and encoder.
_PARSBERT_CHECKPOINT = "HooshvareLab/bert-base-parsbert-uncased"

# Tokenizer and encoder used by generate_bert_embedding.
tokenizer = AutoTokenizer.from_pretrained(_PARSBERT_CHECKPOINT)
parsbert_embedding_model = AutoModel.from_pretrained(_PARSBERT_CHECKPOINT)
# Sentence encoder used by generate_labse_embedding.
labse_embedding_model = SentenceTransformer('sentence-transformers/LaBSE')
# Text normalizer (Normalizer comes from the hazm star-import) applied to
# tweets before they are tokenized.
normalizer = Normalizer()

def generate_bert_embedding(tweet):
    """Return a mean-pooled ParsBERT embedding of ``tweet`` as a list of floats.

    The tweet is normalized, tokenized with the module-level ``tokenizer``
    (no special tokens are added, matching how this function has always
    produced embeddings) and passed through ``parsbert_embedding_model``.
    The last hidden state is averaged over the token axis.

    Args:
        tweet: Raw tweet text.

    Returns:
        A flat list with one float per hidden dimension of the encoder. If
        the tweet yields no tokens after normalization, a zero vector of the
        same length is returned instead of failing inside the encoder.
    """
    tweet = normalizer.normalize(tweet)
    tokens = tokenizer.tokenize(tweet)

    model_config = parsbert_embedding_model.config
    if not tokens:
        # An empty id list would become a float tensor of shape (1, 0), which
        # the embedding layer rejects; fall back to a neutral vector.
        return [0.0] * model_config.hidden_size

    # Sequences longer than the encoder's position table cannot be embedded,
    # so keep only the leading tokens that fit.
    tokens = tokens[:model_config.max_position_embeddings]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = torch.tensor([token_ids], dtype=torch.long)

    with torch.no_grad():
        outputs = parsbert_embedding_model(input_ids)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze()

    return embedding.tolist()

def generate_labse_embedding(tweet):
    """Encode ``tweet`` with the module-level LaBSE model and return it as a list.

    Args:
        tweet: Text passed unchanged to ``labse_embedding_model.encode``.

    Returns:
        The encoder output converted with ``.tolist()``.
    """
    return labse_embedding_model.encode(tweet).tolist()


Downloading (‚Ä¶)lve/main/config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

Downloading (‚Ä¶)solve/main/vocab.txt:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Downloading (‚Ä¶)be010/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (‚Ä¶)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (‚Ä¶)/2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading (‚Ä¶)168ebbe010/README.md:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading (‚Ä¶)8ebbe010/config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading (‚Ä¶)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading (‚Ä¶)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (‚Ä¶)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (‚Ä¶)be010/tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading (‚Ä¶)okenizer_config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (‚Ä¶)168ebbe010/vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading (‚Ä¶)ebbe010/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

In [5]:
class NeuralNetClassifier(nn.Module):
    """Five-layer fully connected classifier.

    Layer widths are ``input_dim -> 1000 -> 512 -> 64 -> 16 -> output_dim``.
    Activations alternate LeakyReLU(0.1) and ReLU, and dropout (p=0.5) is
    applied to the output of the third linear layer *before* its activation.
    The final layer returns raw logits (no softmax).

    Attribute names are part of the saved ``state_dict`` layout and must not
    be renamed.
    """

    def __init__(self, input_dim, output_dim):
        """Build the layers.

        Args:
            input_dim: Size of each input feature vector.
            output_dim: Number of output logits.
        """
        super(NeuralNetClassifier, self).__init__()
        leaky_slope = 0.1

        # Stage 1: input_dim -> 1000
        self.linear1 = nn.Linear(input_dim, 1000)
        self.activation1 = nn.LeakyReLU(leaky_slope)
        # Stage 2: 1000 -> 512
        self.linear2 = nn.Linear(1000, 512)
        self.activation2 = nn.ReLU()
        # Stage 3: 512 -> 64, with dropout ahead of the activation
        self.linear3 = nn.Linear(512, 64)
        self.dropout = nn.Dropout(p=0.5)
        self.activation3 = nn.LeakyReLU(leaky_slope)
        # Stage 4: 64 -> 16
        self.linear4 = nn.Linear(64, 16)
        self.activation4 = nn.ReLU()
        # Output head: 16 -> output_dim
        self.linear5 = nn.Linear(16, output_dim)

    def forward(self, input):
        """Map a batch of feature vectors to logits.

        Args:
            input: Tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor of logits whose last dimension equals ``output_dim``.
        """
        hidden = self.activation1(self.linear1(input))
        hidden = self.activation2(self.linear2(hidden))
        hidden = self.activation3(self.dropout(self.linear3(hidden)))
        hidden = self.activation4(self.linear4(hidden))
        return self.linear5(hidden)

In [6]:
# Class names in label-index order; predict_sentence_feeling returns indices
# into this list (0=Happy, 1=Sad, 2=Angry, 3=Neutral, 4=Emotional).
feelings = ["Happy", "Sad", "Angry", "Neutral", "Emotional"]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 1536 input features: the ParsBERT and LaBSE embeddings concatenated
# (presumably 768 + 768 -- confirm against the training notebook).
best_model = NeuralNetClassifier(1536, 5)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only
# runtime as well.
best_model.load_state_dict(torch.load('saved_model.pth', map_location=device))
best_model = best_model.to(device)


def _read_keyword_lines(path):
    """Return the raw lines of a keyword file, trailing newlines included.

    The newline is deliberately kept: predict_sentence_feeling compares each
    entry against ``word + '\\n'``.
    """
    # NOTE(review): the lexicon files are assumed to be UTF-8 encoded.
    with open(path, 'r', encoding='utf-8') as keyword_file:
        return keyword_file.readlines()


angry_lines = _read_keyword_lines('angry_words.txt')
emotional_lines = _read_keyword_lines('passionate_words.txt')
sad_lines = _read_keyword_lines('sad_words.txt')
happy_lines = _read_keyword_lines('happy_words.txt')
list_of_happy_emojis = ['ü§£', 'üòÇ', '‚ú®', 'üéâ', 'üî•', '>>>', 'üíÉ']
list_of_sad_emojis = ['üò≠', 'üò¢', 'üòî', '<<<', 'üòì', 'üñ§', '):', ':(']
list_of_passionate_emojis = ['ü•∫', 'üòç', 'üíò', '‚ù§Ô∏è', 'ü´¶', '‚ù§Ô∏è‚Äçüî•']
list_of_angry_emojis = ['üòê', 'ü§¨', 'üñï']

def _tweet_hits_lexicon(words, lexicon_lines):
    """Return True if any word of the tweet matches an entry of a keyword lexicon.

    ``lexicon_lines`` are raw ``readlines()`` entries, so they normally end in
    a newline. An entry matches a word when it equals ``word + '\\n'`` or when
    it is a substring of the word (the latter only fires for an entry without
    a trailing newline, or for a word that itself contains a line break).
    """
    for keyword in lexicon_lines:
        # A blank line in the file would otherwise match the empty "words"
        # produced by consecutive spaces in the tweet.
        if not keyword.strip():
            continue
        for word in words:
            if keyword in word or keyword == word + '\n':
                return True
    return False


def predict_sentence_feeling(test_tweet, confidence_threshold=0.8):
    """Predict the feeling label index for a tweet.

    Decision order:
      1. Emoji/emoticon shortcuts (happy, sad, angry, passionate).
      2. The neural classifier on the concatenated ParsBERT + LaBSE embedding,
         accepted outright when its softmax confidence reaches
         ``confidence_threshold``.
      3. Keyword lexicons (angry, sad, emotional, happy, in that order).
      4. Otherwise the classifier's low-confidence prediction.

    Args:
        test_tweet: Tweet text.
        confidence_threshold: Minimum softmax probability at which the
            classifier's prediction is returned without consulting the
            lexicons.

    Returns:
        Integer index into ``feelings``: 0=Happy, 1=Sad, 2=Angry, 3=Neutral,
        4=Emotional.
    """
    # Emoji shortcuts come first so the (expensive) embeddings are computed
    # only when they are actually needed.
    emoji_rules = (
        (list_of_happy_emojis, 0),
        (list_of_sad_emojis, 1),
        (list_of_angry_emojis, 2),
        (list_of_passionate_emojis, 4),
    )
    for emojis, label in emoji_rules:
        if any(emoji in test_tweet for emoji in emojis):
            return label

    labse_output = generate_labse_embedding(test_tweet)
    bert_output = generate_bert_embedding(test_tweet)
    # Feature order is ParsBERT first, then LaBSE.
    sentence_tensor = torch.tensor(bert_output + labse_output)
    sentence_tensor = sentence_tensor.unsqueeze(0).to(device)

    best_model.eval()
    with torch.no_grad():
        output = best_model(sentence_tensor)

    probabilities = F.softmax(output, dim=1)
    conf_score, predicted_label = torch.max(probabilities, dim=1)
    predicted_label = predicted_label.item()
    if conf_score.item() >= confidence_threshold:
        return predicted_label

    # Low confidence: let the keyword lexicons override the classifier.
    words_of_sentence = test_tweet.split(" ")
    lexicon_rules = (
        (angry_lines, 2),
        (sad_lines, 1),
        (emotional_lines, 4),
        (happy_lines, 0),
    )
    for lexicon_lines, label in lexicon_rules:
        if _tweet_hits_lexicon(words_of_sentence, lexicon_lines):
            return label

    return predicted_label

In [7]:
# Zeroed tallies: eight topic slots and five sentiment slots. Both lists are
# re-created per column inside the tallying loops further down.
tweet_topic_list = [0] * 8
tweet_sentiments = [0] * 5

In [8]:
from transformers import pipeline

data = pd.read_csv('NewTweets.csv')

# Maps each CSV column name to its five sentiment counts joined with '#'.
tweet_sentiment_dict = {}

for idx, column in enumerate(data):
    # Progress marker on every fifth column.
    if idx % 5 == 0:
        print(idx)

    tweet_sentiments = [0, 0, 0, 0, 0]
    for tweet in data[column].values:
        # Non-string cells (e.g. NaN padding in shorter columns) are skipped.
        if type(tweet) is not str:
            continue
        tweet = normalizer.normalize(tweet)
        current_pred = predict_sentence_feeling(tweet)
        tweet_sentiments[current_pred] += 1

    tweet_sentiment_dict[str(column)] = '#'.join(
        [str(count) for count in tweet_sentiments]
    )


0
5
10
15
20
25
30


In [9]:
import json
# Serialise the per-column sentiment tallies as a single JSON document.
sentiments_json = json.dumps(tweet_sentiment_dict)
with open("sentiments.txt", "w") as fp:
    fp.write(sentiments_json)
print("Done writing dict into .txt file")

Done writing dict into .txt file


In [10]:
!pip install xformers

Collecting xformers
  Downloading xformers-0.0.21-cp310-cp310-manylinux2014_x86_64.whl (167.0 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m167.0/167.0 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xformers
Successfully installed xformers-0.0.21


In [11]:
data = pd.read_csv('NewTweets.csv')

# Maps each CSV column name to its eight topic counts joined with '#'.
tweet_topics_dict = {}

pipe = pipeline("text-classification", model="HooshvareLab/bert-fa-base-uncased-clf-persiannews")

# Classifier label -> slot in the per-column tally list. Labels that are not
# listed here are ignored.
topic_slot_by_label = {
    'ÿßŸÇÿ™ÿµÿßÿØ€å': 0,
    'ÿ®€åŸÜ ÿßŸÑŸÖŸÑŸÑ': 1,
    'ÿ≥€åÿßÿ≥€å': 2,
    'ÿπŸÑŸÖ€å ŸÅŸÜÿßŸàÿ±€å': 3,
    'ŸÅÿ±ŸáŸÜ⁄Ø€å ŸáŸÜÿ±€å': 4,
    'Ÿàÿ±ÿ≤ÿ¥€å': 5,
    'Ÿæÿ≤ÿ¥⁄©€å': 6,
    'ÿßÿ¨ÿ™ŸÖÿßÿπ€å': 7,
}

# Inputs longer than the model's position table would make the pipeline
# fail, so tweets are truncated to that length.
max_tokens = pipe.model.config.max_position_embeddings

for idx, column in enumerate(data):
    print(idx)
    tweet_topic_list = [0] * len(topic_slot_by_label)
    for tweet in data[column].values:
        # Non-string cells (e.g. NaN padding in shorter columns) are skipped.
        if not isinstance(tweet, str):
            continue
        label = pipe(tweet, truncation=True, max_length=max_tokens)[0]['label']
        slot = topic_slot_by_label.get(label)
        if slot is not None:
            tweet_topic_list[slot] += 1

    tweet_topics_dict[str(column)] = '#'.join(str(count) for count in tweet_topic_list)

Downloading (‚Ä¶)lve/main/config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/651M [00:00<?, ?B/s]

Downloading (‚Ä¶)okenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading (‚Ä¶)solve/main/vocab.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Downloading (‚Ä¶)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [12]:
# Serialise the per-column topic tallies as a single JSON document.
topics_json = json.dumps(tweet_topics_dict)
with open("topics.txt", "w") as fp:
    fp.write(topics_json)
print("Done writing dict into .txt file")


Done writing dict into .txt file
