Unzipping the data

In [1]:
!unzip Snapp.zip

Archive:  Snapp.zip
  inflating: Snappfood - Sentiment Analysis.csv  


Importing packages

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter
import math

Installing packages

In [3]:
!pip install -q hazm
!pip install nltk

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.7/316.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Importing packages

In [2]:
from hazm import Normalizer,Stemmer,Lemmatizer
from hazm import stopwords_list
from hazm import *
normalizer = Normalizer()
stemr = Stemmer()
lemmatizer = Lemmatizer()

Reading the data

In [3]:
df = pd.read_csv('/content/Snappfood - Sentiment Analysis.csv' , on_bad_lines='skip' , delimiter='\t')
df.head()

Unnamed: 0.1,Unnamed: 0,comment,label,label_id
0,,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1.0
1,,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0.0
2,,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1.0
3,,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0.0
4,,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0.0


Preparing the dataset

In [4]:
df.drop(df[df['label'] == "1"].index, inplace=True)
df.drop(df[df['label'] == "0"].index, inplace=True)
X=df['comment'].values
y=df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

Implementing the IFIDF vectorizer

In [5]:
class TfIdfVectorizer:
    def __init__(self):
        self.vocab = None
        self.doc_freqs = None
    
    def fit(self, documents):
        self.vocab = set()
        self.doc_freqs = Counter()
        for doc in documents:
            tokens = word_tokenize(doc)
            normal_tokens = list()
            for token in tokens:
              normal_tokens.append(lemmatizer.lemmatize(stemr.stem(token)))
            self.vocab.update(normal_tokens)
            self.doc_freqs.update(set(normal_tokens))
        print(len(self.vocab))
    
    def transform(self, documents):
        tf_vectors = []
        for doc in documents:
            doc = word_tokenize(doc)
            tf = Counter(doc)
            tf_vector = [tf[word] for word in self.vocab]
            tf_vector = [freq / max(tf.values()) for freq in tf_vector] # normalize
            idf_vector = [math.log(len(documents) / self.doc_freqs[word]) for word in self.vocab]
            tf_idf_vector = [tf_vector[i] * idf_vector[i] for i in range(len(self.vocab))]
            tf_vectors.append(tf_idf_vector)
        return tf_vectors


Making labels from y_train

In [6]:
np.unique(y_train)

array(['HAPPY', 'SAD'], dtype=object)

Getting TFIDF vectors

In [7]:
tfidf = TfIdfVectorizer()
tfidf.fit(X_train)
tfidf_vectors_train = tfidf.transform(X_train)
tfidf_vectors_test = tfidf.transform(X_test)

9869


Predicting labels of test data using TFIDF vectors

In [8]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
clf_pf = GaussianNB()
clf_pf.partial_fit(tfidf_vectors_train, y_train, classes=np.unique(y_train))
y_pred = clf_pf.predict(tfidf_vectors_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       HAPPY       0.51      0.88      0.64       650
         SAD       0.70      0.24      0.36       740

    accuracy                           0.54      1390
   macro avg       0.60      0.56      0.50      1390
weighted avg       0.61      0.54      0.49      1390



Implementing the PPMI vectorizer

In [52]:
class PPMIVectorizer():

  def __init__(self):
      self.vocab = None
      self.doc_freqs = None
      self.word_in_doc_list = dict()
      self.word_count = dict()
      self.tokenized_sentences = list()
  
  def fit(self, documents):
      self.vocab = set()
      self.doc_freqs = Counter()
      self.word_index_dict = dict()
      self.vocab_list = list()
      for doc in documents:
          tokens = word_tokenize(doc)
          normal_tokens = list()
          for token in tokens:
            normal_tokens.append(lemmatizer.lemmatize(stemr.stem(token)))
          self.vocab.update(normal_tokens)
          self.doc_freqs.update(set(normal_tokens))
          self.tokenized_sentences.append(normal_tokens)

      self.vocab_list = list(self.vocab)
      for word_index in range(len(self.vocab)):
        self.word_index_dict[self.vocab_list[word_index]] = word_index
      self._cal_ppmi(documents)


  def _cal_ppmi(self, documents):
      ppmi_matrix = np.zeros((len(self.vocab), len(self.vocab)))
      token_count = 0
      for sentence in self.tokenized_sentences:
        for token_index in range(len(sentence)):
            real_token = sentence[token_index]
            ppmi_token_index = self.word_index_dict[real_token]
            for neighbor_token_index in range(len(sentence)):
              neighbor_token = sentence[neighbor_token_index]
              ppmi_neighbor_token_index = self.word_index_dict[neighbor_token]
              ppmi_matrix[ppmi_token_index][ppmi_neighbor_token_index] += 1
              ppmi_matrix[ppmi_neighbor_token_index][ppmi_token_index] += 1

      PMI = ppmi_matrix.sum(axis=0).sum() * ppmi_matrix /  np.outer((ppmi_matrix.sum(axis=1)), (ppmi_matrix.sum(axis=0)))
      for row in PMI:
        for element_index in range(len(row)):
          if row[element_index] > 1:
            row[element_index] = math.log(row[element_index])
          else:
            row[element_index] = 0
      self.PPMI = PMI
  
  
  def transform(self, documents):
      ppmi = []
      for doc in documents:
        doc = word_tokenize(doc)
        token_ppmi = np.zeros(len(self.vocab))
        count = 0
        for token in doc:
          token = lemmatizer.lemmatize(stemr.stem(token))
          if token in self.word_index_dict:
            ppmi_token_index = self.word_index_dict[token]
            count += 1
            token_ppmi += self.PPMI[ppmi_token_index]
        if(count != 0):
          ppmi.append(token_ppmi/count)
        else:
          ppmi.append(token_ppmi)
      return ppmi
        


In [53]:
ppmi = PPMIVectorizer()
ppmi.fit(X_train)
ppmi_vectors_train = ppmi.transform(X_train)
ppmi_vectors_test = ppmi.transform(X_test)
clf_pf = GaussianNB()
clf_pf.partial_fit(ppmi_vectors_train, y_train, classes=np.unique(y_train))
y_pred = clf_pf.predict(ppmi_vectors_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       HAPPY       0.86      0.69      0.77       650
         SAD       0.77      0.90      0.83       740

    accuracy                           0.81      1390
   macro avg       0.82      0.80      0.80      1390
weighted avg       0.81      0.81      0.80      1390

