In [None]:
!pip install hazm



In [None]:
!pip install parsivar

Collecting parsivar
  Using cached parsivar-0.2.3.1-py3-none-any.whl.metadata (242 bytes)
Using cached parsivar-0.2.3.1-py3-none-any.whl (18.0 MB)
Installing collected packages: parsivar
Successfully installed parsivar-0.2.3.1


In [None]:
from hazm import WordTokenizer, Stemmer, stopwords_list
from parsivar import Normalizer
import pandas as pd
import numpy as np
import re
from collections import Counter
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils import resample

In [None]:
# Load the raw reviews; the path is relative to the notebook's working directory.
# The cells below read the columns Text, Score and Suggestion.
df = pd.read_csv('data.csv')

In [None]:
# Peek at the first rows to check the column layout.
df.head()

Unnamed: 0,Text,Score,Suggestion
0,این اولین تجربه من برای خرید ایفون هست امروز...,100,1
1,خرید این محصول رو توصیه میکنم,84,1
2,1 ساله این گوشی رو دارم هیچ نقطه ضعفی ازش ند...,60,1
3,سلام خدمت دوستان این گوشی از همه نظر عالی کیف...,96,1
4,سلام دوستانی که نگران شکستن صفحه نمایش هستند ا...,92,1


In [None]:
# (rows, columns) of the raw data, before any filtering.
df.shape

(3261, 3)

In [None]:
def map_label(score, suggestion):
  """Turn a (score, suggestion) pair into a sentiment class id.

  Returns:
    2  -- positive: score of at least 70 with suggestion code 1
    1  -- neutral:  score of at least 40 with suggestion code 3
    0  -- negative: score of at most 40 with suggestion code 2
    -1 -- noisy:    any other combination (score and suggestion disagree)
  """
  is_positive = score >= 70 and suggestion == 1
  if is_positive:
    return 2

  is_neutral = score >= 40 and suggestion == 3
  if is_neutral:
    return 1

  is_negative = score <= 40 and suggestion == 2
  if is_negative:
    return 0

  # Everything left over is treated as label noise.
  return -1

In [None]:
# Derive the class id of every review from its score and suggestion code.
df['label'] = [
  map_label(score, suggestion)
  for score, suggestion in zip(df['Score'], df['Suggestion'])
]

In [None]:
# Remove, in place, the rows that map_label flagged as noisy (-1).
noisy_index = df.index[df['label'] == -1]
df.drop(index = noisy_index, inplace = True)

In [None]:
# (rows, columns) after the noisy rows were dropped and the label column was added.
df.shape

(2157, 4)

In [None]:
# Class distribution. The recorded output shows a heavy imbalance
# (1827 / 302 / 28 for labels 2 / 1 / 0), which is why the training split is upsampled later.
df.value_counts('label')

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,1827
1,302
0,28


In [None]:
# Shared NLP tool instances, used as globals by preprocess_text.
normalizer = Normalizer(statistical_space_correction = True)  # parsivar normalizer, statistical space correction enabled
tokenizer = WordTokenizer()  # hazm word tokenizer
stemmer = Stemmer()  # hazm stemmer

In [None]:
_STOPWORDS = None  # filled on first use by _get_stopwords()


def _get_stopwords():
  """Return the hazm stop-word collection as a frozenset, building it only once."""
  global _STOPWORDS
  if _STOPWORDS is None:
    _STOPWORDS = frozenset(stopwords_list())
  return _STOPWORDS


def preprocess_text(text, apply_stemming = True):
  """Normalize, clean, tokenize and (optionally) stem one review.

  Args:
    text: raw review text. Anything that is not a string (e.g. None or the
      NaN pandas uses for a missing cell) and blank strings yield ''.
    apply_stemming: when True every kept token is replaced by its stem.

  Returns:
    A single string with the kept tokens joined by one space. Note that this
    is a string, not a list of tokens.
  """
  # Missing or blank reviews carry no tokens; return early instead of
  # handing a non-string to the normalizer.
  if not isinstance(text, str) or not text.strip():
    return ''

  # Reuse the cached set rather than rebuilding it for every review.
  stopwords = _get_stopwords()

  text = normalizer.normalize(text)
  text = re.sub(r'-+', '', text)             # drop runs of hyphens
  text = re.sub(r'\d+', '', text)            # drop digit runs
  text = re.sub(r'\s+', ' ',text).strip()    # collapse whitespace

  tokens = tokenizer.tokenize(text)

  # Stop words are matched on the surface form, before stemming.
  processed = [
    stemmer.stem(token) if apply_stemming else token
    for token in tokens
    if token not in stopwords
  ]

  return ' '.join(processed)

In [None]:
# Cache the preprocessed form of every review. Despite the column name,
# each cell holds one space-joined string, because that is what preprocess_text returns.
df['Tokens'] = [preprocess_text(raw_text) for raw_text in df['Text']]

In [None]:
def build_vocab(text, min_freq = 1):
  counter = Counter()
  for tokens in text:
    counter.update(tokens)

  vocab = {'<PAD>': 0, '<UNK>': 1}
  for word, freq in counter.items():
    if freq >= min_freq:
      vocab[word] = len(vocab)

  return vocab

In [None]:
tokenized_texts = df['Tokens'].tolist()
vocab = build_vocab(tokenized_texts)

In [None]:
def tokens_to_indices(tokens, vocab):
  return [vocab.get(token, vocab['<UNK>']) for token in tokens]

In [None]:
def pad_sequence(seq, max_len, pad_value = 0):
  """Return a copy of `seq` cut or right-padded to exactly `max_len` items.

  Sequences longer than `max_len` lose their tail; shorter ones get
  `pad_value` appended until they reach `max_len`.
  """
  deficit = max_len - len(seq)
  filler = [pad_value] * deficit if deficit > 0 else []
  return seq[:max_len] + filler

In [None]:
class DigikalaDataset(Dataset):
  """Torch dataset yielding (padded index tensor, label tensor) per review.

  Args:
    dataframe: frame with a 'Text' column (raw review) and a 'label' column
      (integer class id).
    vocab: token -> index mapping handed to tokens_to_indices.
    max_len: every review is truncated or padded to this many indices.

  Attributes:
    texts: raw review texts, in frame order.
    labels: class ids, in frame order.
    tokens_list: preprocess_text output for every review.
    indexed_padded: fixed-length index lists fed to the model.
  """

  def __init__(self, dataframe, vocab, max_len = 30):
    self.vocab = vocab
    self.max_len = max_len
    self.texts = dataframe['Text'].tolist()
    self.labels = dataframe['label'].tolist()

    # An upsampled frame repeats the same review many times; preprocess each
    # distinct text once and reuse the result.
    token_cache = {}
    for text in self.texts:
      if text not in token_cache:
        token_cache[text] = preprocess_text(text)
    self.tokens_list = [token_cache[text] for text in self.texts]

    self.indexed_padded = [
      pad_sequence(tokens_to_indices(tokens, vocab), max_len)
      for tokens in self.tokens_list
    ]

  def __len__(self):
    """Number of reviews in the dataset."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Return the (indices, label) pair at position `idx` as long tensors."""
    text_tensor = torch.tensor(self.indexed_padded[idx], dtype = torch.long)
    label_tensor = torch.tensor(self.labels[idx], dtype = torch.long)
    return text_tensor, label_tensor

In [None]:
# Stratified 80/20 split so every class appears in both parts.
train_df, val_df = train_test_split(df, test_size = 0.2, random_state = 42, stratify = df['label'])

# Balance the TRAINING part only: classes 1 and 0 are resampled with
# replacement up to the size of class 2. The validation part keeps its
# original distribution.
df_majority = train_df[train_df['label'] == 2]
df_class1 = train_df[train_df['label'] == 1]
df_class0 = train_df[train_df['label'] == 0]
df_class1_upsampled = resample(df_class1,
                               replace=True,
                               n_samples=len(df_majority),
                               random_state=42)

df_class0_upsampled = resample(df_class0,
                               replace=True,
                               n_samples=len(df_majority),
                               random_state=42)

train_df_balanced = pd.concat([df_majority, df_class1_upsampled, df_class0_upsampled])
# Shuffle with a fixed seed so the balanced frame is the same on every run.
train_df = train_df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

train_dataset = DigikalaDataset(train_df, vocab, max_len = 50)
val_dataset = DigikalaDataset(val_df, vocab, max_len = 50)

train_dataloader = DataLoader(train_dataset, batch_size = 32, shuffle = True)
val_dataloader = DataLoader(val_dataset, batch_size = 32)

In [None]:
class Classifier(nn.Module):
  """Bidirectional-LSTM text classifier.

  Pipeline: embedding -> BiLSTM -> concatenated final hidden states ->
  dropout -> linear -> batch norm -> ReLU -> linear (logits).

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    hidden_dim: LSTM hidden size per direction, also the width of fc1.
    output_dim: number of classes (size of the returned logits).
    dropout_prob: dropout probability applied to the LSTM summary vector.
  """

  def __init__(self, vocab_size, embedding_dim = 128, hidden_dim = 128, output_dim = 3, dropout_prob = 0.3):
    super(Classifier, self).__init__()
    # Index 0 is the padding token; its embedding stays zero.
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = 0)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first = True, bidirectional = True)
    self.dropout = nn.Dropout(dropout_prob)
    self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
    self.batch_norm = nn.BatchNorm1d(hidden_dim)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """Compute class logits.

    Args:
      x: long tensor of token indices, shape (batch, seq_len), with padding
        index 0 appended at the END of each row.

    Returns:
      Float tensor of shape (batch, output_dim) holding unnormalized logits.
    """
    embedded = self.embedding(x)

    # Pack by true length so the LSTM stops at the last real token instead of
    # running over trailing padding. The non-pad count equals the length only
    # because padding is right-aligned. Clamp to 1 because packing rejects
    # zero-length rows (an all-padding review).
    lengths = (x != self.embedding.padding_idx).sum(dim = 1).clamp(min = 1).cpu()
    packed = nn.utils.rnn.pack_padded_sequence(
      embedded, lengths, batch_first = True, enforce_sorted = False)

    _, (h_n, _) = self.lstm(packed)
    # h_n[-2] / h_n[-1] are the final forward / backward states of the last layer.
    summary = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim = 1)
    summary = self.dropout(summary)

    fc1_out = self.fc1(summary)
    fc1_out = self.batch_norm(fc1_out)
    fc1_out = self.relu(fc1_out)

    output = self.fc2(fc1_out)
    return output

In [None]:
# Seed torch so weight initialisation, dropout and DataLoader shuffling are
# repeatable across Restart & Run All (same seed as the sklearn calls).
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Classifier(len(vocab), embedding_dim = 128, hidden_dim = 128, output_dim = 3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

In [None]:
def evaluate(model, dataloader):
  """Score `model` on every batch of `dataloader`.

  Puts the model in eval mode (and leaves it there), predicts the argmax
  class for each example and compares against the true labels. Uses the
  global `device`.

  Args:
    model: module mapping a batch of inputs to per-class logits.
    dataloader: iterable of (inputs, labels) batches.

  Returns:
    Tuple (accuracy, precision, recall, f1); the last three are macro-averaged.
  """
  model.eval()
  all_preds, all_labels = [], []

  with torch.no_grad():
    for inputs, labels in dataloader:
      outputs = model(inputs.to(device))
      preds = torch.argmax(outputs, dim = 1)
      all_preds.extend(preds.cpu().tolist())
      all_labels.extend(labels.tolist())

  acc = accuracy_score(all_labels, all_preds)
  # zero_division = 0 gives the same score as the default for a class that is
  # never predicted, without emitting an UndefinedMetricWarning each epoch.
  precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels, all_preds, average = 'macro', zero_division = 0)

  return acc, precision, recall, f1

In [None]:
# Train for a fixed number of epochs and report validation metrics after each one.
epochs = 100
for epoch in range(epochs):
  model.train()  # evaluate() switches to eval mode, so re-enable training every epoch
  batch_losses = []

  for batch_inputs, batch_labels in train_dataloader:
    batch_inputs = batch_inputs.to(device)
    batch_labels = batch_labels.to(device)

    optimizer.zero_grad()
    logits = model(batch_inputs)
    batch_loss = criterion(logits, batch_labels)
    batch_loss.backward()
    optimizer.step()

    batch_losses.append(batch_loss.item())

  # Mean of the per-batch losses.
  avg_train_loss = sum(batch_losses) / len(train_dataloader)
  val_acc, val_precision, val_recall, val_f1 = evaluate(model, val_dataloader)
  print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Acc: {val_acc:.4f} | Val Precision: {val_precision:.4f} | Val Recall: {val_recall:.4f} | Val F1: {val_f1:.4f}')


Epoch 1/100, Train Loss: 0.9357, Val Acc: 0.5069 | Val Precision: 0.3598 | Val Recall: 0.4295 | Val F1: 0.3191
Epoch 2/100, Train Loss: 0.7296, Val Acc: 0.5231 | Val Precision: 0.3563 | Val Recall: 0.4312 | Val F1: 0.3255
Epoch 3/100, Train Loss: 0.6185, Val Acc: 0.5463 | Val Precision: 0.3842 | Val Recall: 0.4729 | Val F1: 0.3670
Epoch 4/100, Train Loss: 0.5400, Val Acc: 0.5579 | Val Precision: 0.3681 | Val Recall: 0.4181 | Val F1: 0.3468
Epoch 5/100, Train Loss: 0.4803, Val Acc: 0.6273 | Val Precision: 0.3901 | Val Recall: 0.4769 | Val F1: 0.3938
Epoch 6/100, Train Loss: 0.4350, Val Acc: 0.6505 | Val Precision: 0.3952 | Val Recall: 0.4406 | Val F1: 0.3947
Epoch 7/100, Train Loss: 0.4007, Val Acc: 0.6435 | Val Precision: 0.3823 | Val Recall: 0.4286 | Val F1: 0.3817
Epoch 8/100, Train Loss: 0.3678, Val Acc: 0.6829 | Val Precision: 0.3985 | Val Recall: 0.4348 | Val F1: 0.4031
Epoch 9/100, Train Loss: 0.3399, Val Acc: 0.6806 | Val Precision: 0.3845 | Val Recall: 0.4199 | Val F1: 0.3897
E

In [None]:
# Show the model's prediction next to the true label for the first few
# validation reviews.
model.eval()
with torch.no_grad():  # inference only: do not build an autograd graph
  # min(...) keeps this cell from raising IndexError on a tiny validation set.
  for i in range(min(5, len(val_dataset))):
    x, y_true = val_dataset[i]
    x = x.unsqueeze(0).to(device)  # add the batch dimension the model expects
    pred_logits = model(x)
    y_pred = torch.argmax(pred_logits, dim = 1)
    print(f"Text: {val_dataset.texts[i]}")
    print(f"True Label: {val_dataset.labels[i]}")
    print(f"Predicted Label: {y_pred.item()}")
    print("----------------------------------------------------")



Text: کنسول که حرف نداره   واقعا عالیه  اما درباره بازی   باید بگم خوبه اما  قسمت داستانیش خیلی خوب نیست   در کمتر از 7 یا 8 ساعت تموم میشه   
True Label: 2
Predicted Label: 2
----------------------------------------------------
Text: به گوشش قطع شد بعده یک سال 
True Label: 1
Predicted Label: 1
----------------------------------------------------
Text: ما که راضی هستیم  
True Label: 2
Predicted Label: 2
----------------------------------------------------
Text: خیلی عالیه از همه نظر من این لپ تاپو حدود یک سال یا بیشتره دارمش ولی یک دفعه بهش یه فلش ویروسی زدم کلا صفحش پرید ولی فرستادم گارانتی درست شد  الان هم بازیای کالاف و بتل فیلد و کانتر رو به راحتی اجرا میکنه ولی یک مقدار سرعت سیستم اومده پایین  شاید برای اینه که خیلی ازش کار کشیدم     
True Label: 2
Predicted Label: 2
----------------------------------------------------
Text: من اینو از دیجی خریدم مهمترین چیزی که کاربر باهاش سروکار داره و دایم با اون با سیستم ارتباط برقرار میکنه صفحه نمایشه که در این لپ تاپ ایسوس واقعا در عدم کیفیت