# Labeling Post-Processed Crawled Data

## Install and Import Libraries

In [None]:
# Install required packages
!pip install pyyaml==5.4.1

!pip install -q transformers
!pip install -q hazm
!pip install -q clean-text[gpl]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyyaml==5.4.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 4.9 MB/s 
[?25hInstalling collected packages: pyyaml
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 6.0
    Uninstalling PyYAML-6.0:
      Successfully uninstalled PyYAML-6.0
Successfully installed pyyaml-5.4.1
[K     |████████████████████████████████| 4.9 MB 5.4 MB/s 
[K     |████████████████████████████████| 120 kB 45.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 38.5 MB/s 
[K     |████████████████████████████████| 316 kB 5.3 MB/s 
[K     |████████████████████████████████| 233 kB 64.9 MB/s 
[K     |████████████████████████████████| 1.4 MB 57.9 MB/s 
[?25h  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
[K     |█████████████████████

In [None]:
from transformers import BertConfig, BertTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

from tqdm.notebook import tqdm

import hazm
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

## Functions for Model Usage

In [None]:
class Dataset(torch.utils.data.Dataset):
    """PyTorch dataset that tokenizes one comment per item.

    Args:
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        comments: indexable collection of texts; each item is converted with ``str``.
        targets: optional list / ndarray of labels aligned with ``comments``.
            Any other type (including ``None``) means "no targets".
        label_list: optional list / tuple / ndarray of labels; a label's position
            becomes its class id.
        max_len: sequences are truncated / padded to this many tokens.

    Each item is a dict with ``comment``, ``input_ids``, ``attention_mask``,
    ``token_type_ids`` and, when targets were given, ``targets`` (long tensor).
    """

    def __init__(self, tokenizer, comments, targets=None, label_list=None, max_len=128):
        self.comments = comments
        self.targets = targets
        self.has_target = isinstance(targets, (list, np.ndarray))

        self.tokenizer = tokenizer
        self.max_len = max_len

        if isinstance(label_list, (list, tuple, np.ndarray)):
            self.label_map = {label: i for i, label in enumerate(label_list)}
        else:
            self.label_map = {}

    def __len__(self):
        return len(self.comments)

    def _encode_target(self, raw_target):
        """Return the class id for ``raw_target``.

        The label is looked up as-is first (so integer labels such as ``-2`` match
        integer keys) and then by its string form (so ``0`` still matches a
        ``'0'`` key). Labels that are not in the map are returned unchanged.
        """
        for key in (raw_target, str(raw_target)):
            try:
                if key in self.label_map:
                    return self.label_map[key]
            except TypeError:
                # Unhashable raw label (e.g. an array row): fall through to the string form.
                continue
        return raw_target

    def __getitem__(self, item):
        comment = str(self.comments[item])

        encoding = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')

        # The tokenizer returns (1, max_len) tensors; flatten to (max_len,) so the
        # DataLoader can stack items into (batch, max_len).
        inputs = {
            'comment': comment,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
        }

        if self.has_target:
            target = self._encode_target(self.targets[item])
            inputs['targets'] = torch.tensor(target, dtype=torch.long)

        return inputs


def create_data_loader(x, y, tokenizer, max_len, batch_size, label_list):
    """Wrap texts ``x`` (and optional targets ``y``) in a ``Dataset`` and return a
    ``DataLoader`` that yields it in batches of ``batch_size``, in order."""
    return torch.utils.data.DataLoader(
        Dataset(tokenizer, x, targets=y, label_list=label_list, max_len=max_len),
        batch_size=batch_size,
    )

In [None]:
class Model(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        config: configuration providing ``hidden_dropout_prob``, ``hidden_size``
            and ``num_labels``.
        model_name_or_path: checkpoint handed to ``BertModel.from_pretrained``.
            When omitted, the notebook-level ``MODEL_NAME_OR_PATH`` constant is
            used, so existing ``Model(config=config)`` calls behave as before.
    """

    def __init__(self, config, model_name_or_path=None):
        super().__init__()

        if model_name_or_path is None:
            model_name_or_path = MODEL_NAME_OR_PATH

        self.bert = BertModel.from_pretrained(model_name_or_path)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classifier logits for a batch of encoded inputs."""
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False)

        # The pooled output is the second tuple element. Index it instead of
        # unpacking exactly two values: the encoder appends further elements
        # when hidden states or attentions are requested.
        pooled_output = encoder_outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [None]:
def predict(model, comments, tokenizer, max_len=128, batch_size=32):
    """Classify ``comments`` with ``model`` and return class ids and probabilities.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; assumed to return (batch, num_classes) logits.
        comments: indexable collection of texts.
        tokenizer: tokenizer passed on to ``create_data_loader``.
        max_len: maximum token length per comment.
        batch_size: number of comments per forward pass.

    Returns:
        ``(predictions, prediction_probs)`` as NumPy arrays: the arg-max class id
        per comment and the softmax probabilities per comment. For empty input
        both arrays are empty.
    """
    data_loader = create_data_loader(comments, None, tokenizer, max_len, batch_size, None)

    # Send the inputs to wherever the model's weights live; fall back to the
    # notebook-level `device` only for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    batch_predictions = []
    batch_probabilities = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(data_loader, position=0):
            logits = model(
                input_ids=batch['input_ids'].to(target_device),
                attention_mask=batch['attention_mask'].to(target_device),
                token_type_ids=batch['token_type_ids'].to(target_device))

            # Index of the largest logit is the predicted class.
            _, preds = torch.max(logits, dim=1)

            # Move results off the accelerator right away so memory use stays
            # bounded by one batch.
            batch_predictions.append(preds.detach().cpu())
            batch_probabilities.append(F.softmax(logits, dim=1).detach().cpu())

    if not batch_predictions:
        # Nothing to classify: return empty arrays instead of failing on an empty concat.
        return torch.empty(0, dtype=torch.long).numpy(), torch.empty((0, 0)).numpy()

    predictions = torch.cat(batch_predictions).numpy()
    prediction_probs = torch.cat(batch_probabilities).numpy()

    return predictions, prediction_probs

In [None]:
# Pick the first CUDA device when one is available, otherwise the CPU,
# and report the choice.
train_on_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if train_on_gpu else "cpu")
print(f'device: {device}')

if train_on_gpu:
    print('CUDA is available!  Training on GPU ...')
else:
    print('CUDA is not available.  Training on CPU ...')

device: cpu
CUDA is not available.  Training on CPU ...


## Load Data

In [None]:
!gdown https://drive.google.com/u/0/uc?id=1dVrIwWlKSNSxQbD5AY-Zck01LZ7hx6hg&export=download
!gdown https://drive.google.com/u/0/uc?id=1YSIM8YGEtEw6octHnk5XAKJdk5wmqNLn&export=download
!gdown https://drive.google.com/u/0/uc?id=1Trz7F5aZk4rc_ghU07RrNOrSLSCpW4BR&export=download

!mkdir data

!mv "hamrah_aval.csv" "data/hamrah_aval.csv"
!mv "irancell.csv" "data/irancell.csv"
!mv "rightel.csv" "data/rightel.csv"

Downloading...
From: https://drive.google.com/u/0/uc?id=1dVrIwWlKSNSxQbD5AY-Zck01LZ7hx6hg
To: /content/hamrah_aval.csv
100% 325k/325k [00:00<00:00, 116MB/s]
Downloading...
From: https://drive.google.com/u/0/uc?id=1YSIM8YGEtEw6octHnk5XAKJdk5wmqNLn
To: /content/irancell.csv
100% 363k/363k [00:00<00:00, 131MB/s]
Downloading...
From: https://drive.google.com/u/0/uc?id=1Trz7F5aZk4rc_ghU07RrNOrSLSCpW4BR
To: /content/rightel.csv
100% 59.1k/59.1k [00:00<00:00, 52.9MB/s]


In [None]:
# Load the three per-operator tweet exports (UTF-8 CSV files) from the data directory.
hamrah_aval, irancell, rightel = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ("data/hamrah_aval.csv", "data/irancell.csv", "data/rightel.csv")
)

In [None]:
# Stack the three operator frames, drop exact duplicate rows and renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly instead of first turning it
# into an "index" column and dropping that column afterwards.
hir = (
    pd.concat([hamrah_aval, irancell, rightel], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)
hir

Unnamed: 0,created_time,user_id,text,is_retweet,is_quote,is_reply,tags
0,2022-07-19 19:18:27,1355911468902981639,هیچ کس تنها نیست … … همراه اول,False,False,True,{'hamrah_aval'}
1,2022-07-19 19:13:32,614986877,بلی، ایرانسل از همراه اول بدتره حتی,False,False,True,"{'hamrah_aval', 'irancell'}"
2,2022-07-19 19:09:10,941220112845164544,بسته سه ماهه همراه اول رو دو دوره یعنی شش ماه ...,False,False,True,{'hamrah_aval'}
3,2022-07-19 19:05:41,822277959176306688,همراه اول هم همینه,False,False,True,{'hamrah_aval'}
4,2022-07-19 19:00:44,1111363641545236480,بسته‌ی سه گیگ ماهانه‌ی اینترنت همراه اول می‌خر...,False,False,False,{'hamrah_aval'}
...,...,...,...,...,...,...,...
2261,2022-07-10 07:35:43,1474119738490040324,این چه سمی بود! من خیلی بچه بودم زنها تازه تاز...,False,False,True,{'rightel'}
2262,2022-07-09 20:08:21,1355434208022421504,کل روز اینترنت قطع بود حتی ملت ۹۱۱ هم نمی‌تونس...,True,False,False,{'rightel'}
2263,2022-07-09 19:29:11,967671943750184960,آره باید تو خود رایتل هم شارژش کنی تا وصل شه.,False,False,True,{'rightel'}
2264,2022-07-09 17:41:06,1280293767581499394,رایتل,False,False,True,{'rightel'}


In [None]:
# Write the merged frame to a local CSV.
# NOTE(review): index=None is falsy, so presumably the row index is omitted just as with index=False — confirm.
hir.to_csv("data/hir.csv", index=None)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# shutil is imported in this cell rather than in the imports cell; later cells reuse it.
import shutil
# Copy the merged CSV into the mounted Google Drive folder.
shutil.copy("data/hir.csv", "/content/gdrive/MyDrive/crawler/csv_files_20220719/hir.csv")

'/content/gdrive/MyDrive/crawler/csv_files_20220719/hir.csv'

## Sentiment Analysis Models

### Sentipers [-2, -1, 0, 1, 2] Model

In [None]:
# general config
# The predict() calls in this section pass max_len=128 as a literal rather than reading MAX_LEN.
MAX_LEN = 128
# NOTE(review): the batch-size, epoch, learning-rate and clip values below are not read by any
# cell visible in this notebook — presumably carried over from the training notebook; confirm.
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

EPOCHS = 10
EEVERY_EPOCH = 1000
LEARNING_RATE = 2e-5
CLIP = 0.0

# MODEL_NAME_OR_PATH is read by Model.__init__ and by the tokenizer/config cell.
MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = '/content/bert-fa-base-uncased-deepsentipers-sentiment-analysis/pytorch_model.bin'

# Create the parent directory of OUTPUT_PATH; no error if it already exists.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [None]:
# Sentiment scores, ordered so that a score's list position is its class id
# (later cells map a predicted id back with labels[pred]).
labels = [-2, -1, 0, 1, 2]

In [None]:
# Build the two lookup tables between labels and class ids (a label's id is its position in `labels`).

label2id = dict(zip(labels, range(len(labels))))
id2label = {class_id: label for label, class_id in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {-2: 0, -1: 1, 0: 2, 1: 3, 2: 4}
id2label: {0: -2, 1: -1, 2: 0, 3: 1, 4: 2}


In [None]:
# Load the tokenizer and the base configuration of the pretrained checkpoint,
# attaching this task's label <-> id tables to the configuration.

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH,
    label2id=label2id,
    id2label=id2label,
)

print(config.to_json_string())

Downloading:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440 [00:00<?, ?B/s]

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": -2,
    "1": -1,
    "2": 0,
    "3": 1,
    "4": 2
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "-2": 0,
    "-1": 1,
    "0": 2,
    "1": 3,
    "2": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



#### Load Model for Prediction

In [None]:
# Mount Google Drive at /content/gdrive; the fine-tuned checkpoint is read from there below.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# List the Drive folder that holds the fine-tuned sentiment checkpoint.
!ls /content/gdrive/MyDrive/finetuned_parsbert_sentipers_5

finetuned_parsbert_sentipers.pt  test.csv  train.csv  valid.csv


In [None]:
# Build the classifier and place it on the selected device in one step
# (nn.Module.to returns the module itself).
model = Model(config=config).to(device)

print('model', type(model))

Downloading:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model <class '__main__.Model'>


In [None]:
# Restore the fine-tuned sentiment weights from Google Drive.
# map_location remaps the stored tensors onto the current device, so a checkpoint
# saved from a GPU session also loads on a CPU-only runtime.
path = "/content/gdrive/MyDrive/finetuned_parsbert_sentipers_5/finetuned_parsbert_sentipers.pt"
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

Custom input test

Unfortunately, the Hazm library doesn't normalize informal text properly. We will test Hazm and our model on the tweet `همراه اول کامل قطعه` and its formal form `همراه اول کامل قطع است`.

In [None]:
# hazm is also imported in the imports cell at the top of the notebook.
import hazm

# Instantiate the Hazm text-processing helpers used in the next two cells.
informalNormalizer = hazm.InformalNormalizer()
normalizer = hazm.Normalizer()
stemmer = hazm.Stemmer()
lemmatizer = hazm.Lemmatizer()

In [None]:
# Run Hazm's InformalNormalizer on the informal tweet and display what it returns.
text = 'همراه اول کامل قطعه'
text = informalNormalizer.normalize(text)
text

[[['همراه'], ['اول'], ['کامل'], ['قطعه']]]

In [None]:
# Run the same tweet through Hazm's Normalizer, Stemmer and Lemmatizer in turn.
text = 'همراه اول کامل قطعه'
text = normalizer.normalize(text)
# NOTE(review): stem() and lemmatize() receive the whole sentence string here, not individual
# tokens — presumably they are meant to be applied per word; confirm against the hazm docs.
text = stemmer.stem(text)
text = lemmatizer.lemmatize(text)
text

'همراه اول کامل قطعه'

In [None]:
# Classify the Hazm-processed `text` from the previous cell.
xtmp_test = [text]
test_comments = np.array(xtmp_test)
preds, probs = predict(model, test_comments, tokenizer, max_len=128)

# preds holds class ids; `labels` maps an id back to its sentiment score.
labels[preds[0]]

  0%|          | 0/1 [00:00<?, ?it/s]

2

In [None]:
# Classify the hand-written formal version of the same tweet for comparison.
xtmp_test = ['همراه اول کامل قطع است']
test_comments = np.array(xtmp_test)
preds, probs = predict(model, test_comments, tokenizer, max_len=128)

# preds holds class ids; `labels` maps an id back to its sentiment score.
labels[preds[0]]

  0%|          | 0/1 [00:00<?, ?it/s]

-1

using on dataset

In [None]:
# Run the sentiment model over the "text" column of the merged frame.
xtmp_test = list(hir["text"])
test_comments = np.array(xtmp_test)
preds, probs = predict(model, test_comments, tokenizer, max_len=128)

  0%|          | 0/71 [00:00<?, ?it/s]

#### Save processed dataset to drive

In [None]:
# Translate each predicted class id into its sentiment score and store the scores
# in the "sentiment_scores" column of the merged frame.
scores = [labels[pred] for pred in preds]
hir["sentiment_scores"] = scores
hir

Unnamed: 0,created_time,user_id,text,is_retweet,is_quote,is_reply,tags,sentiment_scores
0,2022-07-19 19:18:27,1355911468902981639,هیچ کس تنها نیست … … همراه اول,False,False,True,{'hamrah_aval'},1
1,2022-07-19 19:13:32,614986877,بلی، ایرانسل از همراه اول بدتره حتی,False,False,True,"{'hamrah_aval', 'irancell'}",-1
2,2022-07-19 19:09:10,941220112845164544,بسته سه ماهه همراه اول رو دو دوره یعنی شش ماه ...,False,False,True,{'hamrah_aval'},0
3,2022-07-19 19:05:41,822277959176306688,همراه اول هم همینه,False,False,True,{'hamrah_aval'},1
4,2022-07-19 19:00:44,1111363641545236480,بسته‌ی سه گیگ ماهانه‌ی اینترنت همراه اول می‌خر...,False,False,False,{'hamrah_aval'},-1
...,...,...,...,...,...,...,...,...
2261,2022-07-10 07:35:43,1474119738490040324,این چه سمی بود! من خیلی بچه بودم زنها تازه تاز...,False,False,True,{'rightel'},-1
2262,2022-07-09 20:08:21,1355434208022421504,کل روز اینترنت قطع بود حتی ملت ۹۱۱ هم نمی‌تونس...,True,False,False,{'rightel'},-1
2263,2022-07-09 19:29:11,967671943750184960,آره باید تو خود رایتل هم شارژش کنی تا وصل شه.,False,False,True,{'rightel'},0
2264,2022-07-09 17:41:06,1280293767581499394,رایتل,False,False,True,{'rightel'},1


In [None]:
# Write the frame, now including "sentiment_scores", to a local CSV.
# NOTE(review): index=None is falsy, so presumably the row index is omitted just as with index=False — confirm.
hir.to_csv("data/sentiHir.csv", index=None)

In [None]:
# Copy the sentiment-labelled CSV into the mounted Google Drive folder
# (relies on `shutil` having been imported by an earlier cell).
shutil.copy("data/sentiHir.csv", "/content/gdrive/MyDrive/crawler/csv_files_20220719/sentiHir.csv")

'/content/gdrive/MyDrive/crawler/csv_files_20220719/sentiHir.csv'

## Fake detector models

### Load sentiHir.csv

In [None]:
!gdown https://drive.google.com/u/0/uc?id=1-0ruqjg5XgLeholNVW1UDGN4wCGZLvMB&export=download

!mv "sentiHir.csv" "data/sentiHir.csv"

In [None]:
# Load the sentiment-labelled tweets from the local CSV.
sentiHir = pd.read_csv("data/sentiHir.csv", encoding="utf-8")

### Fake Reviews Dataset by Joni Salminen [CG, OR] Model

In [None]:
# general config
# The predict() calls in this section pass max_len=128 as a literal rather than reading MAX_LEN.
MAX_LEN = 128
# NOTE(review): the batch-size, epoch, learning-rate and clip values below are not read by any
# cell visible in this notebook — presumably carried over from the training notebook; confirm.
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

EPOCHS = 10
EEVERY_EPOCH = 1000
LEARNING_RATE = 2e-5
CLIP = 0.0

# MODEL_NAME_OR_PATH is read by Model.__init__ and by the tokenizer/config cell.
MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = '/content/bert-fa-base-uncased-cg-or-fake-review-detection/pytorch_model.bin'

# Create the parent directory of OUTPUT_PATH; no error if it already exists.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [None]:
# Class names of the fake-review model; a name's list position is its class id.
# NOTE(review): presumably CG = computer-generated and OR = original review — confirm against the dataset.
labels = ["CG", "OR"]

In [None]:
# Build the two lookup tables between labels and class ids (a label's id is its position in `labels`).

label2id = dict(zip(labels, range(len(labels))))
id2label = {class_id: label for label, class_id in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'CG': 0, 'OR': 1}
id2label: {0: 'CG', 1: 'OR'}


In [None]:
# Load the tokenizer and the base configuration of the pretrained checkpoint,
# attaching this task's label <-> id tables to the configuration.

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH,
    label2id=label2id,
    id2label=id2label,
)

print(config.to_json_string())

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "CG",
    "1": "OR"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "CG": 0,
    "OR": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



#### Load Model for Prediction

In [None]:
# Mount Google Drive at /content/gdrive; the fine-tuned checkpoint is read from there below.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# List the Drive folder that holds the fine-tuned CG/OR checkpoint.
!ls /content/gdrive/MyDrive/finetuned_parsbert_fake_reveiws_dataset/

finetuned_parsbert_fake_reveiws_dataset.pt  test.csv  train.csv  valid.csv


In [None]:
# Build the classifier and place it on the selected device in one step
# (nn.Module.to returns the module itself).
model = Model(config=config).to(device)

print('model', type(model))

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model <class '__main__.Model'>


In [None]:
# Restore the fine-tuned CG/OR weights from Google Drive.
# map_location remaps the stored tensors onto the current device, so a checkpoint
# saved from a GPU session also loads on a CPU-only runtime.
path = "/content/gdrive/MyDrive/finetuned_parsbert_fake_reveiws_dataset/finetuned_parsbert_fake_reveiws_dataset.pt"
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

Custom input test

In [None]:
# Classify a single hand-written comment as a quick check of the CG/OR model.
xtmp_test = ['خوب نبود']
test_comments = np.array(xtmp_test)
preds, probs = predict(model, test_comments, tokenizer, max_len=128)

# preds holds class ids; `labels` maps an id back to its class name.
labels[preds[0]]

  0%|          | 0/1 [00:00<?, ?it/s]

'OR'

using on dataset

In [None]:
# Run the CG/OR model over the "text" column of the sentiment-labelled frame.
xtmp_test = list(sentiHir["text"])
test_comments = np.array(xtmp_test)
preds, probs = predict(model, test_comments, tokenizer, max_len=128)

  0%|          | 0/71 [00:00<?, ?it/s]

#### Save processed dataset to drive

In [None]:
# Store the predicted class ids in the "is_real" column (per label2id: 0 = 'CG', 1 = 'OR').
sentiHir["is_real"] = preds
sentiHir

Unnamed: 0,created_time,user_id,text,is_retweet,is_quote,is_reply,tags,sentiment_scores,is_real
0,2022-07-19 19:18:27,1355911468902981639,هیچ کس تنها نیست … … همراه اول,False,False,True,{'hamrah_aval'},1,1
1,2022-07-19 19:13:32,614986877,بلی، ایرانسل از همراه اول بدتره حتی,False,False,True,"{'hamrah_aval', 'irancell'}",-1,1
2,2022-07-19 19:09:10,941220112845164544,بسته سه ماهه همراه اول رو دو دوره یعنی شش ماه ...,False,False,True,{'hamrah_aval'},0,1
3,2022-07-19 19:05:41,822277959176306688,همراه اول هم همینه,False,False,True,{'hamrah_aval'},1,1
4,2022-07-19 19:00:44,1111363641545236480,بسته‌ی سه گیگ ماهانه‌ی اینترنت همراه اول می‌خر...,False,False,False,{'hamrah_aval'},-1,1
...,...,...,...,...,...,...,...,...,...
2261,2022-07-10 07:35:43,1474119738490040324,این چه سمی بود! من خیلی بچه بودم زنها تازه تاز...,False,False,True,{'rightel'},-1,1
2262,2022-07-09 20:08:21,1355434208022421504,کل روز اینترنت قطع بود حتی ملت ۹۱۱ هم نمی‌تونس...,True,False,False,{'rightel'},-1,1
2263,2022-07-09 19:29:11,967671943750184960,آره باید تو خود رایتل هم شارژش کنی تا وصل شه.,False,False,True,{'rightel'},0,1
2264,2022-07-09 17:41:06,1280293767581499394,رایتل,False,False,True,{'rightel'},1,1


In [None]:
# Write the frame, now including "is_real", to a local CSV.
# NOTE(review): index=None is falsy, so presumably the row index is omitted just as with index=False — confirm.
sentiHir.to_csv("data/fake_sentiHir.csv", index=None)

In [None]:
# Copy the CG/OR-labelled CSV into the mounted Google Drive folder
# (relies on `shutil` having been imported by an earlier cell).
shutil.copy("data/fake_sentiHir.csv", "/content/gdrive/MyDrive/crawler/csv_files_20220719/fake_sentiHir.csv")

'/content/gdrive/MyDrive/crawler/csv_files_20220719/fake_sentiHir.csv'

### Digikala [verified, rejected] Model

In [None]:
# general config
# The predict() calls in this section pass max_len=512 as a literal rather than reading MAX_LEN.
MAX_LEN = 512
# NOTE(review): the batch-size, epoch, learning-rate and clip values below are not read by any
# cell visible in this notebook — presumably carried over from the training notebook; confirm.
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

EPOCHS = 10
EEVERY_EPOCH = 1000
LEARNING_RATE = 2e-5
CLIP = 0.0

# MODEL_NAME_OR_PATH is read by Model.__init__ and by the tokenizer/config cell.
# This section switches to a different pretrained checkpoint than the two sections above.
MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-zwnj-base'
OUTPUT_PATH = '/content/bert-fa-zwnj-base-digikala-fake-review-detection/pytorch_model.bin'

# Create the parent directory of OUTPUT_PATH; no error if it already exists.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [None]:
# Class labels of the Digikala model; here each label equals its class id.
# NOTE(review): the section heading names the classes [verified, rejected]; which id is which is not shown here — confirm.
labels = [0, 1]

In [None]:
# Build the two lookup tables between labels and class ids (a label's id is its position in `labels`).

label2id = dict(zip(labels, range(len(labels))))
id2label = {class_id: label for label, class_id in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {0: 0, 1: 1}
id2label: {0: 0, 1: 1}


In [None]:
# Load the tokenizer and the base configuration of the pretrained checkpoint,
# attaching this task's label <-> id tables to the configuration.

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH,
    label2id=label2id,
    id2label=id2label,
)

print(config.to_json_string())

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/292 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/565 [00:00<?, ?B/s]

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": 0,
    "1": 1
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "0": 0,
    "1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 42000
}



#### Load Model for Prediction

In [None]:
# Mount Google Drive at /content/gdrive; the fine-tuned checkpoint is read from there below.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# List the Drive folder that holds the fine-tuned Digikala checkpoint.
!ls /content/gdrive/MyDrive/finetuned_parsbert_fake_reveiws_digikala_2Xverifeid/

finetuned_parsbert_fake_reveiws_digikala_2Xverifeid.bin  train.csv
test.csv						 valid.csv


In [None]:
# Build the classifier and place it on the selected device in one step
# (nn.Module.to returns the module itself).
model = Model(config=config).to(device)

print('model', type(model))

In [None]:
# Restore the fine-tuned Digikala weights from Google Drive.
# map_location remaps the stored tensors onto the current device, so a checkpoint
# saved from a GPU session also loads on a CPU-only runtime.
path = "/content/gdrive/MyDrive/finetuned_parsbert_fake_reveiws_digikala_2Xverifeid/finetuned_parsbert_fake_reveiws_digikala_2Xverifeid.bin"
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

Custom input test

In [None]:
# Classify a single hand-written comment as a quick check of the Digikala model.
xtmp_test = ['خوب نبود']
test_comments = np.array(xtmp_test)
preds, probs = predict(model, test_comments, tokenizer, max_len=512)

# preds holds class ids; `labels` maps an id back to its label.
labels[preds[0]]

  0%|          | 0/1 [00:00<?, ?it/s]

1

using on dataset

In [None]:
# Run the Digikala model over the "text" column of the sentiment-labelled frame.
xtmp_test = list(sentiHir["text"])
test_comments = np.array(xtmp_test)
preds, probs = predict(model, test_comments, tokenizer, max_len=512)

  0%|          | 0/54 [00:00<?, ?it/s]

#### Save processed dataset to drive

In [None]:
# Store the Digikala model's predicted class ids in "is_real", replacing any values
# already present in that column.
# NOTE(review): the section heading lists the classes as [verified, rejected]; confirm which
# id means "real" before interpreting this column.
sentiHir["is_real"] = preds
sentiHir

Unnamed: 0,created_time,user_id,text,is_retweet,is_quote,is_reply,tags,sentiment_scores,is_real
0,2022-09-16 13:26:59,1275726195745796097,حضور دارند. در صورت بروز هر گونه مشکل در خصوص ...,False,False,True,{'hamrah_aval'},0,0
1,2022-09-16 13:26:49,1275726195745796097,مشترک گرامی، سلام ضمن عرض پوزش از شما برای مشک...,False,False,True,{'hamrah_aval'},0,0
2,2022-09-16 13:26:24,51343107,برنامه شب‌های #آسمان_شب_برای_همه در کنار مردم ...,False,False,True,{'hamrah_aval'},1,0
3,2022-09-16 13:02:34,1251217170681323520,همراه اول به زین هم وصل می‌شد,False,False,True,{'hamrah_aval'},0,1
4,2022-09-16 12:31:10,1042096474656591872,نداریم نت نداریم ن همراه اول ن ایرانسل,False,False,True,"{'hamrah_aval', 'irancell'}",0,0
...,...,...,...,...,...,...,...,...,...
1704,2022-09-07 17:16:30,885589212136972290,ریدن! رایتل گویا بهتره که ندارم,False,False,True,{'rightel'},1,0
1705,2022-09-07 15:29:16,1429803427631095810,اطلس با همه خط‌ها خوبه برای خط همراه اول Vpnif...,False,False,True,{'rightel'},1,1
1706,2022-09-07 15:23:49,871961013805076480,برای رایتل هم باید معرفی‌نامه از شرکت می‌بردیم...,True,False,False,{'rightel'},0,0
1707,2022-09-07 13:52:26,1459908116779343872,کلا بندر نت ایرانسل موجیه. اگه مودم دارید، سیم...,False,False,True,"{'hamrah_aval', 'irancell', 'rightel'}",-1,0


In [None]:
# Write the frame to a local CSV. This is the same local path the CG/OR section writes to,
# so a file left there by that section is overwritten.
sentiHir.to_csv("data/fake_sentiHir.csv", index=None)

In [None]:
# Copy the Digikala-labelled CSV to Google Drive under its own file name.
# NOTE(review): the target folder is "csv_files", not "csv_files_20220719" as in the other copies — confirm this is intended.
shutil.copy("data/fake_sentiHir.csv", "/content/gdrive/MyDrive/crawler/csv_files/fake_sentiHir_digikala_fake_detection.csv")

'/content/gdrive/MyDrive/crawler/csv_files/fake_sentiHir_digikala_fake_detection.csv'

## Review

In [None]:
# Show the rows whose sentiment score is 2, the highest value in the sentiment label list.
sentiHir[sentiHir["sentiment_scores"]==2]

Unnamed: 0,created_time,user_id,text,is_retweet,is_quote,is_reply,tags,sentiment_scores,is_real
136,2022-07-18 10:23:08,1430389746681925634,هیچکس تنها نیست هلیا، همراه اول,False,False,True,{'hamrah_aval'},2,1
207,2022-07-17 16:20:27,1525550371879280641,دا هیچکس تنها نیست، همراه اول,False,False,True,{'hamrah_aval'},2,1
240,2022-07-17 11:24:01,1472674346590248961,همراه اول خیلی ممنون,False,False,True,{'hamrah_aval'},2,1
251,2022-07-17 07:23:08,1283324161574817793,عید #غدیر عالی امسال با کار خوب همراه اول #کهک...,False,False,False,{'hamrah_aval'},2,1
339,2022-07-16 02:54:26,1544977381076701188,منم سه روز پیش به اسم هدیه اینترنت همراه اول ب...,False,False,True,{'hamrah_aval'},2,1
341,2022-07-15 22:58:38,1394744720388153348,سرعت اینترنت همراه اول. حرفی ندارم,False,False,False,{'hamrah_aval'},2,1
522,2022-07-13 20:15:04,1386609761773293573,من الان ۶ماهی هست رایتلم و جدا نسبت به همراه ا...,False,False,True,"{'hamrah_aval', 'rightel', 'irancell'}",2,1
804,2022-07-12 12:52:08,39644707,خدا رو شکر دمشون گرم واقعا برای بچه‌ها داستان ...,False,False,True,{'hamrah_aval'},2,1
1064,2022-07-10 08:21:28,1544381852907175937,من همراه اول دارم خیلی خوبه,False,False,True,{'hamrah_aval'},2,1
1293,2022-07-17 16:12:02,1279040029382377478,وای چه عالی چراغ اول روشن شد ایرانسل لطفا,False,False,True,{'irancell'},2,1


In [None]:
# Show the rows whose "is_real" value is class id 0.
sentiHir[sentiHir["is_real"]==0]

Unnamed: 0,created_time,user_id,text,is_retweet,is_quote,is_reply,tags,sentiment_scores,is_real
33,2022-07-19 11:25:56,1259601174346547200,۲۰ گیگ شصت هزار تومن، ۳۰ گیگ نود هزار تومن. هم...,False,False,False,{'hamrah_aval'},0,0
50,2022-07-19 06:10:56,477566968,I'm at Avval Hamrahe Tower | برج همراه اول in ...,False,False,False,{'hamrah_aval'},0,0
86,2022-07-18 19:40:41,1333657131451551744,نفری یکی بزنید تا بقیه هم بتونن بزنن کد‌های شا...,False,True,False,"{'hamrah_aval', 'irancell'}",0,0
89,2022-07-18 19:00:37,808229128512081920,شارژ بفرستم؟ همراه اول یا ایرانسل,False,False,True,"{'hamrah_aval', 'irancell'}",0,0
121,2022-07-18 12:11:26,1354449851002445824,ھمراہ اول (:,False,False,True,{'hamrah_aval'},0,0
...,...,...,...,...,...,...,...,...,...
2198,2022-07-13 06:55:59,1457053462689161221,همچنین رایتل,False,False,True,{'rightel'},0,0
2217,2022-07-12 14:12:10,395275311,و رایتل,False,False,True,{'rightel'},0,0
2230,2022-07-11 22:49:47,930561831659335680,#رایتل و دست اندرکاران اون از صدر تا ذیل #تنظی...,False,False,False,{'rightel'},0,0
2237,2022-07-11 05:34:44,896599607798640644,فراخوان رایتل پیرامون شناسایی تأمین‌کنندگان فع...,False,False,False,{'rightel'},0,0
