In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from transformers import AutoModel,AutoTokenizer,AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dataclasses import dataclass

@dataclass
class config:
    # Hyperparameters
    BATCH_SIZE = 32
    MAX_LENGTH = 64
    NUM_EPOCHS = 30
    B1 = 0.9
    B2 = 0.999
    _LAMBDA = 0.01
    LEARNING_RATE = 2e-5
    LOGFILE = 'logfile.txt'
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # DEVICE = 'cpu'

In [3]:
import re
from dadmatools.models.normalizer import Normalizer
from dataclasses import dataclass
from tqdm import tqdm

class Preprocessing:
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")

    def __call__(self,text):
        # Step 1: Normalize
        text = self.normalize(text)
        # # Step 2: Remove English characters
        text = self.remove_english_chars(text)
        # # Step 3: Remove repeatable characters
        # text = self.remove_repeatable_chars(text)
        # # Step 4: Remove Arabic diacritics
        text = self.remove_arabic_diacritics(text)
        # # Step 5: Remove non-Persian characters
        text = self.remove_non_persian_chars(text)
        # # Step 6: Remove hashtags
        text = self.remove_hashtags(text)
        # # Step 7: Remove Persian numerals
        text = self.remove_persian_numerals(text)
        # text = self.remove_hash_symbol(self.lemmatize(text))
        
        
        return text
    
    # Step 1: Normalize
    def normalize(self,text):
        normalizer = Normalizer(
            full_cleaning=True,
        )
        return normalizer.normalize(text)

    def remove_hash_symbol(self,text):
        # Remove hash symbol (#)
        try:
            cleaned_text = text.replace('#', '')
        except:
            cleaned_text = text
        return cleaned_text


    # Step 2: remove any engish character
    def remove_english_chars(self,text):
        english_chars_pattern = re.compile(r'[a-zA-Z]')
        cleaned_text = re.sub(english_chars_pattern, '', text)
        return cleaned_text



    # Step 4: remove arabic diactrics
    def remove_arabic_diacritics(self,text):
        """
            Some common Arabic diacritical marks include:
                Fatha (ً): Represents the short vowel "a" or "u" when placed above a letter.
                Kasra (ٍ): Represents the short vowel "i" when placed below a letter.
                Damma (ٌ): Represents the short vowel "u" when placed above a letter.
                Sukun (ـْ): Indicates the absence of any vowel sound.
                Shadda (ّ): Represents consonant doubling or gemination.
                Tanween (ًٌٍ): Represents the nunation or the "n" sound at the end of a word.
        """

        """
            The regular expression [\u064B-\u065F] represents a character range that covers the Unicode code points for Arabic diacritics.
        """
        # مرحبا بكم <== "مَرْحَبًا بِكُمْ"
        arabic_diacritics_pattern = re.compile(r'[\u064B-\u065F]')
        cleaned_text = re.sub(arabic_diacritics_pattern, '', text)
        return cleaned_text

    # Step 5: remove any non-persian chars
    def remove_non_persian_chars(self,text):
        import emoji
        import regex

        def get_emojis(text):
            emojis = []
            for char in text:
                if regex.match(r'\p{So}', char):
                    emojis.append(char)
            return emojis

        # emojis = get_emojis(text)
        persian_chars_pattern = re.compile(r'[^\u0600-\u06FF\uFB8A\u067E\u0686\u06AF\u200C\u200F]+')
        cleaned_text = re.sub(persian_chars_pattern, ' ', text)
        # cleaned_text += ' '.join(emojis)
        return cleaned_text

    # Step 6: remove # sign from text while keeping the information included into hashtags
    def remove_hashtags(self,text):
        # Regular expression to match hashtags
        hashtag_pattern = r'#\w+'
        
        def extract_and_replace(match):
            # Extract the text from the matched hashtag and remove the '#' sign
            hashtag_text = match.group(0)[1:]
            return hashtag_text
        
        # Use the 're.sub' function with the 'extract_and_replace' function as the replacement
        cleaned_text = re.sub(hashtag_pattern, extract_and_replace, text)
        
        return cleaned_text

    # Step 7: remove persian numeric characters from text
    def remove_persian_numerals(self,text):
        # Define a translation table to map Persian numerals to None (remove them)
        persian_numerals = {
            ord('۰'): None,
            ord('۱'): None,
            ord('۲'): None,
            ord('۳'): None,
            ord('۴'): None,
            ord('۵'): None,
            ord('۶'): None,
            ord('۷'): None,
            ord('۸'): None,
            ord('۹'): None
        }
        # Use str.translate() to remove Persian numerals
        cleaned_text = text.translate(persian_numerals)
        return cleaned_text


In [4]:
train = open('preprocess_data/train.txt',encoding='utf-8').readlines()
test = open('preprocess_data/test.txt',encoding='utf-8').readlines()
X = []
y = []
p = Preprocessing()
label_mapping = {
    'ANGRY':'anger',
    'HATE':'disgust',
    'FEAR':'fear',
    'HAPPY':'joy',
    'SAD':'sad',
    'SURPRISE':'surprise',
    'OTHER':'other'
}
for item in train:
    _X,_y = item.split(",")
    X.append(_X.strip())
    y.append(label_mapping[_y.strip()])

for item in test:
    _X,_y = item.split(",")
    X.append(_X.strip())
    y.append(label_mapping[_y.strip()])
arman = pd.DataFrame({'tweet': X, 'emotion': y})
arman.head()

Unnamed: 0,tweet,emotion
0,خیلی کوچیک هستن و سایزشون بدرد نمیخوره میخوام ...,sad
1,از صدای پرنده دم دمای صبح متنفرم متنفرم متنفرم,disgust
2,کیفیتش خیلی خوبه با شک خریدم ولی واقعا راضیم ب...,sad
3,چون همش با دوربین ثبت‌شده ایا میشه اعتراض زد و...,other
4,این وضع ب طرز خنده داری گریه داره,sad


In [6]:
import math

label_mapping = {
    'anger': 'ANGRY',
    'disgust': 'HATE',
    'fear': 'FEAR',
    'joy': 'HAPPY',
    'sad': 'SAD',
    'surprise': 'SURPRISE'
}
anger = pd.read_csv('PersianTweets/anger.csv')
disgust = pd.read_csv('PersianTweets/disgust.csv')
fear = pd.read_csv('PersianTweets/fear.csv')
sad = pd.read_csv('PersianTweets/sad.csv')
joy = pd.read_csv('PersianTweets/joy.csv')
surprise = pd.read_csv('PersianTweets/surprise.csv')
dataset = pd.concat([anger,disgust,fear,sad,joy,surprise],axis=0)
dataset = dataset[['tweet','emotion']]
combined = pd.concat([dataset,arman])

In [8]:
tqdm.pandas()
combined['tweet'] = combined['tweet'].progress_apply(p)
combined.to_csv('full_preprocessed_dataset.csv')

  0%|          | 0/121137 [00:00<?, ?it/s]

100%|██████████| 121137/121137 [06:00<00:00, 336.06it/s]


In [9]:
def calculate_class_weights(df, target_column, labels_dict):
    emotions = df[target_column]
    class_indices = emotions.map(labels_dict)
    class_counts = class_indices.value_counts().sort_index()
    total_samples = len(class_indices)
    class_weights = torch.tensor(total_samples / (class_counts * len(class_counts)), dtype=torch.float)
    return class_weights

target_column = 'emotion'
labels_dict = {'anger': 0, 'disgust': 1, 'fear': 2, 'sad': 3, 'joy': 4, 'surprise': 5, 'other': 6}
class_weights = calculate_class_weights(combined, target_column, labels_dict)

In [10]:
combined['emotion'].value_counts()

sad         35493
joy         28925
anger       21149
fear        18440
surprise    13745
other        1879
disgust      1506
Name: emotion, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
# X , y = dataset['tweet'].tolist() , dataset['emotion'].tolist()
train , test = train_test_split(combined,test_size=0.1,stratify=combined['emotion'])
X_train , y_train = train['tweet'].tolist() , train['emotion'].tolist()
X_test , y_test = test['tweet'].tolist() , test['emotion'].tolist()

In [16]:
# import math
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
        self.max_length = config.MAX_LENGTH
        # self.labels_dict = {'SAD': 0, 'HAPPY': 1,'SURPRISE': 2, 'FEAR': 3, 'HATE': 4, 'ANGRY': 5,'OTHER': 6}
        self.labels_dict = {'anger':0,'disgust':1,'fear':2,'sad':3,'joy':4,'surprise':5,'other':6}

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        # self.tokenizer.add_special_tokens(text)
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        inputs = {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels_dict[label])
        }
        return inputs

In [17]:
class SequenceClassifierFactory:
    def __init__(self):
        pass

    @staticmethod
    def create_model(model_name, num_labels):
        model_name = model_name.lower()
        if model_name == 'parsbert':
            tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
            model = AutoModelForSequenceClassification.from_pretrained("HooshvareLab/bert-base-parsbert-uncased", num_labels=num_labels)
        elif model_name == 'xlm-roberta':
            tokenizer = AutoTokenizer.from_pretrained('checkpoints/models--xlm-roberta-large/snapshots/')
            model = AutoModelForSequenceClassification.from_pretrained('checkpoints/models--xlm-roberta-large/snapshots/', num_labels=num_labels)
        elif model_name == 'xlm-emo':
            tokenizer = AutoTokenizer.from_pretrained("m3hrdadfi/xlm-mlm-17-1280-emotion")
            model = AutoModelForSequenceClassification.from_pretrained("m3hrdadfi/xlm-mlm-17-1280-emotion", num_labels=num_labels)
        else:
            raise ValueError(f"Model '{model_name}' not supported.")
        
        return tokenizer, model

In [18]:

from transformers import AutoConfig , AutoTokenizer , AutoModel

def change_grad(module,_req_grad_=False):
    for param in module.parameters():
        param.requires_grad = _req_grad_
    
num_labels = 7
model_name = 'parsbert'
tokenizer , model = SequenceClassifierFactory().create_model(model_name,num_labels)


Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassifica

In [19]:
num_epochs = 5
train_dataset = TextDataset(X_train,y_train,max_length=100)
test_dataset = TextDataset(X_test,y_test,max_length=100)
train_dataloader = torch.utils.data.DataLoader(train_dataset,batch_size=64,shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset,batch_size=128,shuffle=False)

In [20]:
from sklearn.metrics import accuracy_score, f1_score

criterion = nn.CrossEntropyLoss(weight=class_weights.cuda())
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
model = model.cuda()
for _iter in range(num_epochs):
    train_loss = 0.0
    train_predictions = []
    train_labels = []
    model.train()
    for idx,batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].cuda()
        outputs = model(input_ids,attention_mask).logits
        loss = criterion(outputs,labels)

        # Backward pass and update model parameters
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        batch_predictions = torch.argmax(outputs, dim=1).cpu().tolist()
        train_predictions.extend(batch_predictions)
        train_labels.extend(labels.cpu().tolist())

        train_loss+=loss.item()
        if idx % 50 == 0:
            print("Epoch:",_iter,"batch:",idx,"loss:",loss.item())

        # Calculate accuracy and F-score
    train_accuracy = accuracy_score(train_labels, train_predictions)
    train_fscore = f1_score(train_labels, train_predictions, average='macro')

    print("Total Loss:", train_loss / len(train_dataloader))
    print("Train Accuracy:", train_accuracy)
    print("Train F-score:", train_fscore)


    test_predictions = []
    test_labels = []
    test_loss = 0.0
    model.eval()
    for idx, batch in enumerate(test_dataloader):
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].cuda()
        outputs = model(input_ids, attention_mask).logits
        loss = criterion(outputs, labels)

        batch_predictions = torch.argmax(outputs, dim=1).cpu().tolist()
        test_predictions.extend(batch_predictions)
        test_labels.extend(labels.cpu().tolist())
        test_loss += loss.item()
    test_accuracy = accuracy_score(test_labels, test_predictions)
    test_fscore = f1_score(test_labels, test_predictions, average='macro')
    print("Total Loss (Test):", test_loss / len(test_dataloader))
    print("Test Accuracy:", test_accuracy)
    print("Test F-score:", test_fscore)

    torch.save(model.state_dict(),'checkpoints/ParsBertOnTweets100k.pt')

Epoch: 0 batch: 0 loss: 1.8143632411956787
Epoch: 0 batch: 25 loss: 1.541748046875
Epoch: 0 batch: 50 loss: 1.3507308959960938
Epoch: 0 batch: 75 loss: 0.7798649668693542
Epoch: 0 batch: 100 loss: 0.5820590257644653
Epoch: 0 batch: 125 loss: 0.3716413080692291
Epoch: 0 batch: 150 loss: 0.35400620102882385
Epoch: 0 batch: 175 loss: 0.26377174258232117
Epoch: 0 batch: 200 loss: 0.39654573798179626
Epoch: 0 batch: 225 loss: 0.3441065549850464
Epoch: 0 batch: 250 loss: 0.3368087410926819
Epoch: 0 batch: 275 loss: 0.1494172215461731
Epoch: 0 batch: 300 loss: 0.21539512276649475
Epoch: 0 batch: 325 loss: 0.2687482535839081
Epoch: 0 batch: 350 loss: 0.21493662893772125
Epoch: 0 batch: 375 loss: 0.08349525928497314
Epoch: 0 batch: 400 loss: 0.17957624793052673
Epoch: 0 batch: 425 loss: 0.17172101140022278
Epoch: 0 batch: 450 loss: 0.0331953689455986
Epoch: 0 batch: 475 loss: 0.1509796530008316
Epoch: 0 batch: 500 loss: 0.16187503933906555
Epoch: 0 batch: 525 loss: 0.18249565362930298
Epoch: 0 

KeyboardInterrupt: 

In [None]:
# torch.cuda.empty_cache()
# trainer = Trainer(model=model,train_loader=train_dataloader,val_loader=test_dataloader)
# trainer.train(num_epochs=10)

KeyError: 77333

In [None]:
def predict(texts,local_ids, model, device, threshold=0.5):
    model.eval()  # Set the model to evaluation mode
    inputs = []  # To store the input tensors
    # anger, sadness, surprise, happiness, fear, disgust, other
    label_dict = {0: 'sadness', 1: 'happiness', 2: 'surprise', 3: 'fear', 4: 'disgust', 5: 'anger', 6: 'other'}
    num_classes = len(label_dict)
    # texts = texts['tweet'].tolist()
    # local_id = texts['local_id'].tolist()
    
    # Tokenize and preprocess the texts
    tokenizer = AutoTokenizer.from_pretrained('HooshvareLab/bert-base-parsbert-uncased')
    max_length = 64
    for text in tqdm(texts):
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze().to(device)
        attention_mask = encoding['attention_mask'].squeeze().to(device)
        inputs.append({'input_ids': input_ids, 'attention_mask': attention_mask})

    # Make predictions
    with torch.no_grad():
        predictions = []
        prob_matrix = []  # To store the probability distribution for each text
        for input_data in tqdm(inputs):
            input_ids = input_data['input_ids'].unsqueeze(0)  # Add batch dimension
            attention_mask = input_data['attention_mask'].unsqueeze(0)  # Add batch dimension

            outputs = model(input_ids, attention_mask).logits
            predicted_probs = torch.softmax(outputs, dim=1).squeeze().cpu().numpy()
            prob_matrix.append(predicted_probs)
            
            # Apply threshold to determine emotions present in text
            predicted_labels = [label_dict[idx] for idx, prob in enumerate(predicted_probs) if prob > threshold]
            predictions.append(label_dict[torch.argmax(outputs, dim=1).item()])

    csv_data = []
    for local_id, text, primary_emotion, probs in zip(local_ids, texts, predictions, prob_matrix):
        emotion_probs = [1 if prob > threshold else 0 for prob in probs]
        row = [local_id, text, primary_emotion] + emotion_probs
        csv_data.append(row)

    # Create a DataFrame from the CSV data
    columns = ["local_id", "tweet", "primary_emotion", "anger", "sadness", "fear", "happiness", "disgust", "surprise", "other"]
    df = pd.DataFrame(csv_data, columns=columns)
    df.to_csv('checkpoints/predictions_2.csv', index=False)

    return predictions


texts = pd.read_csv('final/data_emotion_without_label.csv',encoding='utf-8')['tweet'].tolist()
local_id = pd.read_csv('final/data_emotion_without_label.csv',encoding='utf-8')['local_id'].tolist()

x = predict(texts=texts,local_ids=local_id,model=model,device='cuda',threshold=0.35)

100%|██████████| 500/500 [00:00<00:00, 1418.10it/s]
100%|██████████| 500/500 [00:05<00:00, 84.42it/s]


In [None]:
"""
Oversampling and Undersampling: Oversampling and undersampling techniques can be used with text data, but there are some challenges. In oversampling, simply duplicating text samples might not be effective, as it can lead to overfitting. Techniques like SMOTE or generating synthetic examples through text augmentation (e.g., paraphrasing) can be more appropriate. For undersampling, randomly removing text samples may result in loss of valuable information. Undersampling methods that consider text properties, like Tomek links, might be more suitable.

Synthetic Data Generation: Generating synthetic examples for text data can be more complex than in tabular data. Techniques like SMOTE might need adaptations to consider the sequential nature of text. Additionally, methods like Word2Vec or GPT-based language models can be used for generating semantically similar but contextually different text samples.

Data Augmentation: Data augmentation methods for text involve introducing variations in text content while preserving meaning. Techniques like synonym replacement, word swapping, and paraphrasing can be used to create augmented versions of the minority class text samples.

Class-Weighted Loss: Class-weighted loss can be applied to text data as well. It considers the importance of each class in the loss calculation during training.

Imbalanced-learn Library: If you're using Python, the imbalanced-learn library offers resampling techniques tailored for imbalanced datasets. While some techniques might need adjustments for text data, the library can still provide a good starting point.

"""

"""
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
import nlpaug.augmenter.word as naw

# Load a text dataset (e.g., 20 Newsgroups)
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

X, y = newsgroups.data, newsgroups.target

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)

# Apply oversampling using SMOTE
oversampler = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_tfidf, y_train)

# Apply data augmentation using nlpaug
aug = naw.SynonymAug(aug_src='wordnet', aug_max=10)
X_train_augmented = [aug.augment(text) for text in X_train]

# Print sample sizes before and after resampling/augmentation
print("Original class distribution:", {class_label: sum(y_train == class_label) for class_label in set(y_train)})
print("After SMOTE resampling:", {class_label: sum(y_train_resampled == class_label) for class_label in set(y_train_resampled)})
print("After data augmentation:", {class_label: sum(y_train == class_label) for class_label in set(y_train)})

# Now you can use X_train_resampled and y_train_resampled for training


"""

'\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom imblearn.over_sampling import SMOTE\nimport nlpaug.augmenter.word as naw\n\n# Load a text dataset (e.g., 20 Newsgroups)\nnewsgroups = fetch_20newsgroups(subset=\'all\', remove=(\'headers\', \'footers\', \'quotes\'))\n\nX, y = newsgroups.data, newsgroups.target\n\n# Split the dataset into training and test sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\n# Apply TF-IDF vectorization\nvectorizer = TfidfVectorizer(max_features=10000)\nX_train_tfidf = vectorizer.fit_transform(X_train)\n\n# Apply oversampling using SMOTE\noversampler = SMOTE(random_state=42)\nX_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_tfidf, y_train)\n\n# Apply data augmentation using nlpaug\naug = naw.SynonymAug(aug_src=\'wordnet\', aug_max=10)\nX_train_augmented = [aug.augmen

In [None]:
import re
from typing import Any
import hazm
from dadmatools.models.normalizer import Normalizer
from dataclasses import dataclass
from hazm import Lemmatizer
class Preprocessing:
    def __init__(self):
        pass

    def __call__(self,text):
        # Step 1: Normalize
        text = self.normalize(text)
        text = self.remove_english_chars(text)
        # text = self.remove_repeatable_chars(text)
        text = self.remove_arabic_diacritics(text)
        text = self.remove_non_persian_chars(text)

        text = self.remove_persian_numerals(text)
        text = self.remove_hashtags(text)
        return text
    
    # Step 1: Normalize
    def normalize(self,text):
        normalizer = Normalizer(
            full_cleaning=True,
        )
        return normalizer.normalize(text)

    # Step 2: remove any engish character
    def remove_english_chars(self,text):
        english_chars_pattern = re.compile(r'[a-zA-Z]')
        cleaned_text = re.sub(english_chars_pattern, '', text)
        return cleaned_text

    # Step 3: remove repeatable characters
    def remove_repeatable_chars(self,text):
        return hazm.Normalizer().normalize(text)

    # Step 4: remove arabic diactrics
    def remove_arabic_diacritics(self,text):
        """
            Some common Arabic diacritical marks include:
                Fatha (ً): Represents the short vowel "a" or "u" when placed above a letter.
                Kasra (ٍ): Represents the short vowel "i" when placed below a letter.
                Damma (ٌ): Represents the short vowel "u" when placed above a letter.
                Sukun (ـْ): Indicates the absence of any vowel sound.
                Shadda (ّ): Represents consonant doubling or gemination.
                Tanween (ًٌٍ): Represents the nunation or the "n" sound at the end of a word.
        """

        """
            The regular expression [\u064B-\u065F] represents a character range that covers the Unicode code points for Arabic diacritics.
        """
        # مرحبا بكم <== "مَرْحَبًا بِكُمْ"
        arabic_diacritics_pattern = re.compile(r'[\u064B-\u065F]')
        cleaned_text = re.sub(arabic_diacritics_pattern, '', text)
        return cleaned_text

    # Step 5: remove any non-persian chars
    def remove_non_persian_chars(self,text):
        persian_chars_pattern = re.compile(r'[^\u0600-\u06FF\uFB8A\u067E\u0686\u06AF\u200C\u200F]+')
        cleaned_text = re.sub(persian_chars_pattern, ' ', text)
        return cleaned_text

    # Step 6: remove # sign from text while keeping the information included into hashtags
    def remove_hashtags(self,text):
        # Regular expression to match hashtags
        hashtag_pattern = r'#\w+'
        
        def extract_and_replace(match):
            # Extract the text from the matched hashtag and remove the '#' sign
            hashtag_text = match.group(0)[1:]
            return hashtag_text
        
        # Use the 're.sub' function with the 'extract_and_replace' function as the replacement
        cleaned_text = re.sub(hashtag_pattern, extract_and_replace, text)
        
        return cleaned_text
    

    # Step 7: remove persian numeric characters from text
    def remove_persian_numerals(self,text):
        # Define a translation table to map Persian numerals to None (remove them)
        persian_numerals = {
            ord('۰'): None,
            ord('۱'): None,
            ord('۲'): None,
            ord('۳'): None,
            ord('۴'): None,
            ord('۵'): None,
            ord('۶'): None,
            ord('۷'): None,
            ord('۸'): None,
            ord('۹'): None
        }
        # Use str.translate() to remove Persian numerals
        cleaned_text = text.translate(persian_numerals)
        return cleaned_text


In [None]:
import pandas as pd
from tqdm import tqdm
train_file = pd.read_csv('train_emoPars.csv',encoding='utf-8')
test_file = pd.read_csv('test_emoPars.csv',encoding='utf-8')
p = Preprocessing()
tqdm.pandas()
train_file['text'] = train_file['text'].progress_apply(p)
test_file['text'] = test_file['text'].progress_apply(p)

FileNotFoundError: [Errno 2] No such file or directory: 'train_emoPars.csv'

In [None]:
train_file.to_csv('train_cleaned_emopars.csv')
test_file.to_csv('test_cleaned_emopars.csv')

In [None]:
train_file.to_csv('test_cleaned_emopars.csv')

In [None]:
train_file['texts']

0       من خیلی خودسانسوری می‌کنم تو اینستا هر چی فالو...
1               بعد اتمام جلسه مجلس روند بازار برگشت بورس
2       کاربران توییتر در جریان طوفان توییتری اعتراض ب...
3       وحشی شدن معده بعد از رسیدن به ایران اجتناب ناپ...
4       سحام نیوز بیانیه مشترک عربستان و امارات با پرو...
                              ...                        
2995                    بعد از کرونا یه صفایی بکنیم با هم
2996                              قبلش یه پیتزا بده حداقل
2997    از رفقا می‌خواهم که از بین اپوزیسیون حزب یا سا...
2998    از هر دریچه‌ای؛ چه موازنه قوا با امریکا چه تام...
2999          چند نفر تا حتما هستن میخوام آمار بگیرم بگین
Name: texts, Length: 3000, dtype: object

In [None]:
import pandas as pd
from tqdm import tqdm
y = pd.read_csv('datasets/data_emotion_without_label.csv')
p = Preprocessing()
tqdm.pandas()
y['tweet'] = y['tweet'].progress_apply(p)

  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 500/500 [00:02<00:00, 219.87it/s]


In [None]:
y.to_csv('data_emotion_without_label.csv')

In [None]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [None]:
combined = pd.concat([train,test],axis=0)
p = Preprocessing()
tqdm.pandas()
combined['text'] = combined['text'].progress_apply(p)
combined.to_csv('final/arman.csv')

100%|██████████| 7308/7308 [00:18<00:00, 386.83it/s]


In [None]:
x = pd.read_csv('checkpoints/predictions_1.csv')
y = pd.read_csv('res.csv')
y['primary_emotion'] = x['primary_emotion']
y.to_csv('final.csv')

In [None]:
anger = pd.read_csv('PersianTweets/anger.csv')
disgust = pd.read_csv('PersianTweets/disgust.csv')
fear = pd.read_csv('PersianTweets/fear.csv')
joy = pd.read_csv('PersianTweets/joy.csv')
sad = pd.read_csv('PersianTweets/sad.csv')
surprise = pd.read_csv('PersianTweets/surprise.csv')

In [None]:
len(anger) , len(disgust) , len(fear) , len(joy) , len(sad) , len(surprise)

(20069, 925, 17624, 28024, 34328, 12859)

In [None]:
all_tweets = pd.concat([anger,disgust,fear,joy,sad,surprise],axis=0)
all_tweets.to_csv('PersianTweets/allDataset.csv')

In [None]:
pd.read_csv('PersianTweets/allDataset.csv')

Unnamed: 0.1,Unnamed: 0,tweet,replyCount,retweetCount,likeCount,quoteCount,hashtags,sourceLabel,emotion
0,0,دیشب خواب دیدم بمبی چیزی زدن نورش خیلی خیره کن...,0,3,2,0,['No2IR'],Twitter Web App,anger
1,1,تبر زدی بر ریشه‌اَم، جوانه رویید جایِ زخم\nران...,0,0,8,0,['سين_کاف'],Twitter for Android,anger
2,2,پدر سوخته ای که بابام بهم میگه دو معنی داره که...,1,0,11,0,['پدر_ایرانی'],Twitter for Android,anger
3,3,با خود مواجه شوید و اخم نکنید. اقتدار در نگاه ...,0,0,1,0,"['جذبه', 'اخم']",Twitter for iPhone,anger
4,4,مدح تو را در شادی و در غم نوشتند\nبا این همه ا...,4,6,36,0,['نبأ_عظیم'],Twitter Web App,anger
...,...,...,...,...,...,...,...,...,...
113824,12854,متعجبم چرا لیبرالها و غرب گداها و فتنه‌گران سا...,0,0,2,0,['دلار_اصلاح_طلبان'],Twitter for Android,surprise
113825,12855,چیزی که متعجبم میکنه اینکه از حق خودمختاری مرد...,1,0,0,0,['ترکمنچای_چینی'],Twitter for iPhone,surprise
113826,12856,متعجبم از اون دسته عزیزانی که هنوز هشتگ #رای_ب...,0,0,2,0,"['رای_بی_رای', 'نه_به_جمهوی_اسلامی']",Twitter for Android,surprise
113827,12857,#ظریف دهان همه منتقدانش را بست؟؟؟؟!!!!..... حق...,0,1,48,0,['ظریف'],Twitter for Android,surprise
