### Import Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import seaborn as sns

In [3]:
# Lift pandas' display truncation limits so that wide frames and long
# comment texts are rendered in full.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)

## Read dataset

In [5]:
# Load the sentiment-labeled comment workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# will not run on another machine as-is — consider a configurable data directory.
df = pd.read_excel('/home/mahdi/NLP/Text_Preprocessing/dataset/Comment_Full_Data_Sentiment_Labeled.xlsx')

In [6]:
# Summarize the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1131 entries, 0 to 1130
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   comment_id              1131 non-null   int64  
 1   app_id                  1131 non-null   int64  
 2   user_name               1131 non-null   object 
 3   comment_text            1131 non-null   object 
 4   Annotator1              1130 non-null   float64
 5   Annotator2              1129 non-null   float64
 6   Annotator3              1131 non-null   int64  
 7   comment_rating          1131 non-null   int64  
 8   comment_date            1131 non-null   int64  
 9   sentiment_result        1131 non-null   object 
 10  second_model_processed  1131 non-null   bool   
 11  comment_idd             1131 non-null   int64  
 12  sentiment_score         1131 non-null   int64  
 13  comment_date_jalali     1131 non-null   int64  
dtypes: bool(1), float64(2), int64(8), object(3)

In [7]:
# Preview the first rows of the dataset.
# NOTE(review): the rendered output shows user names and raw comment text —
# consider clearing or redacting it before sharing the notebook.
df.head()

Unnamed: 0,comment_id,app_id,user_name,comment_text,Annotator1,Annotator2,Annotator3,comment_rating,comment_date,sentiment_result,second_model_processed,comment_idd,sentiment_score,comment_date_jalali
0,111177,9,هادی,سلام،خوبه ولی برنامه همراه شهر باز نمیشه.,3.0,3.0,3,5,45650,mixed,False,175629681,0,14031004
1,111172,9,Unnamed User,سلام خسته نباشید از این برنامه میشه غیر حضوری ...,0.0,0.0,0,5,45649,no sentiment expressed,False,175659112,0,14031003
2,110746,8,Unnamed User,لطفا تعداد اقساط رو از ۱۲ ماه شروع کنید نه ۲۴ ماه,0.0,0.0,0,5,45651,no sentiment expressed,False,175712781,0,14031005
3,110706,8,shayan,خود بانک و وام هاش خوبه ولی نام افزارش خیلی هن...,3.0,3.0,3,2,45651,mixed,False,175740844,0,14031005
4,110702,8,hamed,سلام چرا وارد همراه بانک نمیتونم بشم,2.0,2.0,2,3,45651,no sentiment expressed,False,175741764,0,14031005


## Preprocessing

In [240]:
def _multiple_replace(mapping, text):
    pattern = "|".join(map(re.escape, mapping.keys()))
    return re.sub(pattern, lambda m: mapping[m.group()], str(text))

def convert_fa_numbers(input_str):
    """
    Converts Persian digits to their ASCII counterparts
    :param input_str: Value that may contain Persian digits
    :return: New str with Persian digits replaced by 0-9
    """
    persian_chars = '۰۱۲۳۴۵۶۷۸۹.'
    latin_chars = '0123456789.'
    # '.' is paired with itself, so dots pass through unchanged.
    digit_map = dict(zip(persian_chars, latin_chars))
    return _multiple_replace(digit_map, input_str)

def convert_en_numbers(input_str):
    """
    Converts ASCII digits to their Persian counterparts
    :param input_str: Value that may contain the digits 0-9
    :return: New str with 0-9 replaced by Persian digits
    """
    latin_chars = '0123456789.'
    persian_chars = '۰۱۲۳۴۵۶۷۸۹.'
    # '.' is paired with itself, so dots pass through unchanged.
    digit_map = dict(zip(latin_chars, persian_chars))
    return _multiple_replace(digit_map, input_str)

def convert_ar_characters(input_str):
    """
    Replaces Arabic letter variants with the corresponding Persian letters
    :param input_str: Value that may contain Arabic chars
    :return: New str in which the listed Arabic chars are replaced
    """
    # (Arabic char, Persian replacement) pairs
    arabic_to_persian = dict([
        ('ك', 'ک'),
        ('ى', 'ی'),
        ('ي', 'ی'),
        ('ئ', 'ی'),
        ('إ', 'ا'),
        ('أ', 'ا'),
        ('ة', 'ه'),
        ('ؤ', 'و'),
    ])
    return _multiple_replace(arabic_to_persian, input_str)



In [None]:

def merge_mi_prefix(text):
    """
    Joins a standalone "می" / "نمی" prefix to the word that follows it
    :param text: String to process
    :return: New str with the whitespace after each such prefix removed
    """
    # The prefix must start at a word boundary and be followed by whitespace,
    # so words that merely end in the same letters are left alone.
    prefix_then_word = re.compile(r'\b(ن?می)\s+(\S+)')

    def _glue(match):
        return match.group(1) + match.group(2)

    return prefix_then_word.sub(_glue, text)





def preprocess(text):
    """
    Normalizes a raw comment into word tokens separated by single spaces
    :param text: Raw comment; non-str values are converted to str
    :return: New str containing only word characters (no digits), with
             tokens separated by exactly one space and no leading or
             trailing whitespace
    """
    # Unify digit and letter variants first. convert_fa_numbers also turns
    # the input into a str (via _multiple_replace).
    text = convert_fa_numbers(text)
    text = convert_ar_characters(text)
    # Drop zero-width non-joiners so the joined parts become one token.
    text = text.replace('\u200c', '')
    # Squeeze a character repeated three or more times down to two.
    text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
    # Blank out punctuation/symbols and digits of any script (\d is
    # Unicode-aware for str patterns).
    text = re.sub(r'[\W\d]', ' ', text)
    # Normalize whitespace last: the substitution above inserts spaces, so
    # collapsing earlier would leave runs of blanks in the result.
    text = re.sub(r'\s+', ' ', text).strip()
    # Glue a standalone "می" / "نمی" prefix to the following word.
    return merge_mi_prefix(text)