### Import Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import seaborn as sns

In [3]:
# Remove pandas' display limits: setting each option to None disables the
# corresponding cap on shown columns, cell text width and shown rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)

## Preprocessing

In [240]:
def _multiple_replace(mapping, text):
    pattern = "|".join(map(re.escape, mapping.keys()))
    return re.sub(pattern, lambda m: mapping[m.group()], str(text))

def convert_fa_numbers(input_str):
    """
    Converts Persian digits to their ASCII (English) counterparts
    :param input_str: Value that may contain Persian digits
    :return: Result of _multiple_replace with the digit table below
    """
    digit_pairs = (
        ('۰', '0'),
        ('۱', '1'),
        ('۲', '2'),
        ('۳', '3'),
        ('۴', '4'),
        ('۵', '5'),
        ('۶', '6'),
        ('۷', '7'),
        ('۸', '8'),
        ('۹', '9'),
        # The dot is mapped onto itself, i.e. it is left as it is.
        ('.', '.'),
    )
    return _multiple_replace(dict(digit_pairs), input_str)

def convert_en_numbers(input_str):
    """
    Converts ASCII (English) digits to their Persian counterparts
    :param input_str: Value that may contain ASCII digits
    :return: Result of _multiple_replace with the digit table below
    """
    digit_pairs = (
        ('0', '۰'),
        ('1', '۱'),
        ('2', '۲'),
        ('3', '۳'),
        ('4', '۴'),
        ('5', '۵'),
        ('6', '۶'),
        ('7', '۷'),
        ('8', '۸'),
        ('9', '۹'),
        # The dot is mapped onto itself, i.e. it is left as it is.
        ('.', '.'),
    )
    return _multiple_replace(dict(digit_pairs), input_str)

def convert_ar_characters(input_str):
    """
    Replaces Arabic letter variants with the Persian letters listed below
    :param input_str: Value that may contain Arabic letter variants
    :return: Result of _multiple_replace with the letter table below
    """
    letter_pairs = (
        ('ك', 'ک'),
        ('ى', 'ی'),
        ('ي', 'ی'),
        ('ئ', 'ی'),
        ('إ', 'ا'),
        ('أ', 'ا'),
        ('ة', 'ه'),
        ('ؤ', 'و'),
    )
    return _multiple_replace(dict(letter_pairs), input_str)



In [None]:

def merge_mi_prefix(text):
    """
    Joins a free-standing "می" / "نمی" prefix to the word that follows it
    :param text: String to scan
    :return: New str where the whitespace between the prefix and the next
             run of non-space characters has been removed
    """
    # The prefix must start at a word boundary and be followed by whitespace,
    # so a word that merely ends in these letters is not treated as a prefix.
    prefix_then_word = re.compile(r'\b(ن?می)\s+(\S+)')
    return prefix_then_word.sub(lambda hit: hit.group(1) + hit.group(2), text)





def preprocess(text):
    """
    Normalises a piece of Persian text into space-separated word tokens

    Steps, in order:
      1. convert the value to str and map Arabic letter variants to Persian;
      2. delete zero-width non-joiners (U+200C);
      3. squeeze any word character repeated 3+ times down to two;
      4. replace digits and every non-word character by a space;
      5. collapse whitespace runs and trim both ends;
      6. join a free-standing "می" / "نمی" prefix to the following word.

    :param text: Raw text; non-string values are converted with str()
    :return: New cleaned str
    """
    text = convert_ar_characters(str(text))
    text = text.replace('\u200c', '')
    text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
    # In a str pattern \d matches decimal digits of every script (ASCII and
    # Persian alike), so no digit transliteration is needed before removal.
    text = re.sub(r'\d+', ' ', text)
    # Punctuation and symbols become separators; "_" counts as a word
    # character and is kept.
    text = re.sub(r'\W', ' ', text)
    # Whitespace is normalised only now, after the substitutions above have
    # inserted their spaces, so the result has single spaces and no padding.
    text = re.sub(r'\s+', ' ', text).strip()
    return merge_mi_prefix(text)