### Loading The Data

In [1]:
import pandas as pd

In [2]:
# Load the training parquet file into a DataFrame using the fastparquet engine.
# Later cells read a JSON `post_data` column plus label columns from this frame.
train_df = pd.read_parquet('DMC-Train.parquet', engine='fastparquet')

In [3]:
# Load the phase-2 validation parquet file; this notebook treats it as the
# test set and pushes it through the same transformations as train_df.
test_df = pd.read_parquet('DMC-phase2-validation.parquet', engine='fastparquet')

### Unwrap JSON

In [4]:
import json

In [5]:
def unwrap_json(dataframe, num_post_columns=15):
    """Expand the JSON strings in ``post_data`` into regular columns.

    Every ``post_data`` value is parsed with ``json.loads`` and flattened with
    ``pd.json_normalize``. Only the first ``num_post_columns`` flattened
    columns are kept; they are placed in front of the remaining original
    columns and the raw ``post_data`` column is dropped.

    Args:
        dataframe: DataFrame with a ``post_data`` column of JSON strings.
        num_post_columns: How many leading flattened columns to keep. The
            selection is positional, so it depends on the order in which keys
            first appear in the JSON payloads.

    Returns:
        A new DataFrame with the same row index as ``dataframe``; the input
        frame itself is not modified.
    """
    parsed_posts = dataframe['post_data'].apply(json.loads).tolist()
    post_df = pd.json_normalize(parsed_posts).iloc[:, :num_post_columns]

    # The flattened frame is numbered 0..n-1. Re-attach the caller's index so
    # the column-wise concat pairs row i with row i even when `dataframe` was
    # filtered, shuffled or otherwise carries a non-default index; without
    # this, concat aligns on labels and yields NaN-padded, misaligned rows.
    post_df = post_df.set_axis(dataframe.index, axis=0)

    remaining = dataframe.drop(columns='post_data')
    return pd.concat([post_df, remaining], axis=1)

In [6]:
# Replace train_df with its unwrapped version. unwrap_json drops `post_data`,
# so this cell cannot be re-run without reloading train_df first.
train_df = unwrap_json(train_df)

In [7]:
# Replace test_df with its unwrapped version. unwrap_json drops `post_data`,
# so this cell cannot be re-run without reloading test_df first.
test_df = unwrap_json(test_df)

### Creating Labels

In [None]:
# option 1: binary (accept = 1, reject = 0)
#
# NOTE: the multi-class cell below also assigns `train_label`; run only the
# option you want (on "Run All" the later cell wins).

REVIEW_LABEL_TO_CLASS = {'accept': 1, 'reject': 0}

# Map into a new Series instead of overwriting train_df['review_label']:
# overwriting made the cell non-idempotent (a second run mapped 1/0 to NaN).
review_class = train_df['review_label'].map(REVIEW_LABEL_TO_CLASS)

# Values missing from the mapping come back as NaN; fail loudly rather than
# saving NaN labels.
if review_class.isna().any():
    unmapped_review_labels = sorted(
        train_df.loc[review_class.isna(), 'review_label'].astype(str).unique())
    raise ValueError(f"Unmapped review_label values: {unmapped_review_labels}")

# Always a 1-D integer array, including for a single-row frame.
train_label = review_class.astype('int64').to_numpy()

In [8]:
# option 2: multi-class (one contiguous class index per reject reason id)

REJECT_REASON_TO_CLASS = {
    0: 0, 5: 1, 12: 2, 13: 3, 29: 4, 139: 5, 145: 6, 146: 7, 163: 8}

# Map into a new Series instead of overwriting train_df['reject_reason_id']:
# overwriting made the cell non-idempotent (a second run silently remapped
# class 5 to 1 and turned most other classes into NaN).
reject_class = train_df['reject_reason_id'].map(REJECT_REASON_TO_CLASS)

# Ids missing from the mapping come back as NaN; fail loudly rather than
# saving NaN labels.
if reject_class.isna().any():
    unmapped_reason_ids = sorted(
        train_df.loc[reject_class.isna(), 'reject_reason_id'].astype(str).unique())
    raise ValueError(f"Unmapped reject_reason_id values: {unmapped_reason_ids}")

# Always a 1-D integer array, including for a single-row frame.
train_label = reject_class.astype('int64').to_numpy()

### Categorical to One-Hot

In [9]:
from sklearn.preprocessing import OneHotEncoder

In [10]:
def train_onehot(dataframe):
    """Fit a one-hot encoder on ``dataframe`` and return the fitted encoder.

    The encoder is created with ``handle_unknown='ignore'`` so that transforming
    data containing categories absent from ``dataframe`` does not raise.

    Args:
        dataframe: Frame whose columns are all treated as categorical.

    Returns:
        The fitted ``OneHotEncoder`` instance.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return OneHotEncoder(handle_unknown='ignore').fit(dataframe)

In [11]:
# Columns that are one-hot encoded as categorical features. `year` and
# `third_party_insurance_deadline` are deliberately treated as categories here.
cat_columns = [
    'body_status',
    'brand_model',
    'color',
    'document',
    'gearbox',
    'selling_type',
    'year',
    'third_party_insurance_deadline',
]

In [12]:
# Fit the encoder on the training rows only; the same fitted encoder is then
# used to transform both train_df and test_df.
onehot_model = train_onehot(train_df[cat_columns])

In [13]:
# One-hot encode the training categorical columns. The result is later stacked
# with scipy.sparse.hstack and written with save_npz.
train_cat = onehot_model.transform(train_df[cat_columns])

In [14]:
# Encode the test categorical columns with the encoder fitted on the training
# data, so both matrices share the same column layout.
test_cat = onehot_model.transform(test_df[cat_columns])

### Numerical to Categorical

In [15]:
from scipy.sparse import hstack
import numpy as np

In [16]:
def train_num2cat(dataframe, num_bins):
    """Learn quantile bins for a numeric column and a one-hot encoder over them.

    Args:
        dataframe: Numeric pandas Series to discretise (despite the name).
        num_bins: Number of quantiles requested from ``pd.qcut``; duplicate
            edges are dropped, so fewer bins may result.

    Returns:
        ``(bins, oh_model)``: the bin edges, with the outermost two replaced by
        -inf / +inf, and the encoder fitted on the binned training values.
    """
    _, quantile_edges = pd.qcut(
        dataframe, q=num_bins, duplicates='drop', retbins=True)

    # Keep only the interior edges and open both ends, so values outside the
    # training range still land in the first or last bin.
    interior_edges = quantile_edges[1:-1]
    bins = np.concatenate(([-np.inf], interior_edges, [np.inf]))

    binned = pd.cut(dataframe, bins).to_frame()
    return bins, train_onehot(binned)

In [17]:
def transform_num2cat(dataframe, model, bins):
    """Bin a numeric Series with fixed edges and encode it with ``model``.

    Args:
        dataframe: Numeric pandas Series to discretise (despite the name).
        model: Fitted encoder exposing ``transform``; it receives a one-column
            DataFrame of interval categories.
        bins: Bin edges passed to ``pd.cut``.

    Returns:
        Whatever ``model.transform`` returns for the binned column.
    """
    binned = pd.cut(dataframe, bins)
    return model.transform(binned.to_frame())

In [18]:
# Fit the bin edges and one-hot encoders on the training columns only.
# 60 and 30 are the requested quantile counts; train_num2cat drops duplicate
# edges, so the number of resulting bins can be smaller.
price_bins, price_model = train_num2cat(train_df['new_price'], 60)
usage_bins, usage_model = train_num2cat(train_df['usage'], 30)

In [19]:
# Encode the training price and usage columns with the bins/encoders fitted above.
train_price = transform_num2cat(train_df['new_price'], price_model, price_bins)
train_usage = transform_num2cat(train_df['usage'], usage_model, usage_bins)

In [20]:
# Assemble the training feature matrix: one-hot categoricals, then binned
# price, then binned usage. The categorical part is rebuilt from the fitted
# encoder instead of reusing the current `train_cat`, because stacking onto
# `train_cat` itself appended the price/usage columns again on every re-run
# of this cell.
train_cat = hstack((
    onehot_model.transform(train_df[cat_columns]),
    train_price,
    train_usage,
))

In [21]:
# Encode the test price and usage columns with the bins/encoders fitted on the
# training data (nothing is re-fitted on the test set).
test_price = transform_num2cat(test_df['new_price'], price_model, price_bins)
test_usage = transform_num2cat(test_df['usage'], usage_model, usage_bins)

In [22]:
# Assemble the test feature matrix in the same column order as the training
# one. The categorical part is rebuilt from the fitted encoder instead of
# reusing the current `test_cat`, because stacking onto `test_cat` itself
# appended the price/usage columns again on every re-run of this cell.
test_cat = hstack((
    onehot_model.transform(test_df[cat_columns]),
    test_price,
    test_usage,
))

### Cleaning Text Fields

In [23]:
import hazm
from hazm import Normalizer, WordTokenizer
import re
import pickle
import codecs

In [24]:
# Load the user translation table that is handed to TextHandler, where it is
# applied with str.translate.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load a translation_dict.pickle that comes from a trusted source.
with open('translation_dict.pickle', 'rb') as handle:
    new_translation = pickle.load(handle)

In [25]:
class TextHandler:
    """Clean mixed Persian/Latin text for downstream tokenisation.

    :meth:`normalize` applies, in order:

    1. the user translation table and the digit translation table
       (``str.translate``),
    2. lower-casing,
    3. the hazm ``Normalizer`` configured in ``__init__``,
    4. optionally, collapsing runs of three or more identical non-digit
       characters into one,
    5. optionally, padding Latin/digit runs and Persian-letter runs with spaces,
    6. optionally, replacing every character outside the whitelist
       (ASCII letters/digits and U+0621-U+06CC) with a space,
    7. whitespace / zero-width-non-joiner clean-up.

    NOTE(review): with ``persian_numbers=True`` digits are translated to
    U+06F0-U+06F9, which lie outside the whitelist of step 6, so they are
    replaced by spaces unless ``remove_non_standard_char`` is disabled.
    """

    def __init__(self, persian_numbers=False,
                 change_lang_spacing=True,
                 remove_non_standard_char=True,
                 remove_repetitive_chars=True,
                 user_translations=None,
                 stopwords=None,
                 lemma=False):
        """Configure the cleaning pipeline.

        Args:
            persian_numbers: If False (default) Persian digits and '٪' are
                translated to ASCII digits and '%'; if True the translation
                goes the other way.
            change_lang_spacing: Enable step 5 of the pipeline.
            remove_non_standard_char: Enable step 6 of the pipeline.
            remove_repetitive_chars: Enable step 4 of the pipeline.
            user_translations: Optional table for ``str.translate`` applied
                before anything else; an empty table is used when falsy.
            stopwords: Stored on the instance; not used by any method of this
                class.
            lemma: Accepted for interface compatibility; currently unused.
        """
        if not persian_numbers:
            number_src = '۰۱۲۳۴۵۶۷۸۹٪'
            number_dest = '0123456789%'
        else:
            number_dest = '۰۱۲۳۴۵۶۷۸۹٪'
            number_src = '0123456789%'

        self.number_translations = self.maketrans(number_src, number_dest)

        if not user_translations:
            self.user_translations = dict()
        else:
            self.user_translations = user_translations

        self._remove_repetitive_chars = remove_repetitive_chars
        self._change_lang_spacing = change_lang_spacing
        self._remove_non_standard_char = remove_non_standard_char
        self._stopwords = stopwords

        self.text_normalizer = hazm.Normalizer(
            remove_extra_spaces=True,
            persian_style=False,
            persian_numbers=False,
            remove_diacritics=True,
            affix_spacing=True,
            token_based=False,
            punctuation_spacing=True)

        # Constructed for callers that want it; no method of this class uses it.
        self.word_tokenizer = hazm.WordTokenizer(
            join_verb_parts=False,
            separate_emoji=True,
            replace_links=True,
            replace_IDs=False,
            replace_emails=True,
            replace_numbers=False,
            replace_hashtags=False)

    def normalize(self, text: str):
        """Run the full cleaning pipeline on ``text`` and return the result.

        Missing values (``None`` or a float NaN, as pandas uses for absent
        strings) are returned as an empty string instead of raising.
        """
        if text is None or (isinstance(text, float) and text != text):
            return ''

        text = text.translate(self.user_translations)
        text = text.translate(self.number_translations)

        text = text.lower()

        # Feed the hazm-normalised text into the remaining steps. The result
        # used to be stored in a separate variable and never read, so the
        # hazm normalisation had no effect on the returned text.
        text = self.text_normalizer.normalize(text)

        if self._remove_repetitive_chars:
            text = self.remove_rep_chars(text)

        if self._change_lang_spacing:
            text = self.change_lang_spacing(text)

        if self._remove_non_standard_char:
            text = self.remove_non_standard_char(text)

        # Any whitespace run (with adjacent ZWNJs) becomes a single space, then
        # remaining ZWNJ runs are collapsed to one ZWNJ.
        text = re.sub(r'[\u200c\s]*\s[\s\u200c]*', ' ', text)
        text = re.sub(r'[\u200c]+', '\u200c', text)

        return text

    @staticmethod
    def maketrans(src_chars, dest_chars):
        """Build a ``str.translate`` table mapping each source char to its peer."""
        return dict((ord(a), b) for a, b in zip(src_chars, dest_chars))

    @staticmethod
    def change_lang_spacing(text: str) -> str:
        """Surround each Latin/digit run and each Persian-letter run with spaces."""
        # Raw string: same pattern as before, without invalid-escape warnings
        # for `\-` and `\.`.
        return re.sub(r'(([a-zA-Z0-9/\-\.]+)|([ء-یژپچگ]+))', r' \1 ', text).strip()

    @staticmethod
    def remove_non_standard_char(text: str) -> str:
        """Replace every character outside the letter/digit whitelist with a space."""
        return re.sub(r'[^a-zA-Z0-9\u0621-\u06CC\u0698\u067E\u0686\u06AF]', ' ', text)

    @staticmethod
    def remove_rep_chars(text: str) -> str:
        """Collapse runs of three or more identical non-digit characters to one."""
        return re.sub(r'([^0-9])\1\1+', r'\1', text)

    def preprocess_text(self, text: str):
        """Public entry point used with ``Series.apply``; returns ``normalize(text)``."""
        return self.normalize(text)

In [26]:
# One handler shared by the train and test text cells: default cleaning
# options plus the translation table loaded from translation_dict.pickle.
text_handler = TextHandler(user_translations=new_translation)

In [27]:
# Clean the two free-text fields of the training frame (title first, then
# description), then join them into one marked-up string per row.
for text_col in ('title', 'description'):
    train_df[f'prep_{text_col}'] = train_df[text_col].apply(text_handler.preprocess_text)

train_df['prep_text'] = (
    '<start> ' + train_df['prep_description']
    + ' <delim> ' + train_df['prep_title']
    + ' <end>'
)

In [28]:
# Clean the two free-text fields of the test frame (title first, then
# description), then join them into one marked-up string per row.
for text_col in ('title', 'description'):
    test_df[f'prep_{text_col}'] = test_df[text_col].apply(text_handler.preprocess_text)

test_df['prep_text'] = (
    '<start> ' + test_df['prep_description']
    + ' <delim> ' + test_df['prep_title']
    + ' <end>'
)

In [29]:
# Plain Python list of the combined training strings, in row order; this is
# what gets pickled to train_texts.pkl.
train_texts = train_df['prep_text'].tolist()

In [30]:
# Plain Python list of the combined test strings, in row order; this is what
# gets pickled to test_texts.pkl.
test_texts = test_df['prep_text'].tolist()

### Saving to File

In [31]:
from scipy.sparse import save_npz
import pickle

In [32]:
# Persist the three training artefacts: cleaned texts (pickle), feature
# matrix (scipy .npz) and label array (.npy).
with open('train_texts.pkl', 'wb') as texts_file:
    pickle.dump(train_texts, texts_file)

save_npz(file="train_cat.npz", matrix=train_cat)

with open('train_label.npy', 'wb') as labels_file:
    np.save(file=labels_file, arr=train_label)

In [33]:
# Persist the two test artefacts: cleaned texts (pickle) and feature matrix
# (scipy .npz). No label file is written for the test set.
with open('test_texts.pkl', 'wb') as texts_file:
    pickle.dump(test_texts, texts_file)

save_npz(file="test_cat.npz", matrix=test_cat)