### Loading The Data

In [64]:
import pandas as pd

In [65]:
# Raw training split, read relative to the notebook's working directory.
train_df = pd.read_csv('./data/train.csv')

In [66]:
# Raw test split, read from the same data directory.
test_df = pd.read_csv('./data/test.csv')

### Unwrap Dictionary

In [67]:
import ast

In [68]:
def unwrap_dict(dataframe):
    """Expand the stringified ``product_description`` dicts into columns.

    Each cell of ``product_description`` is parsed with ``ast.literal_eval``
    and flattened with ``pd.json_normalize``; the resulting columns are placed
    in front of the remaining columns of ``dataframe``.

    Args:
        dataframe: Frame with a ``product_description`` column holding Python
            dict literals as strings.

    Returns:
        A new frame with the expanded columns first and
        ``product_description`` removed. The input frame is not modified.
    """
    parsed = dataframe['product_description'].apply(ast.literal_eval)

    # Pass a plain list so the result is the same on every pandas version,
    # then restore the caller's index: concat(axis=1) aligns on index labels,
    # and a fresh 0..n-1 index would misalign rows (and add NaN rows) for any
    # frame that does not use the default RangeIndex.
    product_df = pd.json_normalize(parsed.tolist())
    product_df.index = dataframe.index

    dataframe = pd.concat([product_df, dataframe], axis=1)
    dataframe = dataframe.drop('product_description', axis=1)
    return dataframe

In [69]:
# Rebinds `train_df` to the expanded frame. Re-running this cell fails because
# `product_description` has already been dropped by the first run.
train_df = unwrap_dict(train_df)

In [70]:
# Same expansion for the test split.
test_df = unwrap_dict(test_df)

In [71]:
# Cells that hold a list are reduced to their first element; everything else
# is passed through. An empty list becomes None (treated as missing by pandas)
# instead of raising IndexError on `x[0]`.
for col in list(train_df.columns):
    train_df[col] = train_df[col].apply(
        lambda x: (x[0] if x else None) if isinstance(x, list) else x
    )

In [72]:
# Cells that hold a list are reduced to their first element; everything else
# is passed through. An empty list becomes None (treated as missing by pandas)
# instead of raising IndexError on `x[0]`.
for col in list(test_df.columns):
    test_df[col] = test_df[col].apply(
        lambda x: (x[0] if x else None) if isinstance(x, list) else x
    )

### Removing Columns with >99% Missing Values

In [73]:
limit = len(train_df) * 0.01

In [74]:
train_df = train_df.dropna(thresh=limit, axis=1)

In [75]:
columns = train_df.columns.values.tolist()
columns.remove('price')
columns.remove('id')

In [76]:
test_df = test_df[columns]

### Analysing The Data

In [235]:
from pandas_profiling import ProfileReport

In [236]:
# Build a minimal profiling report of the training frame and write it as HTML
# into the working directory.
# NOTE(review): the `pandas_profiling` package has been renamed
# `ydata-profiling` upstream — confirm which one the environment provides.
# NOTE(review): this cell's execution count (236) is out of sequence with the
# surrounding cells, so it was last run in a different order or session.
profile = ProfileReport(train_df, title="Pandas Profiling Report", minimal=True)
profile.to_file("dataset_profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Creating Labels

In [77]:
# 1-D target vector. Selecting the Series directly always yields shape
# (n_rows,); the previous `[['price']].to_numpy().squeeze()` collapsed a
# single-row frame to a 0-d array.
train_label = train_df['price'].to_numpy()

### Categorical to One-Hot

In [78]:
from sklearn.preprocessing import OneHotEncoder

In [79]:
def train_onehot(dataframe):
    """Fit a one-hot encoder on ``dataframe`` and return the fitted encoder.

    The encoder is created with ``handle_unknown='ignore'``, so categories
    that were not present during fitting do not raise at transform time.
    """
    return OneHotEncoder(handle_unknown='ignore').fit(dataframe)

In [80]:
# Categorical columns that are one-hot encoded in the cells below.
cat_columns = ['درگاه‌های ارتباطی', 'جنس بدنه' , 'جنس', 'اتصالات', 'سیستم عامل',
               'نوع اتصال', 'رابط‌ها', 'اندازه', 'فناوری‌های ارتباطی',
               'جنس کالا', 'دسته بندی', 'برند', 'نوع حافظه', 'سری پردازنده']

In [81]:
# Fit the encoder on the training split only.
onehot_model = train_onehot(train_df[cat_columns])

In [82]:
train_cat = onehot_model.transform(train_df[cat_columns])

In [83]:
# The test split reuses the encoder fitted on train; the encoder was created
# with handle_unknown='ignore', so unseen test categories do not raise.
test_cat = onehot_model.transform(test_df[cat_columns])

In [84]:
train_cat.shape

(68840, 3238)

### Categorical to Numeral

In [22]:
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

In [23]:
def train_numeral(dataframe, columns):
    """Fit one ordinal encoder per requested column.

    Args:
        dataframe: Frame containing every column named in ``columns``.
        columns: Names of the categorical columns to encode. The function
            iterates this argument; it previously ignored it and read the
            notebook-level ``cat_columns`` instead.

    Returns:
        A list of fitted ``OrdinalEncoder`` objects, in the same order as
        ``columns``. Categories unseen at fit time are encoded as NaN.
    """
    models = []

    for col in columns:
        enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
        # Double brackets keep the input 2-D, as the encoder expects.
        enc.fit(dataframe[[col]])
        models.append(enc)

    return models

In [24]:
def apply_numeral(dataframe, columns, models):
    """Ordinal-encode ``columns`` of ``dataframe`` with already-fitted encoders.

    Args:
        dataframe: Frame to encode. The function encodes this argument; it
            previously ignored it and always encoded the notebook-level
            ``train_df``.
        columns: Column names, in the same order as ``models``.
        models: Fitted encoders, one per entry of ``columns``.

    Returns:
        A 2-D array with one row per column (shape ``(len(columns), n_rows)``);
        transpose it to get one row per sample. Values the encoder reports as
        NaN are replaced by ``len(categories)``, i.e. one extra class index.

    Raises:
        ValueError: If ``columns`` and ``models`` differ in length.
    """
    if len(columns) != len(models):
        raise ValueError('columns and models must have the same length')

    classes = []
    for col, model in zip(columns, models):
        cls = model.transform(dataframe[[col]]).squeeze()
        # NaN marks an unknown value; map it to the index just past the
        # last known category.
        cls = np.nan_to_num(cls, nan=len(model.categories_[0]))
        classes.append(cls)

    return np.vstack(classes)

In [137]:
# Columns for the ordinal-encoding variant.
# NOTE(review): this rebinds `cat_columns`, replacing the list defined in the
# one-hot section; the two sections use different column sets.
cat_columns = ['درگاه‌های ارتباطی', 'جنس بدنه', 'نوع رابط', 'قابلیت‌های دستگاه', 'جنس', 'اتصالات', 'سیستم عامل',
               'نوع اتصال', 'رابط‌ها', 'ظرفیت', 'اندازه', 'فناوری‌های ارتباطی', 'جنس بند',
               'جنس کالا', 'تعداد باتری‌های موجود در پک', 'قابلیت‌های باتری']

In [138]:
numeral_models = train_numeral(train_df, cat_columns)

In [139]:
train_cat = apply_numeral(train_df, cat_columns, numeral_models)

In [140]:
train_cat = train_cat.T

In [141]:
test_cat = apply_numeral(test_df, cat_columns, numeral_models)

In [142]:
test_cat = test_cat.T

### Cleaning Text Fields

In [85]:
import hazm
from hazm import Normalizer, WordTokenizer
import re
import pickle
import codecs

In [86]:
# Character translation table that is later handed to TextHandler.
# NOTE(review): pickle.load can execute arbitrary code — only load a trusted file.
with open('translation_dict.pickle', 'rb') as translation_file:
    new_translation = pickle.load(translation_file)

In [87]:
class TextHandler:
    """Normalizes mixed Persian/Latin free text into a cleaned, space-separated string.

    The pipeline applied by :meth:`normalize` is: user character translation,
    digit translation, lower-casing, hazm normalization, and then the optional
    steps selected in the constructor (collapse repeated characters, separate
    Latin/digit runs from Persian runs, strip non-standard characters),
    followed by whitespace and zero-width-non-joiner collapsing.
    """

    def __init__(self, persian_numbers=False,
                 change_lang_spacing=True,
                 remove_non_standard_char=True,
                 remove_repetitive_chars=True,
                 user_translations=None,
                 stopwords=None,
                 lemma=False
                ):
        """Configure the pipeline.

        Args:
            persian_numbers: If False (default) Persian digits and percent sign
                are translated to ASCII; if True the translation is reversed.
            change_lang_spacing: Put spaces around Latin/digit and Persian runs.
            remove_non_standard_char: Replace every character outside the
                allowed Latin/digit/Persian set with a space.
            remove_repetitive_chars: Collapse a non-digit character repeated
                three or more times into a single occurrence.
            user_translations: Optional ``str.translate`` table applied first.
            stopwords: Stored on the instance; not used by the methods of
                this class.
            lemma: Accepted for interface compatibility; currently unused.
        """
        if not persian_numbers:
            number_src = '۰۱۲۳۴۵۶۷۸۹٪'
            number_dest = '0123456789%'
        else:
            number_dest = '۰۱۲۳۴۵۶۷۸۹٪'
            number_src = '0123456789%'

        self.number_translations = self.maketrans(number_src, number_dest)

        if not user_translations:
            self.user_translations = dict()
        else:
            self.user_translations = user_translations

        self._remove_repetitive_chars = remove_repetitive_chars
        self._change_lang_spacing = change_lang_spacing
        self._remove_non_standard_char = remove_non_standard_char
        self._stopwords = stopwords

        self.text_normalizer = hazm.Normalizer(
            remove_extra_spaces=True,
            persian_style=False,
            persian_numbers=False,
            remove_diacritics=True,
            affix_spacing=True,
            token_based=False,
            punctuation_spacing=True)

        # Constructed for callers that want tokenization; not used by normalize().
        self.word_tokenizer = hazm.WordTokenizer(
            join_verb_parts=False,
            separate_emoji=True,
            replace_links=True,
            replace_IDs=False,
            replace_emails=True,
            replace_numbers=False,
            replace_hashtags=False)

    def normalize(self, text: str):
        """Run the full cleaning pipeline on ``text`` and return the result."""
        text = text.translate(self.user_translations)
        text = text.translate(self.number_translations)

        text = text.lower()

        # Keep the normalizer's output: it was previously bound to an unused
        # local, so the configured hazm normalization never took effect.
        text = self.text_normalizer.normalize(text)

        if self._remove_repetitive_chars:
            text = self.remove_rep_chars(text)

        if self._change_lang_spacing:
            text = self.change_lang_spacing(text)

        if self._remove_non_standard_char:
            text = self.remove_non_standard_char(text)

        # Collapse any run of whitespace (with adjacent ZWNJs) to one space,
        # then collapse repeated ZWNJs to a single one.
        text = re.sub(r'[\u200c\s]*\s[\s\u200c]*', ' ', text)
        text = re.sub(r'[\u200c]+', '\u200c', text)

        return text

    @staticmethod
    def maketrans(src_chars, dest_chars):
        """Build a ``str.translate`` table mapping each source char to its counterpart."""
        return dict((ord(a), b) for a, b in zip(src_chars, dest_chars))

    @staticmethod
    def change_lang_spacing(text: str) -> str:
        """Surround each Latin/digit run and each Persian run with spaces, then strip."""
        # Raw string: `\-` and `\.` are regex escapes, not valid Python string escapes.
        return re.sub(r'(([a-zA-Z0-9/\-\.]+)|([ء-یژپچگ]+))', r' \1 ', text).strip()

    @staticmethod
    def remove_non_standard_char(text: str) -> str:
        """Replace every character outside the Latin/digit/Persian set with a space."""
        return re.sub(r'[^a-zA-Z0-9\u0621-\u06CC\u0698\u067E\u0686\u06AF]', ' ', text)

    @staticmethod
    def remove_rep_chars(text: str) -> str:
        """Collapse a non-digit character repeated 3+ times into one occurrence."""
        return re.sub(r'([^0-9])\1\1+', r'\1', text)

    def preprocess_text(self, text: str):
        """Entry point used by the notebook; currently equivalent to :meth:`normalize`."""
        normalized_text = self.normalize(text)
        return normalized_text

In [88]:
# Shared handler instance; `new_translation` is passed as the user translation table.
text_handler = TextHandler(user_translations=new_translation)

In [89]:
train_df['توضیحات'] = train_df['توضیحات'].fillna('')
train_df['سایر توضیحات'] = train_df['سایر توضیحات'].fillna('')
train_df['سایر مشخصات'] = train_df['سایر مشخصات'].fillna('')
train_df['مشخصات فنی'] = train_df['مشخصات فنی'].fillna('')
train_df['وزن'] = train_df['وزن'].fillna('')
train_df['ویژگی‌ها'] = train_df['ویژگی‌ها'].fillna('')


train_df['text'] = ('<start> ' + train_df['توضیحات'] + ' <and> ' + train_df['سایر توضیحات'] +  ' <and> ' +
               train_df['وزن'] + ' <and> ' + train_df['سایر مشخصات'] + ' <and> ' + train_df['مشخصات فنی'] +
               ' <and> ' + train_df['ویژگی‌ها'] + ' <end>')


train_df['text'] = train_df['text'].apply(text_handler.preprocess_text)
train_texts = train_df['text'].tolist()

In [90]:
test_df['توضیحات'] = test_df['توضیحات'].fillna('')
test_df['سایر توضیحات'] = test_df['سایر توضیحات'].fillna('')
test_df['سایر مشخصات'] = test_df['سایر مشخصات'].fillna('')
test_df['مشخصات فنی'] = test_df['مشخصات فنی'].fillna('')
test_df['وزن'] = test_df['وزن'].fillna('')
test_df['ویژگی‌ها'] = test_df['ویژگی‌ها'].fillna('')

test_df['text'] = ('<start> ' + test_df['توضیحات'] + ' <and> ' + test_df['سایر توضیحات'] +  ' <and> ' +
               test_df['وزن'] + ' <and> ' + test_df['سایر مشخصات'] + ' <and> ' + test_df['مشخصات فنی'] +
               ' <and> ' + test_df['ویژگی‌ها'] + ' <end>')

test_df['text'] = test_df['text'].apply(text_handler.preprocess_text)
test_texts = test_df['text'].tolist()

### Saving to File

In [91]:
import numpy as np
import pickle
from scipy.sparse import save_npz

In [92]:
# Cleaned training texts (a Python list) as a pickle.
with open('train_texts.pkl', 'wb') as texts_file:
    pickle.dump(train_texts, texts_file)

# Categorical training features in SciPy's .npz sparse format.
save_npz("train_cat.npz", train_cat)

# Training targets as a NumPy .npy file.
with open('train_label.npy', 'wb') as label_file:
    np.save(label_file, train_label)

In [93]:
# Cleaned test texts (a Python list) as a pickle.
with open('test_texts.pkl', 'wb') as texts_file:
    pickle.dump(test_texts, texts_file)

# Categorical test features in SciPy's .npz sparse format.
save_npz("test_cat.npz", test_cat)