### Loading Train Data

In [1]:
import pandas as pd

In [2]:
train_df = pd.read_parquet('DMC-Train.parquet', engine='fastparquet')

In [3]:
train_df.head(5)

Unnamed: 0,post_id,post_data,review_label,reject_reason_id
0,cb000599-2ee2-42c1-9f0e-32cfeb940398,"{""body_status"": ""witout-color"", ""brand"": ""\u06...",accept,0
1,12063741-6634-444b-befa-0be4c95c2b42,"{""body_status"": ""witout-color"", ""brand"": ""\u06...",reject,13
2,81c93119-5c06-412f-80aa-363ddb0ebc33,"{""body_status"": ""witout-color"", ""brand"": ""\u06...",accept,0
3,b5a5bfa7-03be-408b-b4d9-bca26c0ca59b,"{""brand"": ""\u0633\u0627\u06cc\u0631"", ""brand_m...",accept,0
4,3414e920-dfaf-44a8-9853-0b03d66e9e2a,"{""body_status"": ""intact"", ""brand"": ""\u067e\u06...",reject,12


### Unwrap JSON

In [4]:
import json

In [5]:
def unwrap_json(dataframe):
    post_df = pd.json_normalize(dataframe.post_data.apply(json.loads))
    post_df = post_df.drop(post_df.columns[15:], axis=1) # removing optional columns
    dataframe = pd.concat([post_df, dataframe], axis=1)
    dataframe = dataframe.drop('post_data', axis=1)
    return dataframe

In [6]:
train_df = unwrap_json(train_df)

In [7]:
train_df['review_label'] = train_df['review_label'].map({'accept': 1, 'reject': 0})
train_label = train_df[['review_label']].to_numpy()

### Analyzing The Data

In [None]:
from pandas_profiling import ProfileReport

profile = ProfileReport(df, title="Pandas Profiling Report")
profile.to_file("dataset_profile.html")

#### Fields

- body_status: pre-defined (16 types) - 20% missing
- brand: pre-defined (with other) (30 types) - 0.1% missing
- brand_model: pre-defined (with other) (1135 types) - 0.2% missing
- color: pre-defined (38 types) - 3% missing
- description: free text - 0% missing
- document: pre-defined (3 types) - 21% missing
- gearbox: pre-defined (2 types) - 20% missing
- new_price: 10M - 50B integer - 25% missing
- selling_type: pre-defined (3 types) - 22% missing
- third_party_insurance_deadline: 1-12 integer - 22% missing
- title: free text (with suggestion) - 0% missing
- usage: 0-500,000 km integer - 0.1% missing
- year: pre-defined (36 types) - 0.1% missing
- options: yes or no

#### Reasons

- 0: cool = 77.1% (inbalanced data)
- 12: against the law = 5.0%
- 145: you cannot request to buy, you only sell! = 4.8%
- 163: info doesn't match the title = 4.2%
- 13: probably wrong price = 3.9%
- 139: more than one car = 3.9%
- 146: wrong category = 0.5%
- 5: wrong category, use services = 0.3%
- 29: no document uploaded (actually, not related to document field, nor any other field!) = 0.2%

In [None]:
pd.set_option("display.max_colwidth", 200)
df.loc[df['reject_reason_id'] == 145].head(5)

### Categorical to One-Hot

In [8]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [9]:
def train_onehot(dataframe):
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(dataframe)
    return enc

In [14]:
for clm in opt_columns:
    train_df[clm] = train_df[clm].map({np.nan: 'False'})

In [10]:
cat_columns = ['body_status', 'brand', 'brand_model', 'color', 'document', 'gearbox',
               'selling_type', 'year', 'third_party_insurance_deadline']

In [11]:
onehot_model = train_onehot(train_df[cat_columns])

In [12]:
train_cat = onehot_model.transform(train_df[cat_columns])

### Numerical to Categorical

In [14]:
from scipy.sparse import hstack

In [15]:
def train_num2cat(dataframe, num_bins):
    bins = pd.qcut(dataframe, duplicates='drop', q=num_bins, retbins=True)[1]
    bins = np.concatenate(([-np.inf], bins[1:-1], [np.inf]))
    res_df = pd.cut(dataframe, bins).to_frame()
    oh_model = train_onehot(res_df)
    return bins, oh_model

In [16]:
def transform_num2cat(dataframe, model, bins):
    res_df = pd.cut(dataframe, bins).to_frame()
    return model.transform(res_df)

In [18]:
price_bins, price_model = train_num2cat(train_df['new_price'], 50)
train_price = transform_num2cat(train_df['new_price'], price_model, price_bins)

In [20]:
usage_bins, usage_model = train_num2cat(train_df['usage'], 25)
train_usage = transform_num2cat(train_df['usage'], usage_model, usage_bins)

In [22]:
train_cat = hstack((train_cat, train_price, train_usage))

### Cleaning Text Fields

In [24]:
import hazm
from hazm import Normalizer, WordTokenizer
import re
import pickle
import codecs

In [25]:
with open('my_translation_dict.pickle', 'rb') as handle:
    new_translation = pickle.load(handle)

In [26]:
with open('stopwords.dat') as handle:
    stopwords = handle.readlines()
    stopwords = [word[:-1] for word in stopwords]

In [27]:
class TextHandler:
    def __init__(self, persian_numbers=False,
                 change_lang_spacing=True,
                 remove_non_standard_char=True,
                 remove_repetitive_chars=True,
                 user_translations=None,
                 stopwords=None,
                 lemma=False
                ):
        
        if not persian_numbers:
            number_src = '۰۱۲۳۴۵۶۷۸۹٪'
            number_dest = '0123456789%'
        else:
            number_dest = '۰۱۲۳۴۵۶۷۸۹٪'
            number_src = '0123456789%'
        
        self.number_translations = self.maketrans(number_src, number_dest)
        
        if not user_translations:
            self.user_translations = dict()
        else:
            self.user_translations = user_translations

        self._remove_repetitive_chars = remove_repetitive_chars
        self._change_lang_spacing = change_lang_spacing
        self._remove_non_standard_char = remove_non_standard_char
        self._stopwords = stopwords
        
        self.text_normalizer = hazm.Normalizer(
            remove_extra_spaces=True,
            persian_style=False,
            persian_numbers=False,
            remove_diacritics=True,
            affix_spacing=True,
            token_based=False,
            punctuation_spacing=True)

        self.word_tokenizer = hazm.WordTokenizer(
            join_verb_parts=False,
            separate_emoji=True,
            replace_links=True,
            replace_IDs=False,
            replace_emails=True,
            replace_numbers=False,
            replace_hashtags=False)

    def normalize(self, text: str):
        text = text.translate(self.user_translations)
        text = text.translate(self.number_translations)
        
        text = text.lower()

        normalized_text = self.text_normalizer.normalize(text)

        if self._remove_repetitive_chars:
            text = self.remove_rep_chars(text)

        if self._change_lang_spacing:
            text = self.change_lang_spacing(text)

        if self._remove_non_standard_char:
            text = self.remove_non_standard_char(text)
            
        text = self.remove_stopwords(text)

        text = re.sub(r'[\u200c\s]*\s[\s\u200c]*', ' ', text)
        text = re.sub(r'[\u200c]+', '\u200c', text)

        return text

    
    @staticmethod
    def maketrans(src_chars, dest_chars):
        return dict((ord(a), b) for a, b in zip(src_chars, dest_chars))
    
    @staticmethod
    def change_lang_spacing(text: str) -> str:
        return re.sub('(([a-zA-Z0-9/\-\.]+)|([ء-یژپچگ]+))', r' \1 ', text).strip()

    @staticmethod
    def remove_non_standard_char(text: str) -> str:
        return re.sub(r'[^a-zA-Z0-9\u0621-\u06CC\u0698\u067E\u0686\u06AF]', ' ', text)

    @staticmethod
    def remove_rep_chars(text: str) -> str:
        return re.sub(r'([^0-9])\1\1+', r'\1', text)
    
    def remove_stopwords_and_lemma(self, text: str) -> str:
        if self._stopwords:
            words = self.word_tokenizer.tokenize(text)
            words = [w for w in words if w not in self._stopwords]
            return ' '.join(words)
        return text
    
    def preprocess_text(self, text: str):
        normalized_text = self.normalize(text)
        return normalized_text

In [28]:
text_handler = TextHandler(user_translations=new_translation)

In [29]:
train_df['prep_title'] = train_df['title'].apply(text_handler.preprocess_text)
train_df['prep_description'] = train_df['description'].apply(text_handler.preprocess_text)
train_df['prep_text'] = train_df['prep_title'] + ' ' + train_df['prep_description']

### Text to Vector

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

In [32]:
def train_text2vec(dataframe, min_df=20):
    tokenizer = hazm.WordTokenizer(
        join_verb_parts=False,
        separate_emoji=True,
        replace_links=True,
        replace_IDs=False,
        replace_emails=True,
        replace_numbers=False,
        replace_hashtags=False
    )
    vect_tf_idf = TfidfVectorizer(sublinear_tf=True, tokenizer=tokenizer.tokenize, min_df=min_df, max_df=0.75)
    vect_tf_idf.fit(dataframe.prep_text)
    return vect_tf_idf

In [33]:
t2v_model = train_text2vec(train_df, min_df=20)



In [52]:
train_text = t2v_model.transform(train_df.iloc[:]['prep_text'])

### Creating Training Feature Matrix

In [64]:
from scipy.sparse import csr_matrix

In [65]:
train_cat = csr_matrix(train_cat)
train_mat = hstack((train_cat, train_text))
train_mat = csr_matrix(train_mat)

In [66]:
train_mat.shape

(540362, 13862)

### Train-Validation Split

In [67]:
from sklearn.model_selection import train_test_split

In [68]:
train, val = train_test_split(hstack((train_mat, train_label)), test_size=0.1)
X_train = train[:, :-1]
X_val = val[:, :-1]
y_train = train[:, -1].toarray().squeeze()
y_val = val[:, -1].toarray().squeeze()

### Training & Evaluation

In [69]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from copy import deepcopy

In [70]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

In [71]:
def evaluate(model, X_train, y_train, X_val, y_val):
    return (model.score(X_train, y_train),
            model.score(X_val, y_val),
            roc_auc_score(y_train, model.predict_proba(X_train)[:, 1]),
            roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))

In [74]:
def train_mlp(X_train, y_train, X_val, y_val, max_iter):
    clf = MLPClassifier(random_state=0, hidden_layer_sizes=(400, 50, 10), learning_rate_init=0.001,
                   activation='relu', alpha=0.001, max_iter=1, warm_start=True, verbose=True)
    prev_clf = deepcopy(clf)
    prev_val_acc = 0
    prev_val_auc = 0
    
    for i in range(max_iter):
        clf.fit(X_train, y_train)
        train_acc, val_acc, train_auc, val_auc = evaluate(clf, X_train, y_train, X_val, y_val)
        print(f'train acc= {train_acc:.4f}, val acc= {val_acc:.4f}',
              f'train auc= {train_auc:.4f}, val auc= {val_auc:.4f}')
        print()
        
        if val_acc < prev_val_acc or val_auc < prev_val_auc:
            clf = prev_clf
            print(f'Falling back to Iteration {i}')
            break
        
        prev_val_acc = val_acc
        prev_val_auc = val_auc
        prev_clf = deepcopy(clf)
            
    return clf

In [75]:
clf = train_mlp(X_train, y_train, X_val, y_val, max_iter=3)

Iteration 1, loss = 0.24058205
train acc= 0.9417, val acc= 0.9283 train auc= 0.9686, val auc= 0.9552

Iteration 2, loss = 0.19413514
train acc= 0.9546, val acc= 0.9322 train auc= 0.9795, val auc= 0.9589

Iteration 3, loss = 0.17186610
train acc= 0.9613, val acc= 0.9316 train auc= 0.9853, val auc= 0.9578

Falling back to Iteration 2


### Loading & Preprocessing Test Set

In [323]:
test_df = pd.read_parquet('DMC-phase2-validation.parquet', engine='fastparquet')
test_df = unwrap_json(test_df)

In [324]:
test_cat = onehot_model.transform(test_df[cat_columns])

In [325]:
test_cat.shape

(107241, 1283)

In [326]:
test_price = transform_num2cat(test_df['new_price'], price_model, price_bins)
test_usage = transform_num2cat(test_df['usage'], usage_model, usage_bins)

In [327]:
test_cat = hstack((test_cat, test_price, test_usage))

In [328]:
test_cat.shape

(107241, 1425)

In [330]:
test_df['prep_title'] = test_df['title'].apply(text_handler.preprocess_text)
test_df['prep_description'] = test_df['description'].apply(text_handler.preprocess_text)
test_df['prep_text'] = test_df['prep_title'] + " " + test_df['prep_description']

In [None]:
test_text = t2v_model.transform(test_df.iloc[:]['prep_text'])

In [332]:
test_text.shape

(107241, 21660)

In [333]:
test_cat = csr_matrix(test_cat)
test_mat = hstack((test_cat, test_text))
test_mat = csr_matrix(test_mat)

In [334]:
test_mat.shape

(107241, 23085)

### Creating Test Results

In [335]:
pred = clf.predict_proba(test_mat)
pred = pred[:, 1]

In [337]:
pred_df = pd.DataFrame()
pred_df['post_id'] = test_df['post_id']
pred_df['predictions'] = pred

In [338]:
pred_df.head(5)

Unnamed: 0,post_id,predictions
0,c16685db-c7b2-403e-b56d-4a745d7e4686,0.983742
1,e65f2de9-acd2-4f03-9395-24f89e1fed32,0.734644
2,cdf973fe-0b45-49d5-b5d6-bbca65c87adc,0.057156
3,e29d3726-6f7e-42f2-9684-26f1cd3405f8,0.992446
4,37fb59d9-be82-4985-84ed-9132732b2144,0.068135


In [339]:
pred_df.to_csv('pred.csv', index=False)