In [None]:
import argparse
import re
import time
import random
import math
import unicodedata
from datetime import datetime

import numpy as np
import pandas as pd
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from cleanlab.filter import find_label_issues
from tqdm import tqdm
import demoji

import torch
from torch import cuda
from torch.utils.data import DataLoader
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers import evaluation
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
import transformers
transformers.logging.set_verbosity_error()

In [None]:
# Report which device type is available: 'cuda' when torch sees a CUDA device, else 'cpu'.
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

def fix_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), then configures cuDNN for repeatable behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    # random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmarking enabled cuDNN may auto-tune to a different algorithm
    # on each run, so pin it off explicitly instead of relying on the default.
    torch.backends.cudnn.benchmark = False
    
# Seed all RNGs once, up front. SEED is also passed as random_state to the
# query-level train/dev split and to the label-0 subsampling further down.
SEED = 2022
fix_seed(SEED)

In [None]:
# Batch size used by every training DataLoader in this notebook.
BATCH_SIZE = 16
# Number of epochs for EACH of the two training phases (also used to size the warm-up).
num_epochs = 1
# Number of output classes of the cross-encoder (label 1 = 'substitute', label 0 = everything else).
num_labels = 2
# Maximum input length (presumably in tokens for the cross-encoder — confirm at the point of use).
max_length = 512

# target language of model training (multi, us, es)
# For JP, please use train_jp.ipynb
target_lang = 'es'

In [None]:
# Fail fast on a typo in target_lang: otherwise the locale filter below is
# silently skipped and the notebook only breaks much later, at model creation.
supported_langs = ('multi', 'us', 'es')
if target_lang not in supported_langs:
    raise ValueError(
        "Unsupported target_lang {!r}; expected one of {}".format(target_lang, supported_langs)
    )

data_path_task1 = './data/task1/'
data_path_task2 = './data/task2/'

train_df_task1 = pd.read_csv(data_path_task1+'train-v0.3.csv.zip')
train_df_task2 = pd.read_csv(data_path_task2+'train-v0.3.csv.zip')
product_df = pd.read_csv(data_path_task2+'product_catalogue-v0.3.csv.zip')

# Union of the (query, product) pairs of both tasks, without exact duplicates.
pair_columns = ['query','query_locale','product_id','esci_label']
train_df = pd.concat([train_df_task1[pair_columns],
                      train_df_task2[pair_columns]]).drop_duplicates()

# One integer id per distinct query string; used later for a query-level split.
train_df['query_id'] = train_df["query"].factorize()[0]

# Attach catalogue fields; products are matched per locale.
# Left join: pairs without a catalogue entry are kept with missing product fields.
train_df = train_df.merge(product_df,left_on = ['product_id','query_locale'],right_on=['product_id','product_locale'], how= 'left')
print(train_df.shape)

# Binary target: 1 for 'substitute', 0 for every other ESCI label.
train_df['label'] = train_df['esci_label'].map({'exact':0, 'substitute':1, 'complement':0, 'irrelevant':0})

# 'multi' keeps every locale; otherwise keep only rows of the requested locale.
if target_lang != 'multi':
    train_df = train_df[train_df.query_locale==target_lang]
train_df = train_df.reset_index(drop=True)

print(train_df.shape)
train_df.head(3)

In [None]:
def str_normalize(s):
    """Clean a raw text field.

    Applied in order: remove http(s) URLs, apply Unicode NFKC normalization,
    then strip emoji through ``demoji.replace`` with an empty replacement.

    Args:
        s: Input string.

    Returns:
        The cleaned string.
    """
    url_pattern = r'(http|https)://([-\w]+\.)+[-\w]+(/[-\w./?%&=]*)?'
    without_urls = re.sub(url_pattern, "", s)
    folded = unicodedata.normalize("NFKC", without_urls)
    return demoji.replace(string=folded, repl="")

In [None]:
# set data type and normalize text
# Missing values are replaced by '' BEFORE the string cast: astype(str) alone
# turns NaN (e.g. a title absent after the left merge with the catalogue) into
# the literal text 'nan', which would then be fed to the model as real content.
# NOTE(review): inference-time preprocessing should treat missing text the same way.
for text_col in ('query', 'product_title'):
    train_df[text_col] = train_df[text_col].fillna('').astype(str).map(str_normalize)

In [None]:
# Split at query level (not row level): every row of a given query_id ends up
# on the same side, 90% of the queries for training and 10% for validation.
# The commented lines show how to carve out an optional local test set as well.
list_query_id = train_df["query_id"].unique()

# list_query_id_train, list_query_id_test = train_test_split(list_query_id, test_size=0.2, random_state=SEED)
list_query_id_train, list_query_id_dev = train_test_split(
    list_query_id,
    test_size=0.1,
    random_state=SEED,
)

in_train = train_df["query_id"].isin(list_query_id_train)
in_dev = train_df["query_id"].isin(list_query_id_dev)
df_train = train_df.loc[in_train]
df_dev = train_df.loc[in_dev]
# df_test = train_df[train_df["query_id"].isin(list_query_id_test)]

for split_name, split_df in (('train size', df_train), ('valid size', df_dev)):
    print(split_name, split_df.shape)
# print('test size',df_test.shape)

In [None]:
# Number of label-0 rows to draw for the balanced first training phase:
# the count of label-1 rows plus 10%.
# NOTE(review): the count is taken from train_df (before the train/dev split),
# not from df_train — confirm this is intended.
label_counts = train_df['label'].value_counts()
BALANCED_SAMPLE_NUM = int(label_counts[1] + label_counts[1]*0.1)
print(BALANCED_SAMPLE_NUM)

In [None]:
# Build the balanced training frame: a seeded random subset of
# BALANCED_SAMPLE_NUM label-0 rows followed by every row whose label is not 0.
# The index is reset first so that index labels equal row positions, which is
# what allows the collected .index values to be fed to .iloc below.
df_train = df_train.reset_index(drop=True)

is_label0 = df_train.label==0
df_train_sampled_exact = df_train[is_label0].sample(n=BALANCED_SAMPLE_NUM, random_state=SEED)
df_train_sampled_no_exact = df_train[~is_label0]

tgt_idx = df_train_sampled_exact.index.tolist() + df_train_sampled_no_exact.index.tolist()
df_train_balanced = df_train.iloc[tgt_idx]

In [None]:
# One InputExample per training row: the (query, product title) text pair and its integer label.
train_samples = [
    InputExample(texts=[query, title], label=int(label))
    for query, title, label in zip(df_train['query'], df_train['product_title'], df_train['label'])
]

# Shuffled batches; the last incomplete batch is dropped.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)

In [None]:
# Same construction as the full training set, but over the balanced frame.
train_samples_balanced = [
    InputExample(texts=[query, title], label=int(label))
    for query, title, label in zip(
        df_train_balanced['query'],
        df_train_balanced['product_title'],
        df_train_balanced['label'],
    )
]

# Shuffled batches; the last incomplete batch is dropped.
train_dataloader_balanced = DataLoader(train_samples_balanced, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)

In [None]:
# Validation examples (text pair + integer label) consumed by the accuracy evaluator.
dev_samples = [
    InputExample(texts=[query, title], label=int(label))
    for query, title, label in zip(df_dev['query'], df_dev['product_title'], df_dev['label'])
]

In [None]:
# test_samples = []
# for (_, row) in df_test.iterrows():
#     test_samples.append(InputExample(texts=[row['query'], row['product_title']], label=int(row['label'])))

In [None]:
# Per-language output directory and pretrained backbone.
model_configs = {
    'multi': ('model/task3_model_multi', 'microsoft/mdeberta-v3-base'),
    'us': ('model/task3_model_us', 'sentence-transformers/all-mpnet-base-v2'),
    'es': ('model/task3_model_es', 'dccuchile/bert-base-spanish-wwm-uncased'),
}
# An unknown language used to leave model_name undefined and surface as a NameError.
if target_lang not in model_configs:
    raise ValueError(
        "Unsupported target_lang {!r}; expected one of {}".format(target_lang, sorted(model_configs))
    )
model_save_path, model_name = model_configs[target_lang]

# max_length is passed explicitly: it was configured at the top of the notebook
# but never handed to the model, leaving truncation to the tokenizer's defaults.
model = CrossEncoder(model_name, num_labels=num_labels, max_length=max_length)
# Accuracy on the validation examples, reported under the name 'train-dev'.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_samples, name='train-dev')

In [None]:
#10% of train data for warm-up
# warmup_steps is sized from the full dataloader; it is kept unchanged because
# the second training phase (original sample ratio) uses it.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
# The first phase iterates over the smaller balanced dataloader, so its warm-up
# is sized from that loader; otherwise it would span far more than 10% of the phase.
warmup_steps_balanced = math.ceil(len(train_dataloader_balanced) * num_epochs * 0.1)

# Train the model with balanced sample
model.fit(train_dataloader=train_dataloader_balanced,
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=500,
          warmup_steps=warmup_steps_balanced,
          output_path=model_save_path)

In [None]:
# Second phase: continue training the same model object on the full training
# set, i.e. with the original (unbalanced) label ratio.
# Uses the same evaluator, epoch count, evaluation interval and output path as
# the first phase; warmup_steps was computed from len(train_dataloader).
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=500,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

In [None]:
# Evaluation for test data (If needed)
# evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(test_samples, name='train-test')
# evaluator(model)