In [None]:
import argparse
import re
import time
import random
import math
import unicodedata
from datetime import datetime

import numpy as np
import pandas as pd
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from cleanlab.filter import find_label_issues
from tqdm import tqdm
import demoji

import torch
from torch import cuda
from torch.utils.data import DataLoader
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers import evaluation
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
import transformers
transformers.logging.set_verbosity_error()

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

def fix_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (``torch.manual_seed`` plus ``torch.cuda.manual_seed_all``), then sets the
    cuDNN ``deterministic`` flag.

    Args:
        seed: Seed value handed unchanged to each library.
    """
    seeders = (
        random.seed,                 # Python standard library
        np.random.seed,              # NumPy
        torch.manual_seed,           # PyTorch
        torch.cuda.manual_seed_all,  # PyTorch, CUDA side
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# Single seed for the whole notebook: applied to the RNGs here and reused as
# random_state for the train/dev splits and the balanced sampling further down.
SEED = 2022
fix_seed(SEED)

In [None]:
# Training / inference configuration.
BATCH_SIZE = 16  # mini-batch size for the training DataLoaders and the scoring loops
num_epochs = 1  # epochs passed to every model.fit call
num_labels = 2  # number of output classes of the classifier (binary label)
max_length = 512  # NOTE(review): not referenced by the cells visible here - confirm it is still needed

In [None]:
data_path_task1 = './data/task1/'
data_path_task2 = './data/task2/'

# Load the raw training pairs of both tasks plus the product catalogue.
train_df_task1 = pd.read_csv(data_path_task1+'train-v0.3.csv.zip')
train_df_task2 = pd.read_csv(data_path_task2+'train-v0.3.csv.zip')
product_df = pd.read_csv(data_path_task2+'product_catalogue-v0.3.csv.zip')

# Stack the two training sets on their shared columns and drop repeated rows.
pair_columns = ['query','query_locale','product_id','esci_label']
train_df = pd.concat(
    [train_df_task1[pair_columns], train_df_task2[pair_columns]]
).drop_duplicates()

# One integer id per distinct query string.
train_df['query_id'] = pd.factorize(train_df['query'])[0]

# Attach the catalogue fields to every pair, matching on product id and locale.
train_df = train_df.merge(
    product_df,
    how='left',
    left_on=['product_id','query_locale'],
    right_on=['product_id','product_locale'],
)
print(train_df.shape)

# Binary target: 1 for 'substitute', 0 for every other ESCI label.
esci_to_binary = {'exact':0, 'substitute':1, 'complement':0, 'irrelevant':0}
train_df['label'] = train_df['esci_label'].map(esci_to_binary)

# Keep only the rows whose query locale is 'jp' and renumber the index from zero.
jp_mask = train_df['query_locale'] == 'jp'
train_df = train_df[jp_mask].reset_index(drop=True)

print(train_df.shape)
train_df.head(3)

In [None]:
def str_normalize(s):
    """Clean one text string: strip URLs, apply NFKC normalization, remove emoji.

    Args:
        s: Input string.

    Returns:
        The cleaned string.
    """
    url_pattern = r'(http|https)://([-\w]+\.)+[-\w]+(/[-\w./?%&=]*)?'
    without_urls = re.sub(url_pattern, "", s)
    # NFKC folds compatibility characters into their canonical equivalents.
    nfkc_text = unicodedata.normalize("NFKC", without_urls)
    # demoji substitutes every emoji it recognises with the empty string.
    return demoji.replace(string=nfkc_text, repl="")

In [None]:
# Cast both text columns to str, then clean them with str_normalize.
# NOTE(review): astype(str) appears to turn missing product titles into the
# literal text 'nan' - confirm that is intended.
for text_col in ('query', 'product_title'):
    train_df[text_col] = train_df[text_col].astype(str).map(str_normalize)

In [None]:
# Split train data into train and valid at query level, so that all rows of a
# query land in the same partition (a local test set can be created if needed).
list_query_id = train_df["query_id"].unique()

# Optional local test split (disabled):
# list_query_id_train, list_query_id_test = train_test_split(list_query_id, test_size=0.2, random_state=SEED)
list_query_id_train, list_query_id_dev = train_test_split(
    list_query_id, test_size=0.1, random_state=SEED
)
# 2-fold cross-validation for label cleaning (CleanLab).
# Note: We tried CleanLab for all languages, but it was only effective for Japanese.
list_query_id_train1, list_query_id_train2 = train_test_split(
    list_query_id_train, test_size=0.5, random_state=SEED
)

query_ids = train_df["query_id"]
df_train1 = train_df[query_ids.isin(list_query_id_train1)]
df_train2 = train_df[query_ids.isin(list_query_id_train2)]
df_dev = train_df[query_ids.isin(list_query_id_dev)]
# df_test = train_df[train_df["query_id"].isin(list_query_id_test)]

for split_caption, split_frame in (
    ('train CV1 size', df_train1),
    ('train CV2 size', df_train2),
    ('valid size', df_dev),
):
    print(split_caption, split_frame.shape)
# print('test size',df_test.shape)

In [None]:
# Fold 1: wrap every (query, product title) pair as a labelled InputExample.
train_samples1 = [
    InputExample(texts=[query_text, title_text], label=int(target))
    for query_text, title_text, target in zip(
        df_train1['query'], df_train1['product_title'], df_train1['label']
    )
]
train_dataloader1 = DataLoader(train_samples1, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)

# Fold 2: same construction for the other half of the training queries.
train_samples2 = [
    InputExample(texts=[query_text, title_text], label=int(target))
    for query_text, title_text, target in zip(
        df_train2['query'], df_train2['product_title'], df_train2['label']
    )
]
train_dataloader2 = DataLoader(train_samples2, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)

In [None]:
# Validation pairs, in df_dev row order, used by the accuracy evaluator.
dev_samples = [
    InputExample(texts=[pair_query, pair_title], label=int(pair_label))
    for pair_query, pair_title, pair_label in df_dev[
        ['query', 'product_title', 'label']
    ].itertuples(index=False, name=None)
]

In [None]:
# test_samples = []
# for (_, row) in df_test.iterrows():
#     test_samples.append(InputExample(texts=[row['query'], row['product_title']], label=int(row['label'])))

In [None]:
# Output directories for the two cross-validation models used for label cleaning.
model_save_path1 = 'models/model_jp_CleanLab_CV1'
model_save_path2 = 'models/model_jp_CleanLab_CV2'

# Cross-encoder initialised from the pretrained Japanese BERT checkpoint with a
# num_labels-way classification head.
model = CrossEncoder('cl-tohoku/bert-base-japanese-v2', num_labels=num_labels)
# Accuracy evaluator over the dev pairs; reused by every model.fit call below.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_samples, name='train-dev')

In [None]:
# Warm up the learning rate over the first 10% of all fold-1 training steps.
total_steps_cv1 = len(train_dataloader1) * num_epochs
warmup_steps = math.ceil(total_steps_cv1 * 0.1)

# Train the fold-1 model: evaluate with the dev evaluator every 500 steps and
# write the model to model_save_path1.
model.fit(
    train_dataloader=train_dataloader1,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=500,
    warmup_steps=warmup_steps,
    output_path=model_save_path1,
)

In [None]:
# Drop the reference to the fold-1 CrossEncoder and ask PyTorch to release its
# cached GPU memory before the saved checkpoint is reloaded for scoring.
del model
torch.cuda.empty_cache()

In [None]:
# Inference on train fold CV2 using the model trained on CV1: reload the saved
# checkpoint as a plain sequence-classification model so its logits can be read.
model = AutoModelForSequenceClassification.from_pretrained(model_save_path1).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_save_path1)

In [None]:
# Text pairs of fold 2, as plain Python lists in row order.
features_query = df_train2['query'].tolist()
features_product = df_train2['product_title'].tolist()

n_examples = len(features_query)
# Empty (0, num_labels) float array that the scoring loop stacks logits onto.
scores = np.zeros((0, num_labels))

In [None]:
# Score every (query, product title) pair of the current fold in batches.
# Logits are collected per batch and concatenated once at the end, so the cell
# no longer re-allocates `scores` on every iteration and re-running it does not
# append duplicate rows to a previous result.
model.eval()  # inference only; set explicitly so dropout is off whatever state the model is in
batch_logits = []
with torch.no_grad():
    for start in tqdm(range(0, n_examples, BATCH_SIZE)):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        stop = start + BATCH_SIZE
        features = tokenizer(
            features_query[start:stop],
            features_product[start:stop],
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        # Keep the 2-D (batch, num_labels) shape even for a single-row batch.
        batch_logits.append(model(**features).logits.cpu().numpy())

if batch_logits:
    # float64 matches the dtype the previous np.vstack-based accumulation produced.
    scores = np.concatenate(batch_logits, axis=0).astype(np.float64)
else:
    scores = np.empty((0, num_labels))

In [None]:
# Turn the logits into per-class probabilities for cleanlab.
pred_probs = softmax(scores, axis=1)

# Hand cleanlab a plain integer array rather than a pandas Series: df_train2
# keeps the non-contiguous index labels of train_df, while cleanlab works with
# positions that must line up row-by-row with pred_probs.
labels_train2 = df_train2['label'].astype(int).to_numpy()
assert len(labels_train2) == len(pred_probs), \
    "labels and pred_probs must have the same number of rows"

# Result: positions (0..len(df_train2)-1) of suspected label errors, ranked by
# self-confidence. These are positions, not df_train2 index labels.
ranked_label_issues_train2 = find_label_issues(
    labels_train2,
    pred_probs,
    return_indices_ranked_by="self_confidence",
)

In [None]:
# Release the scoring model and tokenizer and, like the other cleanup cells,
# ask PyTorch to free its cached GPU memory before the next model is trained.
del model, tokenizer
torch.cuda.empty_cache()

In [None]:
# Switch CV: train a fresh model on fold 2 (it is later used to score fold 1).
model = CrossEncoder('cl-tohoku/bert-base-japanese-v2', num_labels=num_labels)

# 10% of the fold-2 training steps are used for warm-up.
total_steps_cv2 = len(train_dataloader2) * num_epochs
warmup_steps = math.ceil(total_steps_cv2 * 0.1)

# Train the model
model.fit(
    train_dataloader=train_dataloader2,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=500,
    warmup_steps=warmup_steps,
    output_path=model_save_path2,
)

In [None]:
# Drop the reference to the fold-2 CrossEncoder and ask PyTorch to release its
# cached GPU memory before the saved checkpoint is reloaded for scoring.
del model
torch.cuda.empty_cache()

In [None]:
# Reload the checkpoint saved at model_save_path2 (with its tokenizer) as a plain
# sequence-classification model so its logits can be read.
model = AutoModelForSequenceClassification.from_pretrained(model_save_path2).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_save_path2)

In [None]:
# Text pairs of fold 1, as plain Python lists in row order.
features_query = df_train1['query'].tolist()
features_product = df_train1['product_title'].tolist()

n_examples = len(features_query)
# Empty (0, num_labels) float array that the scoring loop stacks logits onto.
scores = np.zeros((0, num_labels))

In [None]:
# Score every (query, product title) pair of the current fold in batches.
# Logits are collected per batch and concatenated once at the end, so the cell
# no longer re-allocates `scores` on every iteration and re-running it does not
# append duplicate rows to a previous result.
model.eval()  # inference only; set explicitly so dropout is off whatever state the model is in
batch_logits = []
with torch.no_grad():
    for start in tqdm(range(0, n_examples, BATCH_SIZE)):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        stop = start + BATCH_SIZE
        features = tokenizer(
            features_query[start:stop],
            features_product[start:stop],
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        # Keep the 2-D (batch, num_labels) shape even for a single-row batch.
        batch_logits.append(model(**features).logits.cpu().numpy())

if batch_logits:
    # float64 matches the dtype the previous np.vstack-based accumulation produced.
    scores = np.concatenate(batch_logits, axis=0).astype(np.float64)
else:
    scores = np.empty((0, num_labels))

In [None]:
# Turn the logits into per-class probabilities for cleanlab.
pred_probs = softmax(scores, axis=1)

# Hand cleanlab a plain integer array rather than a pandas Series: df_train1
# keeps the non-contiguous index labels of train_df, while cleanlab works with
# positions that must line up row-by-row with pred_probs.
labels_train1 = df_train1['label'].astype(int).to_numpy()
assert len(labels_train1) == len(pred_probs), \
    "labels and pred_probs must have the same number of rows"

# Result: positions (0..len(df_train1)-1) of suspected label errors, ranked by
# self-confidence. These are positions, not df_train1 index labels.
ranked_label_issues_train1 = find_label_issues(
    labels_train1,
    pred_probs,
    return_indices_ranked_by="self_confidence",
)

In [None]:
# Train a model without noisy data
df_train = pd.concat([df_train1, df_train2])

# find_label_issues reports *positions* inside the label array it was given
# (0..len(fold)-1), whereas df_train still carries the index labels inherited
# from train_df. Translate the positions of each fold into that fold's index
# labels before filtering; comparing raw positions against df_train.index would
# drop unrelated rows and keep the flagged ones.
noisy_idx_cv1 = df_train1.index.to_numpy()[np.asarray(ranked_label_issues_train1, dtype=int)]
noisy_idx_cv2 = df_train2.index.to_numpy()[np.asarray(ranked_label_issues_train2, dtype=int)]
label_issues_idx = noisy_idx_cv1.tolist() + noisy_idx_cv2.tolist()

# Keep only the rows whose index label was not flagged in either fold.
df_train_clean = df_train.loc[~df_train.index.isin(label_issues_idx)]

train_samples = [
    InputExample(texts=[query_text, title_text], label=int(target))
    for query_text, title_text, target in zip(
        df_train_clean['query'], df_train_clean['product_title'], df_train_clean['label']
    )
]

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)

In [None]:
# Calculate sample balance for two-phase learning: the number of label-0 rows to
# draw is the count of label-1 rows plus 10%, truncated to an integer.
n_label1 = df_train['label'].value_counts()[1]
BALANCED_SAMPLE_NUM = int(n_label1 + n_label1*0.1)
print(BALANCED_SAMPLE_NUM)

In [None]:
# Renumber rows 0..n-1 so that index labels equal positions; the iloc lookup at
# the end of this cell relies on that.
df_train = df_train.reset_index(drop=True)

# Draw a fixed-size, seeded random subset of the label-0 rows and keep every
# other row as is.
is_label0 = df_train.label == 0
df_train_sampled_exact = df_train[is_label0].sample(BALANCED_SAMPLE_NUM, random_state=SEED)
df_train_sampled_no_exact = df_train[~is_label0]

# Sampled label-0 rows first, followed by the remaining rows.
tgt_idx = df_train_sampled_exact.index.tolist() + df_train_sampled_no_exact.index.tolist()
df_train_balanced = df_train.iloc[tgt_idx]

In [None]:
# Wrap the balanced subset as labelled InputExamples for the first training phase.
train_samples_balanced = [
    InputExample(texts=[query_text, title_text], label=int(target))
    for query_text, title_text, target in zip(
        df_train_balanced['query'],
        df_train_balanced['product_title'],
        df_train_balanced['label'],
    )
]

train_dataloader_balanced = DataLoader(train_samples_balanced, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)

In [None]:
# Phase 1 of the two-phase training: start from the pretrained checkpoint and
# train on the class-balanced subset.
model = CrossEncoder('cl-tohoku/bert-base-japanese-v2', num_labels=num_labels)

model_save_path = 'models/task3_model_jp'

# Warm up over the first 10% of the balanced-phase training steps.
balanced_total_steps = len(train_dataloader_balanced) * num_epochs
warmup_steps = math.ceil(balanced_total_steps * 0.1)

# Train the model
model.fit(
    train_dataloader=train_dataloader_balanced,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=500,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

In [None]:
# Phase 2 of the two-phase training: continue training the same model on the
# training set built after label cleaning.
# Recompute the warm-up length from *this* dataloader (10% of its steps, as in
# every other fit cell) instead of reusing the value left over from the
# balanced phase.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=500,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

In [None]:
# Evaluation for test data (If needed)
# evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(test_samples, name='train-test')
# evaluator(model)