In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Print the full path of each file found anywhere below the Kaggle input directory.
for path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/intent-detection-by-sinch/sample.csv
/kaggle/input/intent-detection-by-sinch/y_train.csv
/kaggle/input/intent-detection-by-sinch/X_test.npy
/kaggle/input/intent-detection-by-sinch/X_train.npy


In [None]:
!pip install transformers==4.19.0
!pip install datasets

## Imports and data loading

In [None]:
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset

from transformers import AutoModelForSequenceClassification, RobertaForSequenceClassification, RobertaConfig
from transformers import TrainerCallback, Trainer, TrainingArguments
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers.trainer_utils import IntervalStrategy
from datasets import load_metric

In [None]:
# Competition inputs: two .npy feature matrices and a CSV holding the training labels.
data_folder = '/kaggle/input/intent-detection-by-sinch/'

X_train, X_test = (
    np.load(os.path.join(data_folder, file_name))
    for file_name in ('X_train.npy', 'X_test.npy')
)

# The label column in y_train.csv is named "Predicted".
y_train = pd.read_csv(os.path.join(data_folder, 'y_train.csv'))['Predicted'].to_numpy()

In [None]:
# Sanity check: show the shape of each loaded array, one per line.
for loaded_array in (X_train, X_test, y_train):
    print(loaded_array.shape)

(1663, 768)
(713, 768)
(1663,)


## Setting parameters

In [None]:
# Use the GPU when one is present, otherwise fall back to the CPU so the notebook
# still runs (slowly) in a session without an accelerator.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Configuration dictionary later turned into a RobertaConfig.
params = {
    "model_type": "roberta",
    # Must equal the width of the input feature vectors (768 in the shapes printed above).
    "hidden_size": 768,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "return_dict": True,
    # NOTE(review): decoder mode and cross-attention are unusual for a plain sequence
    # classifier - confirm they are intentional before changing them, since they
    # affect the parameter set stored in saved checkpoints.
    "is_decoder": True,
    "add_cross_attention": True,
    "architectures": ["RobertaForSequenceClassification"],
    # Number of target classes predicted by the classification head.
    "num_labels": 45,
    "problem_type": "single_label_classification",
    "intermediate_size": 3072,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "attention_probs_dropout_prob": 0.1,
    "max_position_embeddings": 512,
    "initializer_range": 0.02,
    "layer_norm_eps": 1e-05,
    "classifier_dropout": 0.1,
    "pad_token_id": 1,
    "bos_token_id": 0,
    "eos_token_id": 2,
}

## Model initialization, dataset and callback definitions

In [None]:
# Build the model from the configuration alone (no checkpoint is read here),
# then place it on the selected device.
config = RobertaConfig.from_dict(params)
model = AutoModelForSequenceClassification.from_config(config)
model = model.to(DEVICE)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset pairing one feature vector with its label.

    Args:
        embedding: Indexable collection of feature vectors, one entry per sample
            (the notebook passes a 2-D NumPy array).
        label: Indexable collection of labels aligned with ``embedding``.

    Raises:
        ValueError: If ``embedding`` and ``label`` differ in length.
    """

    def __init__(self, embedding, label):
        if len(embedding) != len(label):
            raise ValueError(
                f"embedding and label must have the same length, "
                f"got {len(embedding)} and {len(label)}"
            )
        self.embedding = embedding
        self.labels = label

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with ``inputs_embeds`` and ``labels``.

        ``inputs_embeds`` is a CPU tensor with a leading axis of size 1 added in
        front of the stored vector; ``labels`` is the stored label, unchanged.
        """
        # Build the tensor directly on the CPU. The previous version copied every
        # sample to the GPU and immediately back, which only cost time and made the
        # dataset unusable without CUDA; the Trainer moves collated batches to the
        # model's device on its own.
        input_embeds = torch.tensor(self.embedding[idx]).unsqueeze(0)
        return {'inputs_embeds': input_embeds, 'labels': self.labels[idx]}
    
    
class CustomCallback(TrainerCallback):
    """Trainer callback that only lets a checkpoint be saved when the tracked metric improves.

    At the end of each epoch it requests logging/evaluation according to the
    configured strategies. After each evaluation it compares the tracked metric
    with ``state.best_metric``: on improvement the previous best checkpoint
    directory is removed and saving is enabled, otherwise saving is disabled.
    """

    def on_epoch_end(self, args, state, control, **kwargs):
        """Request logging and/or evaluation when the corresponding strategy is per-epoch."""
        # Log
        if args.logging_strategy == IntervalStrategy.EPOCH:
            control.should_log = True

        # Evaluate (only once the configured evaluation delay has elapsed)
        if args.evaluation_strategy == IntervalStrategy.EPOCH and args.eval_delay <= state.epoch:
            control.should_evaluate = True

        return control

    def check_metric_value(self, args, state, control, metric_value):
        """Set ``control.should_save`` depending on whether ``metric_value`` beats the best so far.

        Args:
            args: Training arguments; ``greater_is_better`` picks the comparison direction.
            state: Trainer state providing ``best_metric`` and ``best_model_checkpoint``.
            control: Trainer control object whose ``should_save`` flag is updated.
            metric_value: Metric value produced by the latest evaluation.

        Returns:
            The updated ``control`` object.
        """
        import shutil  # local import keeps this block self-contained

        # Honour the configured direction instead of always assuming "greater is better".
        operator = np.greater if args.greater_is_better else np.less

        improved = state.best_metric is None or bool(operator(metric_value, state.best_metric))
        if improved:
            previous_best = state.best_model_checkpoint
            if previous_best is not None and os.path.isdir(previous_best):
                # Best-effort removal of the superseded checkpoint, including any
                # nested directories; a failure here must not interrupt training.
                shutil.rmtree(previous_best, ignore_errors=True)

            control.should_save = True
        else:
            control.should_save = False

        return control

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """Decide after an evaluation whether the upcoming checkpoint save should happen.

        Returns the updated ``control``, or ``None`` (leaving ``control`` untouched)
        when the tracked metric is not configured or missing from ``metrics``.
        """
        import logging  # local import keeps this block self-contained

        log = logging.getLogger(__name__)

        metric_to_check = args.metric_for_best_model
        if metric_to_check is None:
            log.warning(
                "CustomCallback requires metric_for_best_model to be set; "
                "checkpoint selection is disabled"
            )
            return

        if not metric_to_check.startswith("eval_"):
            metric_to_check = f"eval_{metric_to_check}"
        metric_value = metrics.get(metric_to_check)

        if metric_value is None:
            log.warning(
                f"CustomCallback requires metric_for_best_model, but did not find {metric_to_check}"
                " so checkpoint selection is disabled"
            )
            return

        control = self.check_metric_value(args, state, control, metric_value)
        # The evaluation request has been served; clear the flag. (`should_validate`,
        # used previously, is not a flag that exists on the control object.)
        control.should_evaluate = False
        return control
        
    
# Hold out 20% of the labelled data for validation. The fixed seed makes the split,
# and therefore the reported validation scores, reproducible across runs.
# NOTE: this rebinds X_train / y_train to the reduced training portion, so re-running
# this cell without reloading the data splits the already-reduced set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train.shape, X_val.shape

((1330, 768), (333, 768))

In [None]:
# Wrap the training and validation splits in dataset objects for the Trainer.
train, val = CustomDataset(X_train, y_train), CustomDataset(X_val, y_val)

## Training

In [None]:
def compute_metrics(eval_pred):
    """Score evaluation output with the module-level ``metric`` object.

    Args:
        eval_pred: Two-element iterable ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        ``logits`` against ``labels``, using weighted averaging.
    """
    class_scores, references = eval_pred
    # The predicted class is the index of the highest score along the last axis.
    predicted_classes = np.argmax(class_scores, axis=-1)
    return metric.compute(
        predictions=predicted_classes,
        references=references,
        average='weighted',
    )


# Schedule length, expressed in epochs so it stays valid for any dataset or batch size.
NUM_EPOCHS = 350
WARMUP_EPOCHS = 10

training_args = TrainingArguments(
    num_train_epochs=NUM_EPOCHS,
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    greater_is_better=True,
)
metric = load_metric("f1")

# Optimizer steps per epoch, derived from the actual dataset and batch settings rather
# than a fixed 167 (which only holds for 1330 samples at a total batch size of 8).
# Ceiling division: the last, possibly smaller, batch still counts as a step.
batches_per_epoch = -(-len(train) // training_args.train_batch_size)
steps_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)

optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.001, no_deprecation_warning=True)
# Linear warmup over the first WARMUP_EPOCHS epochs, then linear decay to the end of training.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    WARMUP_EPOCHS * steps_per_epoch,
    NUM_EPOCHS * steps_per_epoch,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    compute_metrics=compute_metrics,
    callbacks=[CustomCallback],
    optimizers=(optimizer, scheduler),
)
# Presumably meant to silence the "estimate_tokens" warning - TODO confirm that setting
# this on the Trainer (rather than on the model) has any effect.
setattr(trainer, "warnings_issued", {"estimate_tokens": True})
trainer.train()


In [None]:
class CustomEvalDataset(Dataset):
    """Label-free dataset that serves stored feature vectors one at a time.

    Args:
        embedding: Indexable collection of feature vectors, one entry per sample.
    """

    def __init__(self, embedding):
        self.embedding = embedding

    def __len__(self):
        """Return the number of stored vectors."""
        return len(self.embedding)

    def __getitem__(self, idx):
        """Return ``{'inputs_embeds': tensor}`` with a size-1 axis added in front of vector ``idx``."""
        vector = torch.tensor(self.embedding[idx])
        return {'inputs_embeds': vector.unsqueeze(0)}


# Pick the checkpoint to load explicitly. `os.listdir` returns entries in arbitrary
# order and the output directory may also hold non-checkpoint entries (for example a
# logging sub-directory), so taking "the first entry" is not reliable.
# CustomCallback only allows a save when the tracked metric improves, so the
# checkpoint with the highest step number is the best one.
trainer_output_dir = '/kaggle/working/test_trainer'
checkpoint_by_step = {}
for entry in os.listdir(trainer_output_dir):
    prefix, sep, step = entry.rpartition('-')
    if prefix == 'checkpoint' and step.isdigit() and os.path.isdir(os.path.join(trainer_output_dir, entry)):
        checkpoint_by_step[int(step)] = entry

if not checkpoint_by_step:
    raise FileNotFoundError(f"No 'checkpoint-<step>' directory found in {trainer_output_dir}")

best_checkpoint = os.path.join(trainer_output_dir, checkpoint_by_step[max(checkpoint_by_step)])

eval_model = RobertaForSequenceClassification.from_pretrained(best_checkpoint, **params)
eval_trainer = Trainer(model=eval_model)

# Reload the raw test features so this cell can be re-run after X_test has been rebound below.
X_test = np.load(os.path.join(data_folder, 'X_test.npy'))
X_test = CustomEvalDataset(X_test)
result = eval_trainer.predict(X_test)

In [None]:
# Shape of the per-row arg-max over the prediction scores.
np.argmax(result.predictions, axis=1).shape

In [None]:
# Predicted class index for every test row: position of the highest score in each row.
labels = np.argmax(result.predictions, axis=1)

In [None]:
# Submission table: a running 0-based Id next to each predicted class index.
y_test = pd.DataFrame({'Id': list(range(len(labels))), 'Predicted': labels})
y_test.head()

In [None]:
# Write the submission file into the current working directory, without the index column.
y_test.to_csv(path_or_buf='y_test.csv', index=False)