# Multilabel Classification Model RoBERTa

Install necessary packages (should be performed once)

In [None]:
# One-time environment setup: uncomment the lines below and run this cell once
# to install the packages this notebook depends on.
# NOTE(review): no versions are pinned here; inside a notebook `%pip install`
# targets the running kernel's environment, which `!pip` is not guaranteed to do.
#!pip install simpletransformers
#!pip install transformers
#!pip install seqeval
#!pip install tensorboardx
#!pip install torch



Import the functions and packages required by the model

In [None]:
import logging
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
# Cap the "transformers" logger at WARNING so that only warnings and errors
# from that logger are reported.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Request INFO level for the root logger configuration.
logging.basicConfig(level=logging.INFO)

In [None]:
from simpletransformers.classification import (
    MultiLabelClassificationModel, MultiLabelClassificationArgs
)


### Data preparation for model

The dataframe passed to the model must have 2 columns:
  * 'labels' - the values of all label classes for the comment
  * 'text' - the text of the comment (choose the appropriate column of the train dataset here: with or without stop-words, with or without noise, etc.)





In [None]:

# Label columns, in the order in which they are packed into the 'labels' tuple.
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

df = pd.read_csv("data/df_cleaned.csv")

# One tuple per row holding the six label values, in LABEL_COLUMNS order.
df['labels'] = list(zip(*(df[col].tolist() for col in LABEL_COLUMNS)))

# read_csv parses empty fields as NaN by default, so a row whose cleaned text is
# empty (presumably possible once stop-words are removed) would otherwise be fed
# to the model as a missing value instead of an empty string. Normalise such rows
# to '' and make sure every entry is a str.
df['text'] = df['clean_comments_without_stop_w'].fillna('').astype(str)

### Data split into train and eval datasets


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
train_df, eval_df = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

## Model

### Definition of model arguments:
   * "output_dir" (optional): The directory where model files will be saved. If not given, self.args['output_dir'] will be used.
   * "cache_dir" (optional): The directory where cached files will be saved. If not given, self.args['cache_dir'] will be used.


In [None]:

# Arguments handed to the simpletransformers model (passed as `args=sargs`).
sargs = dict(
    # Where model files and cached files are written.
    output_dir="outputs_full_new/",
    cache_dir="cache_dir_full_new/",
    # Small per-step batch combined with gradient accumulation.
    train_batch_size=2,
    gradient_accumulation_steps=16,
    # Optimisation schedule.
    learning_rate=3e-5,
    num_train_epochs=3,
    # Maximum sequence length setting passed to the model.
    max_seq_length=128,
)

### Model initialization with necessary parameters

When we train the model for the first time, we use the pretrained model 'roberta-base' from the library
* num_labels=6, because every comment is classified against 6 labels (multilabel classification)
* use_cuda=False - always set this when no GPU is used for processing


In [None]:
# Build the multilabel classifier from the 'roberta-base' weights.
# One output per label column (6 in total); CUDA is explicitly disabled.
model = MultiLabelClassificationModel(
    model_type='roberta',
    model_name='roberta-base',
    num_labels=6,
    args=sargs,
    use_cuda=False,
)

When we already have a trained model that we want to use, we load it from the default output path of the Transformer model, 'outputs/'.

By default the model is saved in 'outputs/pytorch_model.bin'

In [None]:
# Optional step: replace the freshly initialised model with a previously trained
# one stored on disk.
# NOTE(review): `sargs["output_dir"]` is "outputs_full_new/", so models trained by
# this notebook are not written to 'outputs/' — confirm which directory is intended.
PRETRAINED_DIR = 'outputs/'

if os.path.isdir(PRETRAINED_DIR):
    model = MultiLabelClassificationModel('roberta', PRETRAINED_DIR,
                                          num_labels=6, args=sargs,
                                          use_cuda=False)
else:
    # Without this guard a fresh "Restart & Run All" fails here whenever no
    # trained model exists yet; in that case keep the model created earlier.
    logging.info("No trained model found in %r; keeping the current model.", PRETRAINED_DIR)

### Training of the model with train_model()

In [None]:
# Fine-tune the model on the training split (uses the settings in `sargs`
# that were supplied when the model was created).
model.train_model(train_df=train_df)

### Evaluation of the model with eval_model()

Returns:
* result: Dictionary containing evaluation results. By default the Label ranking average precision (LRAP) and loss are reported for multilabel classification. We also compute roc_auc_score.
* model_outputs: List of model outputs for each row in eval_df
* wrong_preds: List of objects corresponding to each incorrect prediction by the model


In [None]:
# Evaluate on the held-out split. The extra metric is registered under the
# keyword "acc", but the function supplied is roc_auc_score (ROC AUC, not accuracy).
result, model_outputs, wrong_predictions = model.eval_model(
    eval_df=eval_df,
    acc=roc_auc_score,
)

#### Saving wrong predictions to .csv file for analysis

In [None]:
# Persist the wrong predictions returned by eval_model() for offline analysis.
w_pred = pd.DataFrame(data=wrong_predictions)
w_pred.to_csv(path_or_buf='wrong_pred.csv', index=False)

#### Saving model outputs to .csv file for analysis

In [None]:
# Persist the per-row model outputs returned by eval_model() for offline analysis.
mod_out = pd.DataFrame(data=model_outputs)
mod_out.to_csv(path_or_buf='mod_out.csv', index=False)

### Making predictions on (unlabelled) data with predict()

Load the test data set without labels

In [None]:
# Read the unlabelled test set that the submission is generated from.
test_for_sub = pd.read_csv(filepath_or_buffer='data/df_test_cleaned.csv')

* Preparation of list of comments for model.predict() function
* Run of model.predict() function

In [None]:
# Collect the comments to score as a plain Python list.
# NOTE(review): training used the 'clean_comments_without_stop_w' column, while
# prediction reads 'comment_text' — confirm both hold equally pre-processed text.
to_predict = test_for_sub['comment_text'].tolist()

# predict() returns two values; `outputs` is what the submission file is built from.
preds, outputs = model.predict(to_predict)

Preparation of the file for submission to Kaggle

In [None]:
# Build the submission table: one column per label, filled from the model
# outputs, with the test-set 'id' placed as the first column.
submission_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
sub_df = pd.DataFrame(outputs, columns=submission_labels)
sub_df.insert(0, 'id', test_for_sub['id'])

# Write without the index so the file holds only 'id' plus the label columns.
sub_df.to_csv('submission.csv', index=False)