#To use BERT-style models easily for any multi-label text classification project, it is advisable to one-hot encode the target variable if it is not already encoded, since the target variable is always multi-dimensional or categorical (that is, it has more than two categories).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Mount Google Drive under '/content/drive' so the CSV files stored there can
# be read by the following cells (the call prompts for an authorization code).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Load the three competition files from the mounted Drive folder:
# the labelled training set, the unlabelled test set and the sample submission.
train=pd.read_csv('/content/drive/My Drive/Train_health.csv')
test=pd.read_csv('/content/drive/My Drive/Test_health.csv')
sub = pd.read_csv('/content/drive/My Drive/SampleSubmission_health.csv')

In [None]:
# Summarise the training frame (dimensions plus missing values per column),
# then preview its first rows.
train_nulls = train.isnull().sum()
print('Train shape:', train.shape, 'and number of null values are:', train_nulls)
train.head()

Train shape: (616, 3) and number of null values are: ID       0
text     0
label    0
dtype: int64


Unnamed: 0,ID,text,label
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression
1,9JDAGUV3,Why do I get hallucinations?,Drugs
2,419WR1LQ,I am stresseed due to lack of financial suppor...,Depression
3,6UY7DX6Q,Why is life important?,Suicide
4,FYC0FTFB,How could I be helped to go through the depres...,Depression


In [None]:
# Summarise the test frame (dimensions plus missing values per column),
# then preview its first rows.
test_nulls = test.isnull().sum()
print('Test shape:', test.shape, 'and number of null values are:', test_nulls)
test.head()

Test shape: (309, 2) and number of null values are: ID      0
text    0
dtype: int64


Unnamed: 0,ID,text
0,02V56KMO,How to overcome bad feelings and emotions
1,03BMGTOK,I feel like giving up in life
2,03LZVFM6,I was so depressed feel like got no strength t...
3,0EPULUM5,I feel so low especially since I had no one to...
4,0GM4C5GD,can i be successful when I am a drug addict?


In [None]:
# Write the test frame to CSV without the index column.
# NOTE(review): presumably this is the file re-read as '/content/test_df.csv'
# in the prediction cell, which assumes the working directory is /content.
test.to_csv("test_df.csv", index= False)

#Performing One-Hot Encoding for the train label

In [None]:
# Pull out the string label column on its own; it is one-hot encoded next.
cate = train['label']
cate.head()

0    Depression
1         Drugs
2    Depression
3       Suicide
4    Depression
Name: label, dtype: object

In [None]:
from sklearn.preprocessing import OneHotEncoder  # not used in this cell; kept in case other cells rely on it

# One-hot encode the label column: one indicator column per distinct class.
# dtype=int pins the indicators to 0/1 integers. Newer pandas releases default
# to booleans, which would be saved as True/False in the CSV written later.
cate_one = pd.get_dummies(data=cate, dtype=int)
cate_one.head()

Unnamed: 0,Alcohol,Depression,Drugs,Suicide
0,0,1,0,0
1,0,0,1,0
2,0,1,0,0
3,0,0,0,1
4,0,1,0,0


In [None]:
# Check the dimensions of the encoded frame (rows, indicator columns).
cate_one.shape

(616, 4)

#Merging One-Hot encoded train label dataframe to train data

In [None]:
# Attach the indicator columns to the training frame; join aligns on the row
# index, which both frames share because cate_one was derived from train.
train_new =train.join(cate_one)
train_new.head()

Unnamed: 0,ID,text,label,Alcohol,Depression,Drugs,Suicide
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression,0,1,0,0
1,9JDAGUV3,Why do I get hallucinations?,Drugs,0,0,1,0
2,419WR1LQ,I am stresseed due to lack of financial suppor...,Depression,0,1,0,0
3,6UY7DX6Q,Why is life important?,Suicide,0,0,0,1
4,FYC0FTFB,How could I be helped to go through the depres...,Depression,0,1,0,0


#Now let's drop the original label column from the train dataset

In [None]:
# Drop the original string label now that its one-hot columns are present.
# errors='ignore' keeps the cell re-runnable: on a second run 'label' is
# already gone and a plain drop would raise KeyError.
train_new = train_new.drop(columns=['label'], errors='ignore')

#Hence, the new train dataset looks like this:

In [None]:
# Preview the training frame now that the string label column is removed.
train_new.head()

Unnamed: 0,ID,text,Alcohol,Depression,Drugs,Suicide
0,SUAVK39Z,I feel that it was better I dieAm happy,0,1,0,0
1,9JDAGUV3,Why do I get hallucinations?,0,0,1,0
2,419WR1LQ,I am stresseed due to lack of financial suppor...,0,1,0,0
3,6UY7DX6Q,Why is life important?,0,0,0,1
4,FYC0FTFB,How could I be helped to go through the depres...,0,1,0,0


In [None]:
# Confirm the dimensions of the encoded training frame.
train_new.shape

(616, 6)

In [None]:
# Persist the encoded training frame to CSV (relative to the current working
# directory, without the index column) so later cells can reload it.
train_new.to_csv('encoded_train.csv', index= False)

In [None]:
# Show the shapes of the encoded training frame and the test frame side by side.
train_new.shape, test.shape

((616, 6), (309, 2))

In [None]:
# Install the `transformers` package into the runtime (version not pinned).
!pip install transformers


In [None]:
# Install the `seqeval` package into the runtime (version not pinned).
!pip install seqeval


In [None]:
# Install the `tensorboardx` package into the runtime (version not pinned).
!pip install tensorboardx

In [None]:
# Install `simpletransformers`, which provides the MultiLabelClassificationModel
# imported further down (version not pinned).
!pip install simpletransformers

In [None]:
# NOTE(review): `pip help install` appears to only print the help text of the
# install sub-command, so nothing is installed from './' here - confirm whether
# this cell is a leftover that can be removed.
!pip help install -v --no-cache-dir ./

In [None]:
# NOTE(review): `pip help install` appears to only print the help text of the
# install sub-command and installs nothing - confirm whether this cell is a
# leftover that can be removed.
!pip help install -v --no-cache-dir

#Multilabel Classification

#The dataset contains a column for each category (Alcohol, Depression, Drugs, Suicide) with a 1 or 0 indicating whether or not the text belongs to the corresponding category.


In [None]:
# Reload the encoded training data from CSV and preview it.
# NOTE(review): the file was saved with a relative path but is read from
# '/content/' - this assumes the working directory is /content; confirm.
train_df = pd.read_csv('/content/encoded_train.csv')
train_df.head()

Unnamed: 0,ID,text,Alcohol,Depression,Drugs,Suicide
0,SUAVK39Z,I feel that it was better I dieAm happy,0,1,0,0
1,9JDAGUV3,Why do I get hallucinations?,0,0,1,0
2,419WR1LQ,I am stresseed due to lack of financial suppor...,0,1,0,0
3,6UY7DX6Q,Why is life important?,0,0,0,1
4,FYC0FTFB,How could I be helped to go through the depres...,0,1,0,0


#However, Simple Transformers requires a column `labels`, which contains multi-hot encoded lists of labels, as well as a column `text`, which contains all the text.

In [None]:
# Load the same encoded CSV into `df`; the following cells add the `labels`
# column to this frame and split it into training and evaluation sets.
df = pd.read_csv('/content/encoded_train.csv')
df.head()

Unnamed: 0,ID,text,Alcohol,Depression,Drugs,Suicide
0,SUAVK39Z,I feel that it was better I dieAm happy,0,1,0,0
1,9JDAGUV3,Why do I get hallucinations?,0,0,1,0
2,419WR1LQ,I am stresseed due to lack of financial suppor...,0,1,0,0
3,6UY7DX6Q,Why is life important?,0,0,0,1
4,FYC0FTFB,How could I be helped to go through the depres...,0,1,0,0


In [None]:
# Order of the classes inside each multi-hot label tuple. The prediction cell
# names its output columns in this same order.
label_columns = ['Depression', 'Alcohol', 'Suicide', 'Drugs']

# One tuple of 0/1 flags per row, in label_columns order.
df['labels'] = list(zip(*(df[col].tolist() for col in label_columns)))

# Flatten newlines to spaces. The cleaned text is kept in 'text_comment' as
# before, and is also written back to 'text': that is the column Simple
# Transformers reads, and the prediction cell applies the same cleaning to
# the test text, so training and prediction inputs are now treated alike.
df['text_comment'] = df['text'].apply(lambda x: x.replace('\n', ' '))
df['text'] = df['text_comment']

df.head()

Unnamed: 0,ID,text,Alcohol,Depression,Drugs,Suicide,labels,text_comment
0,SUAVK39Z,I feel that it was better I dieAm happy,0,1,0,0,"(1, 0, 0, 0)",I feel that it was better I dieAm happy
1,9JDAGUV3,Why do I get hallucinations?,0,0,1,0,"(0, 0, 0, 1)",Why do I get hallucinations?
2,419WR1LQ,I am stresseed due to lack of financial suppor...,0,1,0,0,"(1, 0, 0, 0)",I am stresseed due to lack of financial suppor...
3,6UY7DX6Q,Why is life important?,0,0,0,1,"(0, 0, 1, 0)",Why is life important?
4,FYC0FTFB,How could I be helped to go through the depres...,0,1,0,0,"(1, 0, 0, 0)",How could I be helped to go through the depres...


#Let’s split the df into train and eval datasets so we can validate the model easily.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the evaluation metrics, reproducible across runs.
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

#Now the dataset is ready for use!

Multilabel Classification Model


In [None]:
from simpletransformers.classification import MultiLabelClassificationModel

# Training options handed to Simple Transformers through `args`.
roberta_args = {
    'train_batch_size': 2,
    'gradient_accumulation_steps': 16,
    'learning_rate': 3e-5,
    'num_train_epochs': 3,
    'max_seq_length': 512,
}

# roberta-base with one output per label column (num_labels=4), CUDA disabled.
model = MultiLabelClassificationModel(
    'roberta',
    'roberta-base',
    num_labels=4,
    use_cuda=False,
    args=roberta_args,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [None]:
from multiprocessing import cpu_count

# Reference copy of an argument dictionary in the shape the model keeps as
# `self.args`. The original cell assigned to `self.args` and called cpu_count()
# without importing it, so it raised NameError when executed at cell level.
# It is stored under a plain name instead; the visible cells do not pass this
# dictionary to the model, so it is for reference only.
default_args = {
    "output_dir": "outputs/",
    "cache_dir": "cache_dir/",

    "fp16": True,
    "fp16_opt_level": "O1",
    "max_seq_length": 128,
    "train_batch_size": 8,
    "gradient_accumulation_steps": 1,
    "eval_batch_size": 8,
    "num_train_epochs": 1,
    "weight_decay": 0,
    "learning_rate": 4e-5,
    "adam_epsilon": 1e-8,
    "warmup_ratio": 0.06,
    "warmup_steps": 0,
    "max_grad_norm": 1.0,

    "logging_steps": 50,
    "save_steps": 2000,

    "overwrite_output_dir": False,
    "reprocess_input_data": False,
    "evaluate_during_training": False,
    # Leave two cores free when more than two are available, else use one.
    "process_count": cpu_count() - 2 if cpu_count() > 2 else 1,
    "n_gpu": 1,
}

In [None]:
# Optionally swap in a previously saved XLNet model.
# 'path_to_model/' looks like a placeholder, so only load from it when the
# directory actually exists. Otherwise keep the model created earlier instead
# of aborting the run on a missing path.
saved_model_dir = 'path_to_model/'
if os.path.isdir(saved_model_dir):
    model = MultiLabelClassificationModel('xlnet', saved_model_dir, num_labels=4)

#Training

In [None]:
# Fine-tune the model on the training split.
# NOTE(review): `model` is whichever model the preceding cells assigned last -
# confirm it is the intended one before running this cell.
model.train_model(train_df)

This will train the model on the training data. You can also change the hyperparameters by passing a dict containing the relevant attributes to the train_model method. Note that these modifications will persist even after training is completed.
The train_model method will create a checkpoint (save) of the model at every nth step where n is self.args['save_steps']. Upon completion of training, the final model will be saved to self.args['output_dir'].

#Evaluation

In [None]:
# Evaluate on the held-out split. Judging by the names, the call returns the
# metrics, the model outputs per row and the wrongly predicted examples.
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

#Prediction/Testing

In [None]:
test_df = pd.read_csv('/content/test_df.csv')

# Output columns, in the same order as the multi-hot label tuples used for training.
label_columns = ['Depression', 'Alcohol', 'Suicide', 'Drugs']

# Flatten newlines to spaces before handing the texts to the model.
to_predict = test_df.text.apply(lambda x: x.replace('\n', ' ')).tolist()
preds, outputs = model.predict(to_predict)

# One column of model outputs per label, with the test IDs placed first.
sub_df = pd.DataFrame(outputs, columns=label_columns)
sub_df['ID'] = test_df['ID']
sub_df = sub_df[['ID'] + label_columns]

# Make sure the target folder exists; it may be missing if no training run
# has written to it in this session.
os.makedirs('outputs', exist_ok=True)
sub_df.to_csv('outputs/submission.csv', index=False)
