# 01-01 : ktrain Multi-Label Classification

## References

- [Toxic Comments: Multi-Label Text Classification](https://github.com/amaiya/ktrain/blob/master/examples/text/toxic_comments-fasttext.ipynb)

In [1]:
import os

# Ask TensorFlow to use the legacy Keras (tf-keras) implementation.
# This cell runs before the TensorFlow / ktrain imports that follow.
os.environ.update(TF_USE_LEGACY_KERAS="1")

In [14]:
import pandas as pd
import ktrain
import tensorflow as tf
from tensorflow import keras

In [3]:
# Root of the project data directory, relative to this notebook.
data_path = '../../data'

# Folder holding the labelled-tweet CSV splits.
input_path = data_path + '/input/labelled_tweets/csv_labels'

# One CSV per split, all following the same "<split>.csv" naming pattern.
train_input_file, test_input_file, val_input_file = (
    f'{input_path}/{split}.csv' for split in ('train', 'test', 'val')
)

## 1. Load Data

In [4]:
# Read the three splits in one pass; each CSV becomes its own DataFrame.
df_train, df_val, df_test = (
    pd.read_csv(csv_file)
    for csv_file in (train_input_file, val_input_file, test_input_file)
)

# Report (rows, columns) for every split, one line per split.
print(
    f'Train shape: {df_train.shape}',
    f'Val shape: {df_val.shape}',
    f'Test shape: {df_test.shape}',
    sep='\n',
)

Train shape: (6957, 3)
Val shape: (987, 3)
Test shape: (1977, 3)


In [5]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,ID,text,labels
0,1311981051720409089t,"@sandraburgess3 They have no idea , they cant ...",ineffective
1,1361403925845401601t,@stepheniscowboy Nvm I ’ ve had covid I ’ ve g...,unnecessary
2,1293488278361055233t,Coronavirus updates : Government partners with...,pharma
3,1305252218526990338t,@OANN U . K . Glaxo Smith Klein whistleblower ...,rushed
4,1376135683400687618t,"3 / horse "" AstraZeneca , not so much for the ...",ineffective pharma


## 2. Preprocessing

### 2.1. Labels to List

In [6]:
# Turn the space-separated `labels` string of every split into a list of labels.
# `.str.split()` with no separator splits on any run of whitespace, so doubled
# or leading/trailing spaces cannot produce empty-string labels (which
# `.str.split(' ')` would emit and which would then leak into the vocabulary).
for split_df in (df_train, df_test, df_val):
    split_df['labels_list'] = split_df['labels'].str.split()

In [7]:
# Check the `labels_list` column on the first five training rows.
df_train.head(n=5)

Unnamed: 0,ID,text,labels,labels_list
0,1311981051720409089t,"@sandraburgess3 They have no idea , they cant ...",ineffective,[ineffective]
1,1361403925845401601t,@stepheniscowboy Nvm I ’ ve had covid I ’ ve g...,unnecessary,[unnecessary]
2,1293488278361055233t,Coronavirus updates : Government partners with...,pharma,[pharma]
3,1305252218526990338t,@OANN U . K . Glaxo Smith Klein whistleblower ...,rushed,[rushed]
4,1376135683400687618t,"3 / horse "" AstraZeneca , not so much for the ...",ineffective pharma,"[ineffective, pharma]"


### 2.2. Multi-label Binarization

In [16]:
# Collect the label lists of all three splits into one ragged tensor
# (rows have different lengths because a tweet can carry several labels).
labels = tf.ragged.constant(
    pd.concat(
        [df_train['labels_list'], df_val['labels_list'], df_test['labels_list']]
    ).to_numpy()
)

# Fit a multi-hot string lookup on those labels.
# NOTE: with the layer defaults the vocabulary also contains an '[UNK]' entry,
# so the encoded vectors have one more slot than there are real labels.
labels_lookup = keras.layers.StringLookup(output_mode="multi_hot")
labels_lookup.adapt(labels)

# Display the learned vocabulary and its size.
vocab = labels_lookup.get_vocabulary()
print("Vocabulary:\n")
print(vocab)
print(f'Vocabulary size: {len(vocab)}')

Vocabulary:

['[UNK]', 'side-effect', 'ineffective', 'rushed', 'pharma', 'mandatory', 'unnecessary', 'none', 'political', 'conspiracy', 'ingredients', 'country', 'religious']
Vocabulary size: 13


In [23]:
# Apply the label lookup to every split (train, val and test) so that all of
# them carry the same multi-hot `labels_encoded` column; previously only the
# training frame was encoded even though the step is meant for all data sets.
# Each encoded row is a plain Python list with one float slot per vocabulary
# entry (including the '[UNK]' slot at index 0).
for split_df in (df_train, df_val, df_test):
    split_df['labels_encoded'] = (
        labels_lookup(tf.ragged.constant(split_df['labels_list'].values))
        .numpy()
        .tolist()
    )

df_train

Unnamed: 0,ID,text,labels,labels_list,labels_encoded
0,1311981051720409089t,"@sandraburgess3 They have no idea , they cant ...",ineffective,[ineffective],"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1361403925845401601t,@stepheniscowboy Nvm I ’ ve had covid I ’ ve g...,unnecessary,[unnecessary],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
2,1293488278361055233t,Coronavirus updates : Government partners with...,pharma,[pharma],"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1305252218526990338t,@OANN U . K . Glaxo Smith Klein whistleblower ...,rushed,[rushed],"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1376135683400687618t,"3 / horse "" AstraZeneca , not so much for the ...",ineffective pharma,"[ineffective, pharma]","[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
6952,1370403012842250241t,I don ’ t get the flu shot . Haven ’ t gotten ...,none,[none],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
6953,1337580201560125442t,Why are they rushing this vaccine out ? COVID ...,rushed side-effect,"[rushed, side-effect]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6954,1351838450706763777t,@SJJB55 ' Cases ' data is propaganda . Where /...,none,[none],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
6955,1333970612897808385t,@holly_go_litely @jmask @JustinTrudeau Moderna...,pharma,[pharma],"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
