# 01-01 : ktrain Multi-Label Classification

## References

- [Toxic Comments: Multi-Label Text Classification](https://github.com/amaiya/ktrain/blob/master/examples/text/toxic_comments-fasttext.ipynb)

In [1]:
import os

# Set in the first cell, ahead of the `import ktrain` cell that follows.
# NOTE(review): presumably this makes TensorFlow use the legacy Keras API
# that ktrain expects and must be set before TensorFlow is first imported —
# confirm against the ktrain / TensorFlow documentation.
os.environ.update({"TF_USE_LEGACY_KERAS": "1"})

In [11]:
import pandas as pd
import ktrain
from ktrain import text
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
# Relative location of the data directory and of the labelled-tweet CSV splits.
data_path = '../../data'
input_path = data_path + '/input/labelled_tweets/csv_labels'

# One CSV file per split, all living in the same folder.
train_input_file, test_input_file, val_input_file = (
    f'{input_path}/{split}.csv' for split in ('train', 'test', 'val')
)

## 1. Load Data

In [4]:
# Read the train / validation / test splits (in that order) into data frames.
df_train, df_val, df_test = (
    pd.read_csv(csv_file)
    for csv_file in (train_input_file, val_input_file, test_input_file)
)

# Report the (rows, columns) shape of each split, one per line.
print(f'Train shape: {df_train.shape}',
      f'Val shape: {df_val.shape}',
      f'Test shape: {df_test.shape}',
      sep='\n')

Train shape: (6957, 3)
Val shape: (987, 3)
Test shape: (1977, 3)


In [5]:
# Peek at the first five rows of the training split as loaded from CSV.
df_train.head(n=5)

Unnamed: 0,ID,text,labels
0,1311981051720409089t,"@sandraburgess3 They have no idea , they cant ...",ineffective
1,1361403925845401601t,@stepheniscowboy Nvm I ’ ve had covid I ’ ve g...,unnecessary
2,1293488278361055233t,Coronavirus updates : Government partners with...,pharma
3,1305252218526990338t,@OANN U . K . Glaxo Smith Klein whistleblower ...,rushed
4,1376135683400687618t,"3 / horse "" AstraZeneca , not so much for the ...",ineffective pharma


## 2. Preprocessing

### 2.1. Labels to List

In [6]:
# Turn the space-separated `labels` string of every split into a list of
# label tokens stored in a new `labels_list` column.
#
# `str.split()` with no separator splits on runs of whitespace and discards
# empty tokens, so a doubled, leading or trailing space cannot introduce an
# empty-string label (which would later be learned as a spurious class).
# For cleanly single-space-separated labels the result is unchanged.
for split_df in (df_train, df_test, df_val):
    split_df['labels_list'] = split_df['labels'].str.split()

In [7]:
# Check the first five training rows now that `labels_list` has been added.
df_train.head(n=5)

Unnamed: 0,ID,text,labels,labels_list
0,1311981051720409089t,"@sandraburgess3 They have no idea , they cant ...",ineffective,[ineffective]
1,1361403925845401601t,@stepheniscowboy Nvm I ’ ve had covid I ’ ve g...,unnecessary,[unnecessary]
2,1293488278361055233t,Coronavirus updates : Government partners with...,pharma,[pharma]
3,1305252218526990338t,@OANN U . K . Glaxo Smith Klein whistleblower ...,rushed,[rushed]
4,1376135683400687618t,"3 / horse "" AstraZeneca , not so much for the ...",ineffective pharma,"[ineffective, pharma]"


### 2.2. Multi-label Binarization

In [8]:
# Pool the label lists of all three splits (train, val, test — in that order)
# so the binarizer's vocabulary covers every label that occurs anywhere.
labels = pd.concat([split_df.labels_list
                    for split_df in (df_train, df_val, df_test)])

# Create the binarizer and learn the label vocabulary in a single step;
# the fitted object returned by fit() is what gets kept as `labels_lookup`.
labels_lookup = MultiLabelBinarizer().fit(labels)

# Display the learned classes and how many there are.
vocab = labels_lookup.classes_
print(f'Vocabulary size: {len(vocab)}')
print(f'Vocabulary: {vocab}')



Vocabulary size: 12
Vocabulary: ['conspiracy' 'country' 'ineffective' 'ingredients' 'mandatory' 'none'
 'pharma' 'political' 'religious' 'rushed' 'side-effect' 'unnecessary']


In [9]:
def add_label_columns(df, binarizer):
    """Return a copy of ``df`` with one 0/1 indicator column per label class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``labels_list`` column (a list of labels per row).
    binarizer : fitted binarizer
        Object exposing ``classes_`` and ``transform``; each class becomes
        one indicator column, in ``classes_`` order.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the indicator columns, on the same
        index as ``df``.

    Notes
    -----
    Indicator columns left over from an earlier run of this cell are dropped
    before joining. Without that, re-running the cell makes ``DataFrame.join``
    raise ``ValueError`` because the column names overlap.
    """
    label_columns = list(binarizer.classes_)
    encoded = pd.DataFrame(binarizer.transform(df['labels_list']),
                           columns=label_columns,
                           index=df.index)
    # errors='ignore' makes the drop a no-op on the first run
    return df.drop(columns=label_columns, errors='ignore').join(encoded)


# add the binary label indicator columns to each data frame
df_train = add_label_columns(df_train, labels_lookup)
df_val = add_label_columns(df_val, labels_lookup)
df_test = add_label_columns(df_test, labels_lookup)

In [10]:
# Inspect the first five test rows, including the new label indicator columns.
df_test.head(n=5)

Unnamed: 0,ID,text,labels,labels_list,conspiracy,country,ineffective,ingredients,mandatory,none,pharma,political,religious,rushed,side-effect,unnecessary
0,1329097028845105155t,@nbc4i Please everyone research Pfizer ’ s his...,pharma,[pharma],0,0,0,0,0,0,1,0,0,0,0,0
1,1340017965828349952t,EVEN WITH A VACCINE IT IS STILL NOT ENOUGH . T...,conspiracy political,"[conspiracy, political]",1,0,0,0,0,0,0,1,0,0,0,0
2,1329126491649683459t,@ianbrown @DarrenPlymouth It ’ s the unknown l...,side-effect,[side-effect],0,0,0,0,0,0,0,0,0,0,1,0
3,1285666197003018240t,@FACT_Exeter @K3T3R @respect65 Back to my orig...,rushed,[rushed],0,0,0,0,0,0,0,0,0,1,0,0
4,1284814348679028737t,@DrJasonJohnson IMHO optimistic talk about a v...,ineffective,[ineffective],0,0,1,0,0,0,0,0,0,0,0,0
