In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
import string
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
import pickle

In [2]:
## store the embeddings and one hot encodings using np.save()

In [3]:
X_train,y_train,X_test,y_test = np.load('embed_train.npy'),np.load('labels_train.npy'),np.load('embed_test.npy'),np.load('labels_test.npy')

In [4]:
alphabets = list(string.ascii_uppercase)

In [5]:
# store both the multi label binarizer models

In [6]:
with open('mlb.pkl','rb') as file:
    mlb = pickle.load(file)
with open('mlb_letter.pkl','rb') as file:
    mlb_letter = pickle.load(file)

In [7]:
#y_train_letter
y_labels_train = mlb.inverse_transform(y_train)
y_labels_test = mlb.inverse_transform(y_test)

In [8]:
y_starter_train = [tuple(s[0] for s in tup) for tup in y_labels_train]
y_starter_test = [tuple(s[0] for s in tup) for tup in y_labels_test]

In [9]:
y_train_letter = mlb_letter.transform(y_starter_train)
y_test_letter = mlb_letter.transform(y_starter_test)

In [10]:
gc.collect()

0

In [11]:
###  The first stage begins ###
from skmultilearn.adapt import MLARAM

classifier = MLARAM(threshold=0.05, vigilance=0.95)
classifier.fit(X_train, y_train_letter)

In [12]:
prediction = classifier.predict(X_test[:100])

In [13]:
pred_1 = prediction

In [14]:
from sklearn.metrics import fbeta_score
print(f"Score is {fbeta_score(prediction,y_test_letter[:100],beta = 2,average = 'micro')}")

Score is 0.9096385542168675


In [None]:
### The first stage ends ###

In [None]:
### The second stage begins ###

In [15]:
def filter_data_points(embeddings,labels, labels_to_check):
    #choose rows who have atleast one entry in 'labels_to_check' and choose labels data only from 'labels_to_check' columns
    interested_rows = np.any(labels[:, labels_to_check], axis=1)
    interested_labels = labels[interested_rows]
    filtered_labels = np.zeros((np.sum(interested_rows),labels.shape[1]))
    filtered_labels[:,labels_to_check] += interested_labels[:,labels_to_check] 
    filtered_embeddings = embeddings[interested_rows]
    return filtered_embeddings, filtered_labels

In [16]:
#Training a model for each alphabet
mlabels = mlb.classes_
models = {}
for letter in tqdm(alphabets):
    # X should be embeddings, y should be multilabels
    indices  = [ind for ind,s in enumerate(mlabels) if s.startswith(letter)]
    filtered_data,filtered_labels = filter_data_points(X_train,y_train,indices)
    if len(indices) == 0:
        models[letter] = []
        continue
    elif len(indices) == 1:
        mask = np.zeros((1,len(mlabels)))
        mask[0,indices[0]] = 1
        models[letter] = mask
    else:
        model = MultiOutputClassifier(RandomForestClassifier(n_estimators = 5,verbose = 1))
        model.fit(filtered_data,filtered_labels)
        models[letter] = model

100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [30:36<00:00, 70.64s/it]


In [17]:
### second stage ends ###

In [18]:
### The final prediction ###

In [31]:
def final_pred(X_test,y_actual,pred_1,models):
    pred_2 = np.zeros((X_test.shape[0],len(mlabels)))
    ind = 0
    for letter in tqdm(alphabets):
        if letter == 'P' or letter == 'U':
            continue
        reqd_data = np.any(pred_1[:,ind].reshape(-1,1),axis = 1)  #all rows where the pred_1 is 1 for the given letter
        reqd_model = models[letter]
        ind +=1
        if np.sum(reqd_data) == 0:
            continue
        elif isinstance(reqd_model,type([])):
            pred_2[reqd_data,:] +=reqd_model
        else:
            pred_2[reqd_data,:] += reqd_model.predict(X_test[reqd_data,:])
    return pred_2

In [32]:
pred_2 = final_pred(X_test[:100],y_test[:100],pred_1,models)

100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:07<00:00,  3.51it/s]


In [33]:
print(f"Score is {fbeta_score(pred_2,y_test[:100],beta = 2,average = 'micro')}")

Score is 0.7111834961997828


In [34]:
### A few comparisons ###

In [35]:
mlb.inverse_transform(pred_2[:5])

[('H65.23', 'H69.83'),
 ('K57.30', 'Z12.11', 'Z86.010'),
 ('M48.061', 'M51.16', 'M54.50'),
 ('K22.70',),
 ()]

In [36]:
y_labels_test[:5]

[('H65.06', 'H65.23', 'H65.33', 'H69.83'),
 ('D12.2', 'D12.4', 'K57.30', 'K62.1', 'Z12.11', 'Z86.010'),
 ('M25.551', 'M25.561', 'M47.816', 'M48.061', 'M51.16', 'M54.50', 'M79.651'),
 ('K21.00', 'K22.70', 'K29.50', 'K31.89'),
 ('N84.1', 'N93.8', 'N93.9', 'Z30.430')]