## Fintech Final Project
## Category: multi-label classification 
Author: Nana

Date: Created at 2022.4.14


In [None]:
from google.colab import drive
drive.mount('/content/drive')
maindir = '/content/drive/MyDrive/FinTech-final-project'
datadir = f'{maindir}/data'
spmdir = f'{maindir}/spm'
modeldir = f'{maindir}/models'
cat_df_path = f'{maindir}/東吳課程_發票資料集/品類資料集/cat_train_v2.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Task: Classifying fasttext embeddings
2 linear layers 

In [None]:
import joblib, sys

In [None]:
# required for skmultilearn (can skip if not using the package)
# # follow the instructions on https://github.com/thunlp/OpenNE

## Preparing Data (X, y) 

http://scikit.ml/multilabeldnn.html#Multi-label-deep-learning-with-scikit-multilearn



In [None]:
fmodelpath = f'{modeldir}/fasttext.model'
ypath = f'{datadir}/category/category_labels_v2.pkl'

In [None]:
import joblib 
from collections import defaultdict
LabelsList = joblib.load(ypath)

In [None]:
CatsDict = defaultdict(list)
for x in LabelsList:
  # print(x.keys())
  for cat in x['product']:
    # print(x['name'])
    CatsDict[cat].append(x['name'])

In [None]:
import random
print('# of categories:', len(CatsDict.keys()))
for k, v in CatsDict.items():
  print(f"Category: {k}, Length: {len(v)}, \nExamples:{random.sample(v, min(10, len(v)))}")
  print('---------------')
  break

# of categories: 217
Category: 人工淚液, Length: 121, 
Examples:['750390@信東舒眼人工淚液0.5ml*20支', '視舒坦人工淚液點眼液 10ML', '(M/R)麗眼舒單支裝點眼液0', 'Rohto乾眼淚液', '(R)視舒坦單支裝人工', '博士倫舒視能單支裝舒潤液 30支入【康是美】', 'Sato 視樂眼液15', '(R)愛爾康淚然人工淚', '※(OTC)視舒坦單支裝人工淚液Systane ultra', '視舒坦人工淚液點眼液0.5m']
---------------


In [None]:
import json
with open(f"{datadir}/category/category_dictionary.json", "w") as outfile:
    json.dump(CatsDict, outfile, ensure_ascii = False, indent = 4)

In [None]:
# average the token vectors to form X's feature vector
# use the binary tuple as label
from gensim.models import FastText
# should save keyedvectors only if there's no further training 
model = FastText.load(fmodelpath) 

In [None]:
## segmented, should have saved the segmented results for this
# !pip install -q -U ckip-transformers
# from ckip_transformers.nlp import CkipWordSegmenter
# ws_driver = CkipWordSegmenter(level=3, device = 0)
# name_texts = [x['name'] for x in LabelsList]
# ws = ws_driver(name_texts) !nvidia-smi # k80要20分鐘

In [None]:
import numpy as np
# parameters
emb_dim = 200 # (see notebook 12.0)
Xlen = len(LabelsList)
nclass = len(LabelsList[0]['labels'])

# make X(Emb matrix), y (label matrix)
Emb = np.zeros((Xlen, emb_dim))
Labels = np.zeros((Xlen, nclass))

def get_avg_embeddings(tokenlist):
  # if no token has embedding available, yields np.zeros(emb_dim)
  return np.mean([model.wv[tok] if tok in model.wv else np.zeros(emb_dim) for tok in tokenlist], axis = 0)

def get_weighted_embeddings(tokenlist):
  # ref: https://stackoverflow.com/questions/29330792/weighted-averaging-a-list
  rate = np.ones(len(tokenlist))
  mid = int(len(rate)/2)
  # category name通常出現在產品名稱後半部
  rate = np.append(rate[:mid], (rate[mid:]+1))
  # print(rate)
  X = [model.wv[tok] if tok in model.wv else np.zeros(emb_dim) for tok in tokenlist]
  return np.average(X, 
                    axis = 0,
                    weights=rate)

In [None]:
for i in range(Xlen):
    Emb[i,:] = get_weighted_embeddings(LabelsList[i]['seg_name'])
    Labels[i,:] = LabelsList[i]['labels']

In [None]:
get_weighted_embeddings(['黑人','牙膏'])

array([-0.32247051, -0.32322059, -0.70355904,  2.07394052,  2.14516916,
       -1.69316   ,  2.09346946, -0.72159924, -1.41702869,  1.0934815 ,
        1.49317399,  0.0466401 ,  1.01135514, -0.81285715, -0.69938417,
        0.3276531 , -0.16764755, -0.01113643, -0.16000227, -0.47312679,
       -1.03419837, -0.51314718,  0.48273041,  1.22986977, -1.94230157,
       -1.54858735,  2.95696082,  0.08220889, -0.93612036, -2.0776791 ,
       -0.28581977,  0.11455089, -0.23082522,  0.97293643,  0.28760044,
        0.96373625, -1.6524392 ,  3.69620971,  0.22831683, -1.61783663,
       -0.65232239,  0.2443138 , -0.66612804, -1.91881184, -0.50354154,
       -0.30937341,  0.36371383,  1.54410291,  4.29339623,  2.54499979,
        0.41443558,  1.246618  , -0.53838086,  1.87320554, -0.69987931,
       -2.23694323, -0.31305675,  1.00335614, -0.24181277,  0.20122028,
       -1.17844717, -1.39400069, -1.93530732, -1.77534568,  2.95400822,
        1.97300873,  2.90354705, -1.89682297, -1.6842713 ,  3.22

In [None]:
print(model.wv['黑人'])
print(model.wv['牙膏'])

[ 1.5489779  -0.39174908 -0.330629    0.6661055   0.6256692  -0.1446274
  0.20171165  0.5533052  -0.95217854  1.2505952  -0.28921488  0.64008945
  0.85936373 -0.48164427 -0.12731491  0.5615754   0.30188504 -0.07759612
  0.35072374 -0.33762568 -0.27788594 -0.85254127 -0.21423523 -0.05245302
  0.0160771   0.05456965  0.5865492  -0.50909364 -0.28784016 -0.8009508
  0.5128217  -0.5685834  -0.45796877  0.04401051  0.36486688  0.49517354
 -1.2559731   1.8368772   0.58585787 -0.6956234  -0.76854163 -0.25020218
 -0.69865394 -1.5215093  -0.1395044  -0.22810875  0.85111356  0.77385926
  1.6878211   0.43676957  1.0313563   0.16242279  0.454149    1.1793333
 -0.5056054  -0.4782843  -0.24404621 -0.23599458  0.07155167 -0.0897591
 -0.78272724  0.09441853 -0.18591255 -0.78110373  0.2967342   0.06642801
  1.6636679  -0.9008466   0.4922372   2.0589561   0.81139314 -1.0007845
 -0.38927853 -0.26854435 -0.4600889  -0.32442346  1.5665947  -0.8503532
 -1.2809393  -0.42920536 -0.26822165  0.62158597 -0.49669

## Training
stack a simple few-layer neural network 

In [None]:
!pip install -q -U torch

In [None]:
# seeding; normally not working in colab
import torch
from torch import nn
import torch.nn.functional as F
def seeding(myseed):
  torch.manual_seed(myseed)
  torch.cuda.manual_seed(myseed)
  torch.cuda.manual_seed_all(myseed)
  np.random.seed(myseed)
  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True

In [None]:
# hyperparameters
MYSEED = 42
TESTRATIO = 0.2 
DROPOUT = 0.5
BATCHSIZE = 200 
## Category數：nclass (217) fixed value
seeding(MYSEED)

In [None]:
from sklearn.model_selection import train_test_split
# 一些分工：做k-fold、
X_train, X_test, y_train, y_test = train_test_split(Emb, Labels, 
                                                    test_size=TESTRATIO, 
                                                    random_state= MYSEED)
input_dim = X_train.shape[1]
hd1, hd2 = 128, 256 
output_dim = nclass 
# originally len(np.unique(y_train.rows)) as http://scikit.ml/multilabeldnn.html#Multi-label-deep-learning-with-scikit-multilearn 
# states, but the logic is (if I understand it correctly) very weird

In [None]:
from torch.utils.data import Dataset

class CatDataset(Dataset):
    def __init__(self, X, y=None):
        self.data = torch.from_numpy(X).float()
        if y is not None:
            y = y.astype(np.int)
            self.label = torch.from_numpy(y).float()
        else:
            self.label = None
    def __getitem__(self, idx):
        if self.label is not None:
            return self.data[idx], self.label[idx]
        else:
            return self.data[idx]
    def __len__(self):
        return len(self.data)


In [None]:
from torch.utils.data import DataLoader
batch_size = BATCHSIZE
train_set = CatDataset(X_train, y_train)
val_set = CatDataset(X_test, y_test)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

## Architecture(s): Simple Network (linear layers)


In [None]:
class MultiLabelClassifier(nn.Module):
    def __init__(
            self,
            input_dim=input_dim,
            hd1 = hd1,
            hd2 = hd2,
            output_dim=output_dim,
            dropout=DROPOUT,
    ):
        super(MultiLabelClassifier, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # 只有兩層fc layers (shallow)，主要是看fasttext embedding的效果
        self.hidden1 = nn.Linear(input_dim, hd1)
        self.hidden2 = nn.Linear(hd1, hd2)
        self.output = nn.Linear(hd2, output_dim)

    def forward(self, X, **kwargs):
        X = self.hidden1(X)
        X = self.hidden2(X)
        X = F.relu(X)
        X = self.dropout(X)
        # 過softmax拿機率值
        X = F.softmax(self.output(X), dim=-1)
        return X

References: 
https://keras.io/examples/nlp/pretrained_word_embeddings/
https://towardsdatascience.com/

lstm-text-classification-using-pytorch-2c6c657f8fc0

My machine learning assignments

In [None]:
# from skmultilearn.problem_transform import LabelPowerset
# training parameters
THRESHOLD = 0.3
EPOCHS = 50
device = 'cuda' if torch.cuda.is_available() else 'cpu'
learning_rate = 5e-4       # learning rate
net = MultiLabelClassifier().to(device)
criterion = nn.BCELoss() 
weight_decay = 0.1
optimizer = torch.optim.AdamW(net.parameters(), 
                              lr=learning_rate, 
                              weight_decay=weight_decay)

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import time, datetime
from datetime import date, datetime
now = datetime.now()
model_prefix = now.strftime("%m%d-%H%M")
MODELPATH = f'{maindir}/multi_label/{model_prefix}.ckpt'
MODELPATH

'/content/drive/MyDrive/FinTech-final-project/multi_label/0427-1353.ckpt'

In [None]:
METHOD =( 'macro', 'weighted')

In [None]:
# Training Loop
from sklearn.metrics import f1_score
best_val_acc = 0.0
best_acc = 0.0

for epoch in range(EPOCHS):
    train_acc, val_acc, val_weighted_acc = 0.0, 0.0, 0.0
    train_loss, val_loss = 0.0,0.0
    # training
    net.train() # set the model to training mode
    for i, data in enumerate(train_loader):
        inputs, labels = data
        # print('l:', labels.shape)
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad() 
        outputs = net(inputs) 
        # print('o:', outputs.shape, 'l:', labels.shape)
        batch_loss = criterion(outputs, labels)
        
        # THRESHOLD: all labels with probabilities higher than it are considered predicted labels 
        # and others are skipped. We are using a threshold value of 0.5.
        pred = np.array(outputs.cpu().detach().numpy() > THRESHOLD, dtype=float)
        labels = labels.cpu().detach().numpy()
        # train_acc += getbatchf1(labels, pred)
        # print(f1_score(labels, pred, average='macro'))
        # compute macro_f1
        # print(labels.shape, pred.shape)
        # print(f1_score(labels, pred, average='macro'))
        batch_loss.backward() 
        optimizer.step() 
        train_loss += batch_loss.item()

    # validation
    if len(val_set) > 0:
        net.eval() # set the model to evaluation mode
        val_preds, val_labels = [], []
        with torch.no_grad():
            for i, data in enumerate(val_loader):
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)
                outputs =net(inputs)
                batch_loss = criterion(outputs, labels) 
                val_pred = np.array(outputs.cpu().detach().numpy() > THRESHOLD, dtype=float)
                labels = labels.cpu().detach().numpy()
                val_loss += batch_loss.item()
                val_preds.append(val_pred)
                val_labels.append(labels)
            val_labels, val_preds = np.vstack(val_labels), np.vstack(val_preds)
            val_acc = f1_score(val_labels, val_preds, average = METHOD[0])
            val_weighted_acc = f1_score(val_labels, val_preds, average = METHOD[1])
            
            if val_acc > best_val_acc:
              torch.save(net.state_dict(), MODELPATH)
              best_val_acc = val_acc
            print('[{:03d}/{:03d}] Loss: {:3.4f} || VLoss: {:3.4f} | Val macro f1: {:3.4f}, weighted f1: {:3.4f}.'.format(
                epoch + 1, EPOCHS, 
                train_loss/len(train_loader), 
                val_loss/len(val_loader),
                val_acc, 
                val_weighted_acc,
            ))

[001/050] Loss: 0.0106 || VLoss: 0.0087 | Val macro f1: 0.3161, weighted f1: 0.6127.
[002/050] Loss: 0.0092 || VLoss: 0.0079 | Val macro f1: 0.3739, weighted f1: 0.6580.
[003/050] Loss: 0.0085 || VLoss: 0.0074 | Val macro f1: 0.4079, weighted f1: 0.6882.
[004/050] Loss: 0.0080 || VLoss: 0.0070 | Val macro f1: 0.4304, weighted f1: 0.7051.
[005/050] Loss: 0.0077 || VLoss: 0.0067 | Val macro f1: 0.4520, weighted f1: 0.7181.
[006/050] Loss: 0.0075 || VLoss: 0.0066 | Val macro f1: 0.4651, weighted f1: 0.7276.
[007/050] Loss: 0.0072 || VLoss: 0.0064 | Val macro f1: 0.4792, weighted f1: 0.7370.
[008/050] Loss: 0.0070 || VLoss: 0.0062 | Val macro f1: 0.4867, weighted f1: 0.7419.
[009/050] Loss: 0.0069 || VLoss: 0.0061 | Val macro f1: 0.4969, weighted f1: 0.7493.
[010/050] Loss: 0.0068 || VLoss: 0.0060 | Val macro f1: 0.5043, weighted f1: 0.7542.
[011/050] Loss: 0.0066 || VLoss: 0.0059 | Val macro f1: 0.5061, weighted f1: 0.7569.
[012/050] Loss: 0.0066 || VLoss: 0.0058 | Val macro f1: 0.5127, w

## **對這結果的檢視與說明：**
https://www.notion.so/4-27-macro-f1-on-multi-label-5c9e0534efe94cc8a526246f66f213e5

In [None]:
from sklearn.metrics import f1_score, classification_report
# multilabel classification
y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1], [1,0,0]]
y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0], [1,0,1]]

sc = 0
for a, b in zip(y_true, y_pred):
  sc+=f1_score(a, b, average = 'macro')
sc/=len(y_true)

In [None]:
y_true, y_pred = np.array(y_true), np.array(y_pred)
f1_score(y_true, y_pred, average='macro') 

0.7666666666666666

In [None]:
net.eval()
val_preds, val_labels = [], []
for i, data in enumerate(val_loader):
      inputs, labels = data
      inputs, labels = inputs.to(device), labels.to(device)
      outputs =net(inputs)
      val_pred = np.array(outputs.cpu().detach().numpy() > THRESHOLD, dtype=float)
      labels = labels.cpu().detach().numpy()
      val_preds.append(val_pred)
      val_labels.append(labels)

In [None]:
val_preds = np.vstack(val_preds)
val_labels = np.vstack(val_labels)
val_labels.shape

(18462, 217)

In [None]:
val_acc = f1_score(val_labels, val_preds, average = METHOD[0])
val_weighted_acc = f1_score(val_labels, val_preds, average = METHOD[1])
print(f'macro f1: {val_acc:.4f}')
print(f'weighted f1: {val_weighted_acc:.4f}')

macro f1: 0.6070
weighted f1: 0.8159


In [None]:
cat2idx = joblib.load(f'{datadir}/category/cat2idx.pkl')
labels = sorted(cat2idx.items(), key = lambda x:x[1])
print(labels[:5])
keys = [x[0] for x in labels]
keys[:5]

[('人工淚液', 0), ('中式香腸', 1), ('化妝水', 2), ('成人牙膏', 3), ('水路/健行鞋', 4)]


['人工淚液', '中式香腸', '化妝水', '成人牙膏', '水路/健行鞋']

In [None]:
print(classification_report(val_labels, val_preds, target_names= keys))

              precision    recall  f1-score   support

        人工淚液       0.86      0.46      0.60        13
        中式香腸       0.72      0.91      0.80        53
         化妝水       0.75      0.84      0.79       181
        成人牙膏       0.97      0.97      0.97       381
      水路/健行鞋       0.00      0.00      0.00         1
         火鍋料       0.73      0.58      0.65        62
          奶瓶       0.90      0.97      0.94        38
        巧拼地墊       0.00      0.00      0.00         3
        平板電腦       0.65      0.61      0.63        28
       筆記型電腦       0.98      0.81      0.89        67
       智慧型手機       0.78      0.95      0.85       164
    瓦斯爐(廚房用)       0.17      0.22      0.19         9
    瓦斯爐(攜帶式)       0.75      0.32      0.44        19
       甲片/甲貼       1.00      0.83      0.91        52
         甲油膠       0.50      0.17      0.25         6
          冰箱       0.79      0.83      0.81        41
        安全汽座       1.00      0.30      0.46        10
        成人牙刷       0.91    

In [None]:
len(keys)

217

## Checking example reference: 
http://scikit.ml/multilabeldnn.html#Multi-label-deep-learning-with-scikit-multilearn
(which doesn't seem to have the logic our task requires)

In [None]:
!pip install arff
from skmultilearn.dataset import load_dataset
X_train, y_train, feature_names, label_names = load_dataset('emotions', 'train')
X_test, y_test, _, _ = load_dataset('emotions', 'test')