In [1]:
# Run setup code
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import accuracy_score
from huggingface_hub import hf_hub_download
from tqdm import tqdm

%matplotlib inline

# To guarantee reproducible results
torch.manual_seed(5420)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(5420)

  from .autonotebook import tqdm as notebook_tqdm


Download datasets

In [None]:
import wandb 

In [3]:
torch.cuda.device_count()

1

In [4]:
hf_hub_download(repo_id="iristun/corpora", filename="corpora.tar.gz", repo_type="dataset", local_dir=".")


'corpora.tar.gz'

In [None]:
!tar xvf corpora.tar.gz


x corpora/
x corpora/mnist_data/
x corpora/mnist_data/t10k-images-idx3-ubyte.gz
x corpora/mnist_data/train-images-idx3-ubyte.gz
x corpora/mnist_data/.ipynb_checkpoints/
x corpora/mnist_data/vis_utils.py
x corpora/mnist_data/__init__.py
x corpora/mnist_data/load_mnist.py
x corpora/mnist_data/train-labels-idx1-ubyte.gz
x corpora/mnist_data/t10k-labels-idx1-ubyte.gz
x corpora/BEST/
x corpora/BEST/test/
x corpora/BEST/test/df_best_article_test.csv
x corpora/BEST/test/df_best_encyclopedia_test.csv
x corpora/BEST/test/df_best_novel_test.csv
x corpora/BEST/test/df_best_news_test.csv
x corpora/BEST/train/
x corpora/BEST/train/df_best_encyclopedia_train.csv
x corpora/BEST/train/df_best_article_train.csv
x corpora/BEST/train/df_best_news_train.csv
x corpora/BEST/train/df_best_novel_train.csv
x corpora/BEST/val/
x corpora/BEST/val/df_best_encyclopedia_val.csv
x corpora/BEST/val/df_best_news_val.csv
x corpora/BEST/val/df_best_article_val.csv
x corpora/BEST/val/df_best_novel_val.csv
x corpora/.ipyn

In [12]:
# Create a character map
CHARS = [
  '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
  ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8',
  '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E',
  'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
  'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  'n', 'o', 'other', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y',
  'z', '}', '~', 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช',
  'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท',
  'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ',
  'ล', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ', 'ะ', 'ั', 'า',
  'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', 'เ', 'แ', 'โ', 'ใ', 'ไ',
  'ๅ', 'ๆ', '็', '่', '้', '๊', '๋', '์', 'ํ', '๐', '๑', '๒', '๓',
  '๔', '๕', '๖', '๗', '๘', '๙', '‘', '’', '\ufeff'
]
CHARS_MAP = {v: k for k, v in enumerate(CHARS)}
print(type(CHARS_MAP))
CHARS_MAP

<class 'dict'>


{'\n': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 '#': 4,
 '$': 5,
 '%': 6,
 '&': 7,
 "'": 8,
 '(': 9,
 ')': 10,
 '*': 11,
 '+': 12,
 ',': 13,
 '-': 14,
 '.': 15,
 '/': 16,
 '0': 17,
 '1': 18,
 '2': 19,
 '3': 20,
 '4': 21,
 '5': 22,
 '6': 23,
 '7': 24,
 '8': 25,
 '9': 26,
 ':': 27,
 ';': 28,
 '<': 29,
 '=': 30,
 '>': 31,
 '?': 32,
 '@': 33,
 'A': 34,
 'B': 35,
 'C': 36,
 'D': 37,
 'E': 38,
 'F': 39,
 'G': 40,
 'H': 41,
 'I': 42,
 'J': 43,
 'K': 44,
 'L': 45,
 'M': 46,
 'N': 47,
 'O': 48,
 'P': 49,
 'Q': 50,
 'R': 51,
 'S': 52,
 'T': 53,
 'U': 54,
 'V': 55,
 'W': 56,
 'X': 57,
 'Y': 58,
 'Z': 59,
 '[': 60,
 '\\': 61,
 ']': 62,
 '^': 63,
 '_': 64,
 'a': 65,
 'b': 66,
 'c': 67,
 'd': 68,
 'e': 69,
 'f': 70,
 'g': 71,
 'h': 72,
 'i': 73,
 'j': 74,
 'k': 75,
 'l': 76,
 'm': 77,
 'n': 78,
 'o': 79,
 'other': 80,
 'p': 81,
 'q': 82,
 'r': 83,
 's': 84,
 't': 85,
 'u': 86,
 'v': 87,
 'w': 88,
 'x': 89,
 'y': 90,
 'z': 91,
 '}': 92,
 '~': 93,
 'ก': 94,
 'ข': 95,
 'ฃ': 96,
 'ค': 97,
 'ฅ': 98,
 'ฆ': 99,
 'ง'

In [9]:
def create_n_gram_df(df, n_pad):
  """
  Given an input dataframe, create a feature dataframe of shifted characters
  Input:
  df: timeseries of size (N)
  n_pad: the number of context. For a given character at position [idx],
    character at position [idx-n_pad/2 : idx+n_pad/2] will be used
    as features for that character.

  Output:
  dataframe of size (N * n_pad) which each row contains the character,
    n_pad_2 characters to the left, and n_pad_2 characters to the right
    of that character.
  """
  n_pad_2 = int((n_pad - 1)/2)
  for i in range(n_pad_2):
      df['char-{}'.format(i+1)] = df['char'].shift(i + 1)
      df['char{}'.format(i+1)] = df['char'].shift(-i - 1)
  return df[n_pad_2: -n_pad_2]

In [10]:
def prepare_feature(best_processed_path, option='train'):
  """
  Transform the path to a directory containing processed files
  into a feature matrix and output array
  Input:
  best_processed_path: str, path to a processed version of the BEST dataset
  option: str, 'train' or 'test'
  """
  # we use padding equals 21 here to consider 10 characters to the left
  # and 10 characters to the right as features for the character in the middle
  n_pad = 21
  n_pad_2 = int((n_pad - 1)/2)
  pad = [{'char': ' ', 'target': True}]
  df_pad = pd.DataFrame(pad * n_pad_2)

  df = []
  # article types in BEST corpus
  article_types = ['article', 'encyclopedia', 'news', 'novel']
  for article_type in article_types:
      df.append(pd.read_csv(os.path.join(best_processed_path, option, 'df_best_{}_{}.csv'.format(article_type, option))))

  df = pd.concat(df)
  # pad with empty string feature
  df = pd.concat((df_pad, df, df_pad))

  # map characters to numbers, use 'other' if not in the predefined character set.
  df['char'] = df['char'].map(lambda x: CHARS_MAP.get(x, 80))

  # Use nearby characters as features
  df_with_context = create_n_gram_df(df, n_pad=n_pad)

  char_row = ['char' + str(i + 1) for i in range(n_pad_2)] + \
             ['char-' + str(i + 1) for i in range(n_pad_2)] + ['char']

  # convert pandas dataframe to numpy array to feed to the model
  x_char = df_with_context[char_row].to_numpy()
  y = df_with_context['target'].astype(int).to_numpy()

  return x_char, y

In [13]:
# Path to the preprocessed data
best_processed_path = 'corpora/BEST'

In [14]:
# Load preprocessed BEST corpus
x_train_char, y_train = prepare_feature(best_processed_path, option='train')
x_val_char, y_val = prepare_feature(best_processed_path, option='val')
x_test_char, y_test = prepare_feature(best_processed_path, option='test')

# As a sanity check, we print out the size of the training, val, and test data.
print('Training data shape: ', x_train_char.shape)
print('Training data labels shape: ', y_train.shape)
print('Validation data shape: ', x_val_char.shape)
print('Validation data labels shape: ', y_val.shape)
print('Test data shape: ', x_test_char.shape)
print('Test data labels shape: ', y_test.shape)

Training data shape:  (16461637, 21)
Training data labels shape:  (16461637,)
Validation data shape:  (2035694, 21)
Validation data labels shape:  (2035694,)
Test data shape:  (2271932, 21)
Test data labels shape:  (2271932,)


In [32]:
# Print some entry from the data to make sure it is the same as what you think.
print('First 21 features: ', x_train_char[:2,:21])
print('First 2 class labels', y_train[:2])

First 21 features:  [[112. 140. 114. 148. 130. 142.  94. 142. 128. 128.   1.   1.   1.   1.
    1.   1.   1.   1.   1.   1.  97.]
 [140. 114. 148. 130. 142.  94. 142. 128. 128. 141.  97.   1.   1.   1.
    1.   1.   1.   1.   1.   1. 112.]]
First 2 class labels [1 0]


| char10 | ... | char2 | char1 | **char** | char-1 | char-2 | ... | char-10 |
| ------ | --- | ----- | ----- | -------- | ------ | ------ | --- | ------- |
| 0      | ... | 0     | 0     | **1**    | 2      | 3      | ... | 0       |
| ...    |     |       |       | **2**    | 3      | 4      | ... | 0       |
| ...    |     |       |       | **3**    | 4      | 5      | ... | 0       |
| ...    |     |       |       | **6**    | 7      | 0      | ... | 0       |


In [33]:
#print char of feature 1
char = np.array(CHARS)

#A function for displaying our features in text
def print_features(tfeature,label,index):
    feature = np.array(tfeature[index],dtype=int).reshape(21,1)
    #Convert to string
    char_list = char[feature]
    left = ''.join(reversed(char_list[10:20].reshape(10))).replace(" ", "")
    center = ''.join(char_list[20])
    right =  ''.join(char_list[0:10].reshape(10)).replace(" ", "")
    word = ''.join([left,' ',center,' ',right])
    print(center + ': ' + word + "\tpred = "+str(label[index]))

for ind in range(0,30):
    print_features(x_train_char,y_train,ind)

ค:  ค ณะตุลาการร	pred = 1
ณ: ค ณ ะตุลาการรั	pred = 0
ะ: คณ ะ ตุลาการรัฐ	pred = 0
ต: คณะ ต ุลาการรัฐธ	pred = 0
ุ: คณะต ุ ลาการรัฐธร	pred = 0
ล: คณะตุ ล าการรัฐธรร	pred = 0
า: คณะตุล า การรัฐธรรม	pred = 0
ก: คณะตุลา ก ารรัฐธรรมน	pred = 0
า: คณะตุลาก า รรัฐธรรมนู	pred = 0
ร: คณะตุลากา ร รัฐธรรมนูญ	pred = 0
ร: คณะตุลาการ ร ัฐธรรมนูญก	pred = 0
ั: ณะตุลาการร ั ฐธรรมนูญกั	pred = 0
ฐ: ะตุลาการรั ฐ ธรรมนูญกับ	pred = 0
ธ: ตุลาการรัฐ ธ รรมนูญกับค	pred = 0
ร: ุลาการรัฐธ ร รมนูญกับคว	pred = 0
ร: ลาการรัฐธร ร มนูญกับควา	pred = 0
ม: าการรัฐธรร ม นูญกับความ	pred = 0
น: การรัฐธรรม น ูญกับความเ	pred = 0
ู: ารรัฐธรรมน ู ญกับความเป	pred = 0
ญ: รรัฐธรรมนู ญ กับความเป็	pred = 0
ก: รัฐธรรมนูญ ก ับความเป็น	pred = 1
ั: ัฐธรรมนูญก ั บความเป็นอ	pred = 0
บ: ฐธรรมนูญกั บ ความเป็นอง	pred = 0
ค: ธรรมนูญกับ ค วามเป็นองค	pred = 1
ว: รรมนูญกับค ว ามเป็นองค์	pred = 0
า: รมนูญกับคว า มเป็นองค์ก	pred = 0
ม: มนูญกับควา ม เป็นองค์กร	pred = 0
เ: นูญกับความ เ ป็นองค์กรต	pred = 1
ป: ูญกับความเ ป ็นองค์กรตุ	pred = 0
็: ญกับความ

In [34]:
import torch.nn.functional as F
from torchinfo import summary

class SimpleFeedforwardNN(torch.nn.Module):
  def __init__(self):
    super(SimpleFeedforwardNN, self).__init__()

    self.mlp1 = torch.nn.Linear(21, 100)
    self.mlp2 = torch.nn.Linear(100, 100)
    self.mlp3 = torch.nn.Linear(100, 100)
    self.cls_head = torch.nn.Linear(100, 1)

  def forward(self, x):
    x = F.relu(self.mlp1(x))
    x = F.relu(self.mlp2(x))
    x = F.relu(self.mlp3(x))
    x = self.cls_head(x)
    out = torch.sigmoid(x)
    return out

model = SimpleFeedforwardNN() #Initialize model
model.cuda() #specify the location that it is in the GPU
summary(model, input_size=(1, 21), device='cuda') #summarize the model

Layer (type:depth-idx)                   Output Shape              Param #
SimpleFeedforwardNN                      [1, 1]                    --
├─Linear: 1-1                            [1, 100]                  2,200
├─Linear: 1-2                            [1, 100]                  10,100
├─Linear: 1-3                            [1, 100]                  10,100
├─Linear: 1-4                            [1, 1]                    101
Total params: 22,501
Trainable params: 22,501
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.02
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.09
Estimated Total Size (MB): 0.09

In [36]:
mlp_test = torch.nn.Linear(21, 3).cuda() # a MLP that has 21 input nodes and 3 output nodes
print(x_train_char[:4])
print(x_train_char[:4].shape)
test_input = torch.tensor(x_train_char[:4], dtype = torch.float).cuda()
print(mlp_test(test_input).shape)
print(mlp_test(test_input))

[[112. 140. 114. 148. 130. 142.  94. 142. 128. 128.   1.   1.   1.   1.
    1.   1.   1.   1.   1.   1.  97.]
 [140. 114. 148. 130. 142.  94. 142. 128. 128. 141.  97.   1.   1.   1.
    1.   1.   1.   1.   1.   1. 112.]
 [114. 148. 130. 142.  94. 142. 128. 128. 141. 109. 112.  97.   1.   1.
    1.   1.   1.   1.   1.   1. 140.]
 [148. 130. 142.  94. 142. 128. 128. 141. 109. 117. 140. 112.  97.   1.
    1.   1.   1.   1.   1.   1. 114.]]
(4, 21)
torch.Size([4, 3])
tensor([[-10.7609,  35.0017, -75.6997],
        [-27.3164,  26.4542, -69.2510],
        [ -9.0613,  42.3437, -91.1748],
        [-21.3748,  35.3703, -57.5476]], device='cuda:0',
       grad_fn=<AddmmBackward0>)


In [37]:
class Dataset(torch.utils.data.Dataset):
  'Characterizes a dataset for PyTorch'
  def __init__(self, X, Y, dtype = 'float'):
        'Initialization'
        self.X = X
        self.Y = Y.reshape(-1, 1)
        if(dtype == 'float'):
          self.X = torch.tensor(self.X, dtype = torch.float).cuda()
        elif(dtype == 'long'):
          self.X = torch.tensor(self.X, dtype = torch.long).cuda()
        self.Y = torch.tensor(self.Y, dtype = torch.float).cuda()

  def __len__(self):
        'Denotes the total number of samples'
        return len(self.X)

  def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        x = self.X[index]
        y = self.Y[index, :]
        return x, y

In [38]:
from torch.utils.data import DataLoader
import torch.optim as optim

#hyperparameter initialization
NUM_EPOCHS = 3
criterion = torch.nn.BCELoss(reduction = 'none')
BATCHS_SIZE = 512
optimizer_class = optim.Adam
optimizer_params = {'lr': 5e-4}

config = {
    'architecture': 'simpleff',
    'epochs': NUM_EPOCHS,
    'batch_size': BATCHS_SIZE,
    'optimizer_params': optimizer_params,
}

train_loader = DataLoader( Dataset(x_train_char, y_train, dtype = 'float'), batch_size = BATCHS_SIZE)
val_loader = DataLoader( Dataset(x_val_char, y_val, dtype = 'float'), batch_size = BATCHS_SIZE)
test_loader = DataLoader( Dataset(x_test_char, y_test, dtype = 'float'), batch_size = BATCHS_SIZE)

In [39]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torchmetrics.functional import accuracy

class LightningModel(pl.LightningModule):
    def __init__(
        self,
        model=SimpleFeedforwardNN(),
        criterion=criterion,
        optimizer_class=optim.Adam,
        optimizer_params={'lr': 5e-4}
    ):
        super().__init__()
        self.model = model
        self.criterion = criterion
        self.optimizer_class = optimizer_class
        self.optimizer_params = optimizer_params

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        X_train, Y_train = batch
        Y_pred = self.model(X_train)
        loss = self.criterion(Y_pred, Y_train).mean()
        self.log('train_loss', loss, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        X_val, Y_val = batch
        Y_pred = self.model(X_val)
        loss = self.criterion(Y_pred, Y_val).mean()
        self.log('val_loss', loss, on_step=False, on_epoch=True)

        # Convert probalities to classes.
        val_pred = (Y_pred >= 0.5).float()

        # Calculate accuracy.
        val_acc = accuracy(val_pred, Y_val, task="binary")

        self.log('val_accuracy', val_acc, on_step=False, on_epoch=True)
        return {'val_loss': loss, 'val_accuracy': val_acc}

    def configure_optimizers(self):
        return self.optimizer_class(self.parameters(), **self.optimizer_params)

In [48]:
# Initialize LightningModel.
lightning_model = LightningModel(
  model,
  criterion,
  optimizer_class,
  optimizer_params,
)
# Define checkpoint.
feedforward_nn_checkpoint = ModelCheckpoint(
  monitor="val_accuracy",
  mode="max",
  save_top_k=1,
  dirpath="./checkpoints",
  filename='feedforward_nn'
)
# Initialize Trainer
trainer = pl.Trainer(
  max_epochs=NUM_EPOCHS,
  logger=pl.loggers.WandbLogger(),
  callbacks=[feedforward_nn_checkpoint],
  accelerator="gpu",
  devices=1,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [49]:
import torch

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("ไม่มี GPU ที่รองรับ CUDA")

GPU 0: NVIDIA GeForce RTX 3060 Laptop GPU


In [50]:
# Initialize wandb to log the losses from each step.
wandb.init(
    project='simpleff',
    config=config,
)
# Fit model.
trainer.fit(lightning_model, train_loader, val_loader)

print(f"Best model is saved at {feedforward_nn_checkpoint.best_model_path}")

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▇▇▇▆█▅▅▅▇▄▆▅▄▄▅█▂▃▁▄▅▂▆▂▇▆▆▂▅▂▃▂▁▃▄▂▆▂▄▄
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇████

0,1
epoch,0.0
train_loss_step,0.29346
trainer/global_step,15499.0


d:\mini\envs\pine\Lib\site-packages\pytorch_lightning\loggers\wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                | Params | Mode 
----------------------------------------------------------
0 | model     | SimpleFeedforwardNN | 22.5 K | train
1 | criterion | BCELoss             | 0      | train
----------------------------------------------------------
22.5 K    Trainable params
0         Non-trainable params
22.5 K    Total params
0.090     Total estimated model params size (MB)
6         Modules in train mode
0         Modules in eval mode


                                                                           

d:\mini\envs\pine\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
d:\mini\envs\pine\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0:   6%|▌         | 1775/32152 [00:31<08:55, 56.74it/s, v_num=kkeg]


Detected KeyboardInterrupt, attempting graceful shutdown ...


Epoch 0:  48%|████▊     | 15527/32152 [06:52<07:22, 37.60it/s, v_num=7hxq]


NameError: name 'exit' is not defined

In [51]:
from sklearn.metrics import f1_score,precision_score,recall_score

################################################################################
# A function to evaluate your model. This function must take test dataloader   #
# and the input model to return f-score, precision, and recall of the model.   #
################################################################################
def evaluate(test_loader, model):
  """
  Evaluate model on the splitted 10 percent testing set.
  """
  model.eval()
  with torch.no_grad():
    test_loss = []
    test_pred = []
    test_true = []
    for X_test, Y_test in tqdm(test_loader):
      Y_pred = model(X_test)
      loss = criterion(Y_pred, Y_test)
      test_loss.append(loss)
      test_pred.append(Y_pred)
      test_true.append(Y_test)

    avg_test_loss = torch.cat(test_loss, axis = 0).mean().item()
    test_pred = torch.cat(test_pred, axis = 0).cpu().detach().numpy()
    test_true = torch.cat(test_true, axis = 0).cpu().detach().numpy()

    prob_to_class = lambda p: 1 if p[0]>=0.5 else 0
    test_pred = np.apply_along_axis(prob_to_class,1,test_pred)

    acc = accuracy_score(test_true, test_pred)
    f1score = f1_score(test_true, test_pred)
    precision = precision_score(test_true, test_pred)
    recall = recall_score(test_true, test_pred)

  return {
    "accuracy": acc,
    "f1_score": f1score,
    "precision": precision,
    "recall": recall
  }

In [52]:
# Load best model and evaluate it.
best_model_path = feedforward_nn_checkpoint.best_model_path
# best_model_path = ... # Insert if you have already trained this model.
best_model = LightningModel.load_from_checkpoint(best_model_path, model=SimpleFeedforwardNN())
result = evaluate(test_loader, best_model)

wandb.finish()
print(result)

PermissionError: [Errno 13] Permission denied: 'd:/NLP_learn/NLP_learn'