In [None]:
# Report the NVIDIA GPU (if any) attached to this runtime before doing any work.
!nvidia-smi

In [None]:
# Mount Google Drive under /content/gdrive; the data files are read from there later.
from google.colab import drive
drive.mount('/content/gdrive')

# Install the Hugging Face transformers package quietly (-q) into the runtime.
!pip install -q transformers

In [None]:
!sudo apt-get install texlive-xetex texlive-fonts-recommended texlive-plain-generic

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

from tqdm.notebook import tqdm

import os
import re
import json
import copy
import collections

In [None]:
from transformers import AutoConfig, AutoTokenizer, TFAutoModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from transformers import TFBertModel, TFBertForSequenceClassification, BertConfig
from transformers import TFAutoModelForSequenceClassification, TFAutoModelForTokenClassification

from transformers import glue_convert_examples_to_features
from transformers import InputExample, InputFeatures
import tensorflow as tf
from sklearn import preprocessing
from torch.utils.data import Dataset
import torch
from keras.preprocessing.sequence import pad_sequences


In [None]:
# Pretrained checkpoint whose tokenizer is loaded below.
MODEL_NAME = 'bert-base-multilingual-uncased'
# Alternative checkpoint, kept for quick switching:
# MODEL_NAME = 'HooshvareLab/bert-fa-base-uncased'

# Tokenizer matching MODEL_NAME; used below only to map tokens to vocabulary ids.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



In [None]:
# Fixed sequence length: token-id and label sequences are padded or truncated to this many positions.
MAX_LEN = 128

In [None]:
def make_dataset(tag_dict, path):
    """Read a two-column ``token tag`` file into per-sentence arrays.

    Each non-blank line holds a token and its tag separated by whitespace;
    blank (or whitespace-only) lines separate sentences. Every sentence is
    wrapped in ``[CLS]`` / ``[SEP]`` tokens, both tagged ``'X'``.

    Args:
        tag_dict: Mapping from tag string to integer id; must contain ``'X'``.
        path: Path of the text file to read (decoded as UTF-8).

    Returns:
        Tuple ``(Sen, Tag, Tag_encoded)`` of object arrays with one entry per
        sentence: the tokens, the tag strings and the integer tag ids.

    Raises:
        ValueError: If a non-blank line has fewer than two fields.
        KeyError: If a tag in the file is missing from ``tag_dict``.
    """
    boundary_id = tag_dict['X']
    sentences, tags, tag_ids = [], [], []

    def start_sentence():
        # Every sentence opens with [CLS], tagged with the boundary tag 'X'.
        return ['[CLS]'], ['X'], [boundary_id]

    def store_sentence(sentence, tag, tag_encoded):
        # Close the sentence with [SEP] (also tagged 'X') and keep a fresh copy.
        sentences.append(sentence + ['[SEP]'])
        tags.append(tag + ['X'])
        tag_ids.append(tag_encoded + [boundary_id])

    sentence, tag, tag_encoded = start_sentence()
    with open(path, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:
                # Sentence boundary. Repeated blank lines must not produce
                # empty "[CLS] [SEP]" sentences, so only store real content.
                if len(sentence) > 1:
                    store_sentence(sentence, tag, tag_encoded)
                    sentence, tag, tag_encoded = start_sentence()
                continue
            if len(fields) < 2:
                raise ValueError(
                    f'{path}:{line_no}: expected "<token> <tag>", got {line!r}')
            sentence.append(fields[0])
            tag.append(fields[1])
            tag_encoded.append(tag_dict[fields[1]])

    # The last sentence is still pending when the file lacks a trailing blank line.
    if len(sentence) > 1:
        store_sentence(sentence, tag, tag_encoded)

    Sen = np.array([np.array(xi, dtype='object') for xi in sentences], dtype='object')
    Tag = np.array([np.array(xi, dtype='object') for xi in tags], dtype='object')
    Tag_encoded = np.array([np.array(xi) for xi in tag_ids], dtype='object')
    return Sen, Tag, Tag_encoded

# Folder on the mounted Google Drive that holds the data files.
gdrive_path = '/content/gdrive/My Drive/ColabNotebooks/'

Test = 'Test.txt'
Train = 'Train.txt'

# Tag string -> integer class id. Several special tokens share an id on purpose
# ('X'/'[PAD]' -> 23, '[CLS]'/'[SEP]' -> 24); 'UNK' maps to -100.
labels_to_ids = {'SPEC': 21, 'DEFAULT': 4, 'OH': 14, 'MQUA': 10, 'OHH': 15, 'P': 16,
            'QUA': 20, 'PS': 19, 'MS': 11, 'IF': 7, 'NP': 13, 'PRO': 18, 'DELM': 5,
            'V': 22, 'DET': 6, 'AR': 2, 'ADV': 1, 'MORP': 9, 'N': 12, 'CON': 3,
            'PP': 17, 'ADJ': 0, 'INT': 8, 'UNK':-100, 'X': 23, '[PAD]': 23, '[CLS]':24, '[SEP]':24}
# Reverse mapping: class id -> tag string. items() yields (label, id) pairs, so
# the pair must be swapped to actually invert the dict. Negative ids are left
# out because they are not valid class indices; ids shared by several tags keep
# the last tag listed above.
ids_to_labels = {idx: label for label, idx in labels_to_ids.items() if idx >= 0}

train_path = gdrive_path + Train
test_path = gdrive_path + Test

x_train, y_train, y_train_encoded = make_dataset(labels_to_ids, train_path)
# The held-out split must come from the test file, not from the training file.
x_test, y_test, y_test_encoded = make_dataset(labels_to_ids, test_path)

print(type(x_train[0]))
print(x_train[1][0])


In [None]:
def input_id(sentence):
    """Turn one tokenised sentence into exactly MAX_LEN vocabulary ids.

    Sentences longer than MAX_LEN are cut off; shorter ones are right-padded
    with the '[PAD]' token. Every token is then looked up as-is through
    tokenizer.convert_tokens_to_ids (no sub-word splitting happens here).
    """
    tokens = [str(tok) for tok in sentence][:MAX_LEN]
    tokens.extend(['[PAD]'] * (MAX_LEN - len(tokens)))
    return [tokenizer.convert_tokens_to_ids(tok) for tok in tokens]

# Encode every sentence of both splits as a fixed-length tensor of vocabulary ids.
input_ids_train = [tf.constant(input_id(sen)) for sen in x_train]
input_ids_test = [tf.constant(input_id(sen)) for sen in x_test]


print(len(input_ids_train))
print(input_ids_train[1])

In [None]:
def get_label_id(labels):
    """Encode a sequence of tag strings as exactly MAX_LEN label ids.

    The '[CLS]' id is placed in front of, and the '[SEP]' id behind, the ids
    looked up in labels_to_ids. The result is then cut to MAX_LEN entries or
    right-padded with 23 up to MAX_LEN, and returned as a plain list.
    """
    ids = [labels_to_ids['[CLS]']]
    ids.extend(labels_to_ids[name] for name in labels)
    ids.append(labels_to_ids['[SEP]'])
    del ids[MAX_LEN:]
    ids.extend([23] * (MAX_LEN - len(ids)))
    return ids

# Rows of y_train / y_test already start and end with the boundary tag 'X'
# (added next to [CLS]/[SEP] by make_dataset), and get_label_id() adds its own
# [CLS]/[SEP] ids. Strip the 'X' sentinels first, otherwise every label ends up
# one position to the right of the token it belongs to.
label_ids_train = [tf.constant(get_label_id(labels[1:-1])) for labels in y_train]
# Single-segment input: every position gets token type 0.
token_ids_train = [tf.constant([0] * MAX_LEN) for _ in y_train]

label_ids_test = [tf.constant(get_label_id(labels[1:-1])) for labels in y_test]
token_ids_test = [tf.constant([0] * MAX_LEN) for _ in y_test]


print(label_ids_train[1])
print(x_train[1])

In [None]:
# Attention mask: 1 where the token id is non-zero, 0 where it is 0
# (presumably the tokenizer's [PAD] id -- confirm against the vocabulary).
tmp_train = np.array(input_ids_train)
attention_masks_train = [
    tf.constant([1 if token_id != 0 else 0 for token_id in row])
    for row in tmp_train
]

tmp_test = np.array(input_ids_test)
attention_masks_test = [
    tf.constant([1 if token_id != 0 else 0 for token_id in row])
    for row in tmp_test
]
print(attention_masks_train[0])

In [None]:
# Each dataset element is a (features, labels) pair; the feature dict holds
# the token ids, attention mask and token type ids of one sentence.
train_features = {
    'input_ids': input_ids_train,
    'attention_mask': attention_masks_train,
    'token_type_ids': token_ids_train,
}
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, label_ids_train))

test_features = {
    'input_ids': input_ids_test,
    'attention_mask': attention_masks_test,
    'token_type_ids': token_ids_test,
}
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, label_ids_test))


In [None]:
# The training set is repeated without end and served in batches of 32, so the
# length of an epoch has to be bounded via steps_per_epoch when fitting.
train_dataset = train_dataset.repeat().batch(32)
# The test set is batched the same way but iterated only once per evaluation.
test_dataset = test_dataset.batch(32)



In [None]:
# Number of size-32 batches needed to cover each split once. np.ceil returns a
# float, while Keras expects integer step counts, so cast explicitly.
train_steps = int(np.ceil(len(input_ids_train) / 32))
test_steps = int(np.ceil(len(input_ids_test) / 32))
print(train_steps)

In [None]:
def build_model(model_name, config, learning_rate=3e-5):
    """Load a pretrained token-classification model and compile it for training.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        config: Model configuration passed to ``from_pretrained``.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The compiled model, using sparse categorical cross-entropy on logits
        as loss and sparse categorical accuracy (named 'accuracy') as metric.
    """
    token_classifier = TFAutoModelForTokenClassification.from_pretrained(
        model_name, config=config)

    token_classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        # from_logits=True: the loss is told that model outputs are raw logits.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    )
    return token_classifier

In [None]:
# Checkpoint to fine-tune (alternative kept for quick switching).
# MODEL_NAME = 'HooshvareLab/bert-fa-base-uncased'
MODEL_NAME = 'bert-base-multilingual-uncased'

# Attach the tag <-> id mappings to the model configuration.
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    label2id=labels_to_ids,
    id2label=ids_to_labels,
)
model = build_model(MODEL_NAME, config, learning_rate=1e-4)



In [None]:
# Train for 5 epochs of train_steps batches each, validating on test_dataset
# after every epoch.
r = model.fit(
    train_dataset,
    steps_per_epoch=train_steps,
    validation_data=test_dataset,
    epochs=5,
    verbose=1)

# 'val_accuracy' holds one value per epoch; the number printed below is the
# mean over all epochs, not the accuracy of the last epoch.
final_accuracy = r.history['val_accuracy']
print('FINAL ACCURACY MEAN: ', np.mean(final_accuracy))


In [None]:
    # Export the notebook stored on Drive to PDF with nbconvert.
    # NOTE(review): this path uses 'MyDrive' while the data path earlier in the
    # notebook uses 'My Drive' -- confirm both resolve on the mounted Drive.
    !jupyter nbconvert --to pdf /content/gdrive/MyDrive/ColabNotebooks/14.ipynb