In [1]:
"""import libaries"""
from sklearn.datasets import fetch_20newsgroups
from sklearn import metrics
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
import random
import numpy as np
import os

In [2]:
%%capture
!pip install transformers

In [3]:
"""deep learning libaries"""
import torch
from torch import cuda
from transformers import LongformerModel,LongformerTokenizer,LongformerForSequenceClassification,TrainingArguments,Trainer,EarlyStoppingCallback
from torch.utils.data import Dataset,DataLoader

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1) Import Data

In [5]:
"""Read Data"""
data_train = fetch_20newsgroups(subset='train', random_state=21)
data_test = fetch_20newsgroups(subset='test', random_state=21)

In [6]:
"""Clean Data"""
def clean(data):
    tmp_doc = []
    for words in data.split():
        if ':' in words or '@' in words or len(words) > 60:
            pass
        else:
            c = re.sub(r'[>|-]', '', words)
            # c = words.replace('>', '').replace('-', '')
            if len(c) > 0:
                tmp_doc.append(c)
    tmp_doc = ' '.join(tmp_doc)
    tmp_doc = re.sub(r'\([A-Za-z \.]*[A-Z][A-Za-z \.]*\) ', '', tmp_doc)
    return tmp_doc

In [7]:
def get_clean_data(raw_dataset):

    clean_dataset = {}
    doc_list = []
    label_id_list = []
    label_name_list = []
    
    for i in tqdm(range(len(raw_dataset.data))):
        
        doc, label = clean(raw_dataset.data[i]), raw_dataset.target[i]
        label_name = raw_dataset.target_names[label]
        doc_list.append(doc)
        label_id_list.append(label)
        label_name_list.append(label_name)
    
    clean_dataset['data'] = doc_list
    clean_dataset['label_id'] = label_id_list
    clean_dataset['label_name'] = label_name_list
    
    return clean_dataset

In [8]:
train_dataset = get_clean_data(data_train)

100%|██████████| 11314/11314 [00:09<00:00, 1246.82it/s]


In [9]:
test_dataset = get_clean_data(data_test)

100%|██████████| 7532/7532 [00:07<00:00, 977.21it/s] 


In [10]:
num_labels = len(set(data_train['target'])) #20

In [11]:
test_dataset['data'][0]

'mathew Alt.Atheism Atheist Resources Books, addresses, music anything related to atheism FAQ, atheism, books, music, fiction, addresses, contacts Thu, 27 May 1993 GMT world Mantis Consultants, Cambridge. UK. 303 atheism/resources resources 5 April 1993 1.1 Atheist Resources Addresses of Atheist Organizations USA FREEDOM FROM RELIGION FOUNDATION Darwin fish bumper stickers and assorted other atheist paraphernalia are available from the Freedom From Religion Foundation in the US. Write FFRF, P.O. Box 750, Madison, WI 53701. (608) 2568900 EVOLUTION DESIGNS Evolution Designs sell the "Darwin fish". It\'s a fish symbol, like the ones Christians stick on their cars, but with feet and the word "Darwin" written inside. The deluxe moulded 3D plastic fish is $4.95 postpaid in the US. Write Evolution Designs, 7119 Laurel Canyon #4, North Hollywood, CA 91605. People in the San Francisco Bay area can get Darwin Fish from Lynn Gold try mailing For net people who go to Lynn directly, the price is $4

In [12]:
# !pip install transformers

# 2) Configuration

In [63]:
class config:
    device = "cuda" if cuda.is_available() else "cpu"
    model_type = "LongformerModel"
    model_loader = "LongformerForSequenceClassification"
    token_type = "LongformerTokenizer"
    pretrain_model_path = 'allenai/longformer-base-4096'
    gradient_checkpointing=True
    max_length = 1024
    batch_size = 24
    model_save_path = "/content/drive/My Drive/Colab Notebooks/Models/Longformer20News" # data folder
    learning_rate = 2e-5
    evaluation_strategy = "steps"
    eval_steps = 30
    save_steps = 30
    num_train_epochs = 3
    num_labels = num_labels

In [14]:
os.listdir(config.model_save_path)

['runs']

# 3) Testing

In [15]:
# model = eval(config.model_type).from_pretrained(config.pretrain_model_path, config.gradient_checkpointing).to(config.device)
model = LongformerForSequenceClassification.from_pretrained(
    config.pretrain_model_path,gradient_checkpointing=True,
    use_cache=False,
    num_labels = config.num_labels).to(config.device)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weigh

In [16]:
model_parameters_list = [name for name, w in model.named_parameters()]

In [17]:
classfication_layers = [name for name in model_parameters_list if "classifier" in name]
attention_layer = [name for name in model_parameters_list if "embeddings" in name]
first_layer = [name for name in model_parameters_list if "layer.0" in name or "layer.1" in name]
last_layer = [name for name in model_parameters_list if "layer.11" in name or "layer.10" in name]

In [18]:
trainable_parameters = classfication_layers + attention_layer + first_layer + last_layer

In [19]:
# freeze parameter
# due to the limited GPU memory
for name,param in model.named_parameters():
    if name not in trainable_parameters:
        param.requires_grad = False

In [20]:
tokenizer = eval(config.token_type).from_pretrained(config.pretrain_model_path)

In [21]:
""""test input"""

'"test input'

In [22]:
test_text = test_dataset['data'][0]

In [23]:
test_text

'mathew Alt.Atheism Atheist Resources Books, addresses, music anything related to atheism FAQ, atheism, books, music, fiction, addresses, contacts Thu, 27 May 1993 GMT world Mantis Consultants, Cambridge. UK. 303 atheism/resources resources 5 April 1993 1.1 Atheist Resources Addresses of Atheist Organizations USA FREEDOM FROM RELIGION FOUNDATION Darwin fish bumper stickers and assorted other atheist paraphernalia are available from the Freedom From Religion Foundation in the US. Write FFRF, P.O. Box 750, Madison, WI 53701. (608) 2568900 EVOLUTION DESIGNS Evolution Designs sell the "Darwin fish". It\'s a fish symbol, like the ones Christians stick on their cars, but with feet and the word "Darwin" written inside. The deluxe moulded 3D plastic fish is $4.95 postpaid in the US. Write Evolution Designs, 7119 Laurel Canyon #4, North Hollywood, CA 91605. People in the San Francisco Bay area can get Darwin Fish from Lynn Gold try mailing For net people who go to Lynn directly, the price is $4

In [24]:
def get_token(input_text):
    inputs = tokenizer(
            text = input_text,
            max_length = 1000,
            padding = "max_length",
            return_token_type_ids = False,
            return_tensors=  'pt',
            truncation=True
            )
    return inputs

In [25]:
inputs = get_token(test_text)

In [26]:
inputs['input_ids'].shape


torch.Size([1, 1000])

In [27]:
inputs = {k:torch.tensor(v).to(config.device) for k,v in inputs.items()}

  """Entry point for launching an IPython kernel.


In [28]:
with torch.no_grad():
    test_ouput = model(**inputs)

In [29]:
inputs['input_ids'].size()

torch.Size([1, 1000])

In [30]:
test_ouput

LongformerSequenceClassifierOutput([('logits',
                                     tensor([[ 0.1299, -0.0146, -0.0763, -0.0037, -0.0331, -0.1145, -0.0883, -0.1619,
                                              -0.0425,  0.0743, -0.2141,  0.1970,  0.1126, -0.0511, -0.0117,  0.0482,
                                              -0.0111,  0.0414,  0.0881, -0.1324]], device='cuda:0'))])

In [31]:
# test_ouput.pooler_output

In [32]:
# model

In [33]:

# break

In [34]:
"""Get The Max Length"""

'Get The Max Length'

In [35]:
# total_doc = train_dataset['data'] + test_dataset['data']
# ratio = 1
# sample_size = int(len(total_doc) * ratio)

In [36]:
# sample_doc = random.sample(total_doc,sample_size)  

In [37]:
# doc_len_list = []
# for doc in tqdm(sample_doc):
#     temp = get_token(doc)
#     doc_len_list.append(temp['input_ids'].shape[1])

In [38]:
# plt.hist(doc_len_list)

In [39]:
# doc_len_more_5000_list = [doc_len for doc_len in doc_len_list if doc_len > 5000]

In [40]:
# doc_len_less_5000_list = [doc_len for doc_len in doc_len_list if doc_len < 5000]

In [41]:
# plt.hist(doc_len_more_5000_list)


In [42]:
# plt.hist(doc_len_less_5000_list)

In [43]:

# doc_len_1000_5000_list = [doc_len for doc_len in doc_len_list if doc_len < 5000 and doc_len > 1000 ]

In [44]:
# plt.hist(doc_len_1000_5000_list)

In [45]:
# np.percentile(doc_len_list,99.5)

In [46]:

# np.percentile(doc_len_list,95)

# 4) Tokenization Features Engineering


 - Create Dataset Object

In [47]:
config.max_length

1024

In [48]:
class LongTextDataset(Dataset):
    def __init__(self,doc_list,label_list,tokenizer,config,):

        self.tokenizer = tokenizer
        self.max_length = config.max_length
        self.device = config.device
        self.doc_list = doc_list
        self.len = len(doc_list)
        self.label_list = None
        if label_list is not None:
            self.label_list = label_list

    def __len__(self):
        return self.len
    
    def __getitem__(self,index):

        doc = self.doc_list[index]
        if self.label_list is not None:
            label = self.label_list[index]

        inputs = self.tokenizer(
            text = doc,
            add_special_tokens = True,
            max_length = self.max_length,
            padding = "max_length",
            return_token_type_ids = False,
            truncation=True
            # return_tensors = 'pt',
            )
        
        if self.label_list is not None:
            inputs= {
                'input_ids':torch.tensor(inputs['input_ids']),
                'attention_mask':torch.tensor(inputs['attention_mask']),
                'labels':torch.tensor(label),
            }
        else:
            inputs= {
                'input_ids':torch.tensor(inputs['input_ids']),
                'attention_mask':torch.tensor(inputs['attention_mask']),
            }

        inputs = {k:v.to("cpu") for (k,v) in inputs.items()}

        return inputs

In [49]:

test_dataset.keys()

dict_keys(['data', 'label_id', 'label_name'])

In [53]:

size = 0.2

In [55]:
train_size = int(len(train_dataset['data'] )* size)

In [56]:
test_size = int(len(test_dataset['data'] )* size)


In [50]:
# test_dataset['label'][:10]

In [57]:
train_tokenized_dataset = LongTextDataset(train_dataset['data'][:train_size],train_dataset['label_id'][:train_size],tokenizer,config)
test_tokenzed_dataset = LongTextDataset(test_dataset['data'][:test_size],test_dataset['label_id'][:test_size],tokenizer,config)

In [58]:
# train_tokenized_dataset[0:12]

In [59]:
train_tokenized_dataset[0:12]['input_ids'].shape

torch.Size([12, 1024])

# 5) Fine Tune LongFormer for long-text classification

In [64]:
len(train_tokenized_dataset)


2262

In [65]:
steps = len(train_tokenized_dataset)//config.batch_size

In [66]:
steps

94

In [75]:
def compute_metrics(inputs):
    pred,labels = inputs
    pred = np.argmax(pred,axis = 1)
    accuracy = metrics.accuracy_score(labels,pred)
    # recall = metrics.recall_score(labels,pred)
    # precision = metrics.precision_score(labels,pred)
    # f1 = metrics.f1_score(labels,pred)
    # return {"accuracy":accuracy, "recall":recall, "precision":precision, "recall":recall, f1:"f1"}
    return {"accuracy":accuracy}

In [76]:
config.eval_steps

30

In [77]:
args = TrainingArguments(
    output_dir=config.model_save_path,
    overwrite_output_dir = True,
    evaluation_strategy = "steps",
    eval_steps = config.eval_steps,
    save_steps = config.save_steps,
    logging_steps = config.eval_steps,
    num_train_epochs = config.num_train_epochs,
    do_train = True,
    do_eval = True,
    learning_rate = config.learning_rate,
    per_device_train_batch_size = config.batch_size,
    per_device_eval_batch_size = config.batch_size,
    seed = 0,
    load_best_model_at_end = True
)



PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [78]:

train_tokenized_dataset[:10]['input_ids'].shape

torch.Size([10, 1024])

In [79]:
trainer  = Trainer(
    model = model,
    args = args,
    train_dataset =train_tokenized_dataset,
    eval_dataset = test_tokenzed_dataset,
    compute_metrics = compute_metrics,
    callbacks = [EarlyStoppingCallback] 
)


In [None]:
trainer.train()

***** Running training *****
  Num examples = 2262
  Num Epochs = 3
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 285
Initializing global attention on CLS token...


Step,Training Loss,Validation Loss,Accuracy
30,2.9352,2.938626,0.086985
60,2.8917,2.783743,0.15405
90,2.6387,2.366567,0.444887
120,2.2319,1.953581,0.534529
150,1.9514,1.650503,0.631474
180,1.741,1.480383,0.616866


Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on C

In [None]:
!nvidia-smi