In [None]:
#!pip uninstall tensorflow -y
#!pip  install transformers==4.22.1 -q
#!pip install evaluate

In [1]:
import transformers
from transformers import GPT2Tokenizer, DataCollatorWithPadding, TrainingArguments, Trainer, \
                         GPT2ForSequenceClassification, set_seed, pipeline,GPT2Config
import evaluate
import pandas as pd
import numpy as np

In [2]:
import load_data
from load_data import Data
import train_datasets
from train_datasets import Train_dataset

In [3]:
def load_dataset(source=None):
    """Run the data-preparation steps of a ``Data`` object, in order.

    Args:
        source: Object exposing the six preparation methods called below.
            When omitted, the notebook-level ``data`` global is used, so the
            existing zero-argument call keeps working.

    Returns:
        None. The steps are invoked for their effect on the object itself.
    """
    # Fall back to the global only when no object is passed in explicitly.
    target = data if source is None else source

    # The calls are kept in the original order.
    target.handle_file()
    target.convert_json_to_dataframe()
    target.get_next_value()
    target.compare_values()
    target.label_sentences()
    target.initial_df()

In [4]:
def preprocess_function(dataset):
    """Tokenize the ``"text"`` entry of ``dataset`` with the notebook-level ``tokenizer``.

    The tokenizer is called with ``truncation=True`` and ``max_length=400``;
    whatever it returns is passed straight back to the caller.
    """
    texts = dataset["text"]
    tokenizer_options = {"truncation": True, "max_length": 400}
    return tokenizer(texts, **tokenizer_options)

In [5]:
# Accuracy metric, loaded once via `evaluate.load`; compute_metrics reads it as a global.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score predictions with the notebook-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        ``logits`` (taken over the last axis) against ``labels``.
    """
    raw_scores, gold_labels = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_ids)

In [6]:
def training_model(seed=42):
    """Build the GPT-2 sequence classifier and two Trainers that share it.

    Reads the notebook-level globals ``tokenizer``, ``dataset``,
    ``data_collator`` and ``compute_metrics``; they must be defined before
    this function is called.

    Args:
        seed: Random seed applied before the model is instantiated and
            forwarded to ``TrainingArguments``.

    Returns:
        ``(trainer1, trainer2)``. Both wrap the same model object and the same
        training arguments and train on ``dataset['train']``; ``trainer1``
        evaluates on ``dataset['test1']`` and ``trainer2`` on
        ``dataset['test2']``.
    """
    uni_labels = ['Continue', "shift"]

    # The "gpt2" checkpoint carries no classification head, so the head is
    # freshly initialised at load time. Seed first so repeated runs start from
    # the same weights.
    set_seed(seed)
    gpt2_model = GPT2ForSequenceClassification.from_pretrained(
        "gpt2", num_labels=len(uni_labels)
    )
    gpt2_model.resize_token_embeddings(len(tokenizer))

    # Keep both directions of the label mapping in sync; setting only id2label
    # would leave label2id at its default LABEL_<n> names in the saved config.
    gpt2_model.config.id2label = dict(enumerate(uni_labels))
    gpt2_model.config.label2id = {label: idx for idx, label in enumerate(uni_labels)}

    # Mirror the tokenizer setup (pad_token = eos_token): the model config
    # needs a pad id to handle padded batches.
    gpt2_model.config.pad_token_id = gpt2_model.config.eos_token_id

    # Evaluate and checkpoint once per epoch. The two strategies must match
    # for load_best_model_at_end; with no metric_for_best_model set, the best
    # checkpoint is picked by evaluation loss.
    training_args = TrainingArguments(
        output_dir='./results_gpt',
        num_train_epochs=2,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.05,
        logging_dir='./logs_gpt',
        load_best_model_at_end=True,
        logging_steps=1,
        log_level='info',
        evaluation_strategy='epoch',
        save_strategy='epoch',
        seed=seed,
    )

    def _build_trainer(eval_split):
        # One Trainer per evaluation split; everything else is shared.
        return Trainer(
            model=gpt2_model,
            args=training_args,
            train_dataset=dataset['train'],
            eval_dataset=dataset[eval_split],
            compute_metrics=compute_metrics,
            data_collator=data_collator,
        )

    trainer1 = _build_trainer('test1')
    trainer2 = _build_trainer('test2')

    return trainer1, trainer2

In [8]:
if __name__ == '__main__':
    #============================================
    # Build the sentence-level DataFrame with the Data class from load_data.
    # The frame has a "text" column (one sentence per row) and a "label"
    # column (0: continue, 1: change).
    path ='hotels.json'
    data = Data(path)
    load_dataset()

    # Save the initial DataFrame as a JSON file. DataFrame.to_json returns
    # None when given a path, so its result is deliberately not bound.
    data.df.to_json("df.json")

    #============================================
    # Preprocessing data for modeling.
    # Work on a copy and keep only the first 4000 rows; the training
    # sequences are generated from these rows.
    t = data.df.copy()
    t = t[:4000]

    # Split sentences into tokens and label the tokens.
    train_dataset = Train_dataset(t)
    train_dataset.test_text = t['text'].values
    train_dataset.test_label = t['label'].values

    train_dataset.split_token_sentences()
    train_dataset.tokenized_text_label = train_dataset.flatten_list(train_dataset.tokenized_text_label)
    train_dataset.tokenized_text = train_dataset.flatten_list(train_dataset.tokenized_text)

    # Generate the sequences used for training by combining tokens, and label
    # the sequences.
    train_dataset.generate_test_dataset()

    # Generate the datasets for training, validating and testing.
    train_dataset.datasets_for_training()

    # Tokenize with the GPT-2 tokenizer. No pad token is configured after
    # loading, so the EOS token is reused for padding.
    dataset = train_dataset.dataset

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token

    dataset = dataset.map(preprocess_function, batched=True)
    dataset = dataset.remove_columns(['text'])
    dataset = dataset.rename_column("label", "labels")

    # Pad each batch dynamically to its longest sequence.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

    # Build the model and the two trainers (they share one model).
    trainer1, trainer2 = training_model()

    # Train on the preprocessed dataset; trainer1 evaluates on dataset['test1']
    # at the end of every epoch.
    trainer1.train()

    # Optional follow-up evaluations:
    #trainer1.evaluate()   # trained model on dataset['test1']
    #trainer2.evaluate()   # trained model on dataset['test2']

    # NOTE(review): the directory name mentions bert/distilbert, but the model
    # saved here is the GPT-2 classifier. The path is left unchanged so that
    # anything loading from it keeps working.
    trainer1.save_model('./model/bert_distilbert_seq_token')

Map:   0%|          | 0/10834 [00:00<?, ? examples/s]

Map:   0%|          | 0/3611 [00:00<?, ? examples/s]

Map:   0%|          | 0/3612 [00:00<?, ? examples/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
***** Running training *****
  Num examples = 10834
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5418


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0,0.138604,0.957906
2,0.0,0.152401,0.960122


***** Running Evaluation *****
  Num examples = 3611
  Batch size = 8
Saving model checkpoint to ./results_gpt\checkpoint-2709
Configuration saved in ./results_gpt\checkpoint-2709\config.json
Model weights saved in ./results_gpt\checkpoint-2709\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3611
  Batch size = 8
Saving model checkpoint to ./results_gpt\checkpoint-5418
Configuration saved in ./results_gpt\checkpoint-5418\config.json
Model weights saved in ./results_gpt\checkpoint-5418\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results_gpt\checkpoint-2709 (score: 0.13860446214675903).
Saving model checkpoint to ./model/bert_distilbert_seq_token
Configuration saved in ./model/bert_distilbert_seq_token\config.json
Model weights saved in ./model/bert_distilbert_seq_token\pytorch_model.bin
