In [None]:
#!pip uninstall tensorflow -y
#!pip  install transformers==4.22.1 -q
#!pip install evaluate

In [1]:
import transformers
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast, \
     DataCollatorWithPadding, pipeline
from transformers import DataCollatorWithPadding
import evaluate
import pandas as pd
import numpy as np

In [2]:
import load_data
from load_data import Data
import train_datasets
from train_datasets import Train_dataset

In [3]:
def load_dataset():
    #Load a datafraom from the Data class from load_data
    data.handle_file()
    data.convert_json_to_dataframe()
    data.get_next_value()
    data.compare_values()
    data.label_sentences()
    data.initial_df()

In [4]:
def preprocess_function(dataset):
    #Mapping tokenizer with a dataset
    return tokenizer(dataset["text"], truncation=True, max_length=400)

In [5]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    #Setting evaluation metrics
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [6]:
def training_model():
    #Defining Model and training arguments 
    #Training model with splitted dataset
    uni_labels = ['Continue', "Change"]
    sequence_clf_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 2)
    sequence_clf_model.config.id2label = {i: l for i, l in enumerate(uni_labels)}
    sequence_clf_model.config.dropout =0.2

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=2,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.05,
        logging_dir='./logs',
        load_best_model_at_end=True,
        logging_steps=1,
        log_level='info',
        evaluation_strategy='epoch',
        eval_steps=100,
        save_strategy='epoch'
    )

    trainer1 = Trainer(
        model=sequence_clf_model,
        args=training_args,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['test1'],
        compute_metrics=compute_metrics,
        data_collator=data_collator
    )

    trainer2 = Trainer(
        model=sequence_clf_model,
        args=training_args,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['test2'],
        compute_metrics=compute_metrics,
        data_collator=data_collator
    ) 
    
    return trainer1, trainer2

In [7]:
if __name__ == '__main__':
    #============================================
    #Generate a df from Data class from data_load
    #A df contains "texc" column a sentence per a row and their lables(0: continue, 1: change)
    path ='hotels.json'
    data = Data(path)
    load_dataset()
    print("\n\nInitial Data")
    display(data.df[:3])
    print(data.df.shape)
    print(data.df['label'].value_counts())
    #save initial df as a json file
    df = data.df.to_json("df.json")

    #============================================
    #Preprocessing data for modeling
    #t = df.copy()
    t = data.df.copy()
    t = t[:4000]
    #4000 rows were used which will generate sequences.
    print("\n\nDataset to be used for modeling")
    print(t.shape)
    display(t[:5])
    #Converting columns to list for tokenizing

    #Splitting setences to tokens and labeling tokens
    train_dataset = Train_dataset(t) 
    train_dataset.test_text = t['text'].values
    train_dataset.test_label = t['label'].values
    
    train_dataset.split_token_sentences()
    train_dataset.tokenized_text_label = train_dataset.flatten_list(train_dataset.tokenized_text_label)
    train_dataset.tokenized_text = train_dataset.flatten_list(train_dataset.tokenized_text)
    print("\n\nSentences")
    print(train_dataset.text[:10])
    print("Tokenized sentences")
    print(train_dataset.tokenized_text[:20])
    print("Labels")
    print(train_dataset.tokenized_text_label[:20])

    #Generating sequnces to be used for training by combining tokens and labeling the sequences
    train_dataset.generate_test_dataset()
    print("\n\nGenerated Sequences")
    print(train_dataset.sequences[:10])
    print("Lables for each sequences")
    print(train_dataset.labels[:10])
    print("The number of generated Sequences")
    print(len(train_dataset.sequences))

    #Generating datasets for training, vaildating and testing
    train_dataset.datasets_for_training()
    print("\n\nDatasets for training and evaluating models")
    print(train_dataset.dataset, end="\n\n")

    #Applying DistilBertTokenizerFast for DistilBert model
    dataset = train_dataset.dataset

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', ignore_mismatched_sizes=True)
    tokenized_dataset = dataset.map(preprocess_function, batched=True)
    print("\n\nConverted Sequences")
    print(tokenized_dataset['train'][15], end="\n\n")

    #padding dataset
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    
    #training and saving models
    trainer1, trainer2 = training_model()
    
    #trainer1.evaluate()
    #tarining modesl with preprocessed dataset
    trainer1.train()
    #evaluation trained model
    #trainer1.evaluate()
    #evaluation model with another dataset
    #trainer2.evaluate()
    trainer1.save_model('./model/bert_distilbert_seq_token')



Initial Data


Unnamed: 0,text,label,count
0,Hello!,0,1
1,How can I help you?,1,5
2,"Hi,",0,1


(105030, 3)
0    53659
1    51371
Name: label, dtype: int64


Dataset to be used for modeling
(4000, 3)


Unnamed: 0,text,label,count
0,Hello!,0,1
1,How can I help you?,1,5
2,"Hi,",0,1
3,I would like to find a hotel.,1,7
4,Okay.,0,1




Sentences
['Hello!', 'How can I help you?', 'Hi,', 'I would like to find a hotel.', 'Okay.', 'else can you tell me about the hotel?', 'Find a hotel near the beachfront.', 'Okay beachfront in which town?', 'Oxnard California.', 'Okay.']
Tokenized sentences
['Hello!', 'How', 'can', 'I', 'help', 'you?', 'Hi,', 'I', 'would', 'like', 'to', 'find', 'a', 'hotel.', 'Okay.', 'else', 'can', 'you', 'tell', 'me']
Labels
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]


Generated Sequences
['Hello!', 'Hello! How', 'Hello! How can', 'Hello! How can I', 'Hello! How can I help', 'Hello! How can I help you?', 'Hi,', 'Hi, I', 'Hi, I would', 'Hi, I would like']
Lables for each sequences
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
The number of generated Sequences
18057


Datasets for training and evaluating models
DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 10834
    })
    test1: Dataset({
        features: ['label', 'text'],
        num_rows: 3611
    })


Map:   0%|          | 0/10834 [00:00<?, ? examples/s]

Map:   0%|          | 0/3611 [00:00<?, ? examples/s]

Map:   0%|          | 0/3612 [00:00<?, ? examples/s]



Converted Sequences
{'label': 0, 'text': "Hi, this is Raja from California. I'd like to find out more about a hotel. I plan to stay at Park City Peaks", 'input_ids': [101, 7632, 1010, 2023, 2003, 10164, 2013, 2662, 1012, 1045, 1005, 1040, 2066, 2000, 2424, 2041, 2062, 2055, 1037, 3309, 1012, 1045, 2933, 2000, 2994, 2012, 2380, 2103, 11373, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}



Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0004,0.098694,0.95486
2,0.0001,0.088414,0.959845


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3611
  Batch size = 64
Saving model checkpoint to ./results\checkpoint-678
Configuration saved in ./results\checkpoint-678\config.json
Model weights saved in ./results\checkpoint-678\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3611
  Batch size = 64
Saving model checkpoint to ./results\checkpoint-1356
Configuration saved in ./results\checkpoint-1356\config.json
Model weights saved in ./r