In [1]:
import multiprocessing
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import transformers

from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoConfig
from transformers import BertForMaskedLM, DistilBertForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from tokenizers import BertWordPieceTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os  # Import the os module

# Set environment variable for PyTorch CUDA allocation configuration
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:50'



In [3]:
# HYPERPARAMS
SEED_SPLIT = 0  # Seed for splitting dataset
SEED_TRAIN = 0  # Seed for training
MAX_SEQ_LEN = 224  # Maximum sequence length
TRAIN_BATCH_SIZE = 16  # Batch size for training
EVAL_BATCH_SIZE = 16  # Batch size for evaluation
LEARNING_RATE = 2e-5  # Learning rate
LR_WARMUP_STEPS = 100  # Number of warmup steps for learning rate
WEIGHT_DECAY = 0.01  # Weight decay for regularization


In [4]:
# load data
df = pd.read_excel('dataset/preprocessed_updated_20_search_and_cf_data-2.xlsx')
df.columns

Index(['veh_model', 'veh year', 'veh_loc', 'veh_mile', 'cust_complaint',
       'repr_comments', 'cmpnt_cat_desc', 'cmpnt_code', 'cmpnt_symp_txt',
       'TREAD_cat'],
      dtype='object')

In [5]:
#dataset cleaning
punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~।|'''
def punctuations_remover(text):
    no_punct = " " 
    for char in text:
        if char not in punctuations and char.isnumeric() == False :
            no_punct = no_punct + char
    return no_punct.lower().strip()

In [6]:
df['corpus'] = df.cust_complaint+df.repr_comments
df['corpus'] = [punctuations_remover(text) for text in df['corpus']]
df['corpus']

1     the airbag is making a buzzing noise very anno...
2     my audis fuel gauge is stuck showing full even...
4     the air suspension system of my audi seems to ...
5     the air suspension system of my audi seems to ...
7     my audis fuel gauge is stuck showing full even...
8     the air suspension system of my audi seems to ...
9     my audis fuel gauge is stuck showing full even...
10    my audis fuel gauge is stuck showing full even...
11    the air suspension system of my audi seems to ...
12    the air suspension system of my audi seems to ...
13    my audis fuel gauge is stuck showing full even...
14    the air suspension system of my audi seems to ...
15    my audis fuel gauge is stuck showing full even...
16    having blinking problem with cabin light at in...
17    light of interior cabin does not turn onreplac...
Name: corpus, dtype: object

In [7]:
# Train/Valid Split
# Splitting the DataFrame into training and validation sets
df_train, df_valid = train_test_split(
    df, test_size=0.15, random_state=SEED_SPLIT
)
# Displaying the lengths of the training and validation sets
len(df_train), len(df_valid)


(15, 3)

In [8]:
# Convert to Dataset object ## this is very very important
# Converting the training DataFrame to a Dataset object, dropping NaN values from the 'corpus' column
train_dataset = Dataset.from_pandas(df_train[['corpus']].dropna())

# Converting the validation DataFrame to a Dataset object, dropping NaN values from the 'corpus' column
valid_dataset = Dataset.from_pandas(df_valid[['corpus']].dropna())


In [9]:
MODEL = 'bert'  # Specify the model type
bert_type = 'bert-base-uncased'  # Specify the type of BERT model

# Selecting the tokenizer and model class based on the specified MODEL
if MODEL == 'distilbert':
    TokenizerClass = DistilBertTokenizer 
    ModelClass = DistilBertForMaskedLM 
elif MODEL == 'bert':
    TokenizerClass = BertTokenizer
    ModelClass = BertForMaskedLM 
elif MODEL == 'roberta':
    TokenizerClass = RobertaTokenizer
    ModelClass = RobertaForMaskedLM
elif MODEL == 'scibert':
    TokenizerClass = AutoTokenizer
    ModelClass = AutoModelForMaskedLM
    
# Instantiating the tokenizer with specified parameters
tokenizer = BertTokenizer.from_pretrained(
            bert_type, use_fast=True, do_lower_case=False, max_len=MAX_SEQ_LEN
            )

# Instantiating the model with the specified BERT type
model = ModelClass.from_pretrained(bert_type)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
## Tokenize the dataset

def tokenize_function(row):
    from transformers import BertTokenizer
    MAX_SEQ_LEN = 512  # Define the constant directly within the function
    
    # Instantiate the tokenizer with specified parameters
    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased', use_fast=True, do_lower_case=False, max_len=MAX_SEQ_LEN
    )
    
    # Tokenize the text in the 'corpus' column
    return tokenizer(
        row['corpus'],
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_special_tokens_mask=True
    )

# Get column names from the train_dataset
column_names = train_dataset.column_names

# Tokenize the training dataset using multiprocessing
train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=multiprocessing.cpu_count(),  # Use all available CPU cores
    remove_columns=column_names,  # Remove original text column after tokenization
)

# Tokenize the validation dataset using multiprocessing
valid_dataset = valid_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=multiprocessing.cpu_count(),  # Use all available CPU cores
    remove_columns=column_names,  # Remove original text column after tokenization
)


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)
num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.
                                                                                                                       

In [11]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)  # Define data collator for language modeling

steps_per_epoch = int(len(train_dataset) / TRAIN_BATCH_SIZE)  # Calculate steps per epoch

# Define training arguments
training_args = TrainingArguments(
    output_dir='my_bert',  # Directory to save model checkpoints
    logging_dir='LMlogs',  # Directory for training logs
    num_train_epochs=10,  # Number of training epochs
    do_train=True,  # Perform training
    do_eval=True,  # Perform evaluation
    per_device_train_batch_size=TRAIN_BATCH_SIZE,  # Batch size for training
    per_device_eval_batch_size=EVAL_BATCH_SIZE,  # Batch size for evaluation
    warmup_steps=LR_WARMUP_STEPS,  # Number of warmup steps for learning rate
    save_steps=steps_per_epoch,  # Save model checkpoints at this interval
    save_total_limit=3,  # Limit the total number of saved checkpoints
    weight_decay=WEIGHT_DECAY,  # Weight decay for regularization
    learning_rate=LEARNING_RATE,  # Learning rate
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
    save_strategy='epoch',  # Save model at the end of each epoch
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model='loss',  # Metric for determining the best model
    greater_is_better=False,  # Lower loss is better
    seed=SEED_TRAIN  # Seed for reproducibility
)

# Define the trainer
trainer = Trainer(
    model=model,  # The model to be trained
    args=training_args,  # Training arguments
    data_collator=data_collator,  # Data collator
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=valid_dataset,  # Evaluation dataset
    tokenizer=tokenizer,  # Tokenizer
)

# Start training
trainer.train()

# Save the fine-tuned model
trainer.save_model("Bert_FineTuned")


The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 10
  Number of trainable parameters = 109514298


OutOfMemoryError: CUDA out of memory. Tried to allocate 896.00 MiB (GPU 0; 4.00 GiB total capacity; 10.51 GiB already allocated; 0 bytes free; 10.71 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
###### 