In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import Dataset, DatasetDict
import numpy as np
import os
import torch

def read_jsonl(file_path, nrows=None):
    """Load a JSON Lines file into a DataFrame.

    Args:
        file_path: Location of the file; every line is parsed as one JSON record.
        nrows: Optional cap on the number of lines to read. ``None`` reads them all.

    Returns:
        pandas.DataFrame with one row per parsed line.
    """
    frame = pd.read_json(file_path, lines=True, nrows=nrows)
    return frame


# Directory holding the attribute_* JSON Lines files. It defaults to the
# original workstation location; set the LLM_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get('LLM_DATA_DIR', 'C:/Users/Administrator.DUCS-GPU/Desktop/LLM_data')

# `.data` files hold the input records, `.solution` files the matching labels.
train_data = read_jsonl(f'{DATA_DIR}/attribute_train.data')
train_solution = read_jsonl(f'{DATA_DIR}/attribute_train.solution')
test_data = read_jsonl(f'{DATA_DIR}/attribute_test.data')
#test_solution = read_jsonl('./data/attribute_test.solution', nrows=200)
val_data = read_jsonl(f'{DATA_DIR}/attribute_val.data')
val_solution = read_jsonl(f'{DATA_DIR}/attribute_val.solution')

def _build_input_text(row):
    """Format one record's title, store and manufacturer as the model prompt."""
    return f"title: {row['title']} store: {row['store']} details_Manufacturer: {row['details_Manufacturer']}"


def _build_target_text(row):
    """Format one record's brand and L0-L4 categories as the expected output."""
    return f"details_Brand: {row['details_Brand']} L0_category: {row['L0_category']} L1_category: {row['L1_category']} L2_category: {row['L2_category']} L3_category: {row['L3_category']} L4_category: {row['L4_category']}"


def preprocess_data(data, solution=None):
    """Turn raw records into the text columns used for training or inference.

    Args:
        data: DataFrame with ``title``, ``store`` and ``details_Manufacturer``
            columns (plus ``indoml_id`` when ``solution`` is given).
        solution: Optional DataFrame joined to ``data`` on ``indoml_id``; it
            must provide ``details_Brand`` and ``L0_category``..``L4_category``.

    Returns:
        A new DataFrame with ``input_text`` and, when ``solution`` is given,
        ``target_text``. The caller's ``data`` frame is never modified.
    """
    if solution is None:
        # Build a fresh frame instead of adding a column to `data`, so calling
        # this function has no side effect on the caller's DataFrame.
        return pd.DataFrame({'input_text': data.apply(_build_input_text, axis=1)})

    # pd.merge defaults to an inner join: rows without a matching solution drop out.
    merged = pd.merge(data, solution, on='indoml_id')
    merged['input_text'] = merged.apply(_build_input_text, axis=1)
    merged['target_text'] = merged.apply(_build_target_text, axis=1)
    return merged[['input_text', 'target_text']]


# Build the text columns for each split. No solution frame is passed for the
# test split, so it only gets `input_text`.
train_processed = preprocess_data(train_data, train_solution)
test_processed = preprocess_data(test_data)
val_processed = preprocess_data(val_data, val_solution)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_processed)
test_dataset = Dataset.from_pandas(test_processed)
val_dataset = Dataset.from_pandas(val_processed)

# Only the labelled splits go into the DatasetDict used for training; the test
# split stays in the separate `test_dataset` variable.
dataset_dict = DatasetDict({
    'train': train_dataset,
    #'test': test_dataset,
    'validation': val_dataset
})

In [None]:
# Load the pretrained 't5-small' tokenizer and seq2seq model that the cells
# below fine-tune.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of input/target text pairs for seq2seq training.

    Args:
        examples: Batch mapping with ``input_text`` and ``target_text`` lists.

    Returns:
        The tokenizer output for the inputs (padded/truncated to 128 tokens)
        with an added ``labels`` entry: the target token ids, also 128 long,
        where every padding position is set to -100.
    """
    inputs = examples['input_text']
    targets = examples['target_text']
    model_inputs = tokenizer(inputs, max_length=128, padding='max_length', truncation=True)
    labels = tokenizer(targets, max_length=128, padding='max_length', truncation=True)

    # The targets are padded to a fixed length, so most label positions are
    # padding. -100 is the index the loss ignores; without this replacement the
    # model is also trained (and evaluated) on predicting pad tokens.
    pad_token_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels['input_ids']
    ]
    return model_inputs

# Tokenize every split in the DatasetDict; batched=True hands preprocess_function
# lists of examples instead of one example at a time.
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

In [None]:
# Quick sanity check of the tokenized splits before they are saved.
print(tokenized_datasets)

In [None]:
# Cache the tokenized splits on disk so later runs can skip tokenization.
# NOTE(review): './' is the current working directory, so the dataset files are
# written directly into it; a dedicated sub-directory would be tidier, but the
# matching load_from_disk('./') call would have to change with it.
tokenized_datasets.save_to_disk('./')

In [None]:
from datasets import load_from_disk

# Reload the tokenized splits from the current working directory, the same
# location they were saved to with save_to_disk('./').
tokenized_datasets = load_from_disk('./')

# Training configuration: checkpoints go to ./results (at most 3 are kept),
# evaluation runs once per epoch, metrics are logged every 20 steps to ./logs,
# and no external experiment tracker is used.
# NOTE(review): learning_rate=2e-3 is higher than the 1e-4 to 3e-4 range usually
# suggested for fine-tuning T5 with AdamW, and a per-device batch of 500 needs a
# lot of GPU memory — confirm both are intentional.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-3,
    per_device_train_batch_size=500,
    per_device_eval_batch_size=500,
    num_train_epochs=50,
    weight_decay=0.01,
    save_total_limit=3,
    logging_dir='./logs',
    logging_steps=20,
    report_to='none',
)


class CustomCallback(TrainerCallback):
    """Trainer callback that echoes every logged metric to stdout."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the current global step followed by each logged key/value pair.

        Nothing is printed when ``logs`` is ``None``.
        """
        if logs is None:
            return

        print(f"Step: {state.global_step}")
        for metric_name, metric_value in logs.items():
            print(f"{metric_name}: {metric_value}")
        # Blank separator between consecutive log events.
        print("\n")

# Initialize the Trainer: fine-tune `model` on the tokenized train split,
# evaluate on the validation split, and print each log event via CustomCallback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    callbacks=[CustomCallback()],
)

# Start training (runs for the number of epochs set in training_args).
trainer.train()


In [None]:
# Final evaluation of the trained model on the validation split; only the loss
# is reported because no other metric function was given to the Trainer.
val_results = trainer.evaluate(eval_dataset=tokenized_datasets['validation'])
print(f"Validation Loss: {val_results['eval_loss']}")

#test_results = trainer.evaluate(eval_dataset=tokenized_datasets['test'])
#print(f"Test Loss: {test_results['eval_loss']}")

In [None]:
# Save the fine-tuned model and its tokenizer into the same directory so both
# can be reloaded together with from_pretrained.
model.save_pretrained('./fine_tuned_t5_full_50epochsdp')
tokenizer.save_pretrained('./fine_tuned_t5_full_50epochsdp')

# Next time run from here: the cells below reload the saved model from that
# directory, so training does not have to be repeated.

In [None]:
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from tqdm import tqdm

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Reload the fine-tuned checkpoint and its tokenizer from the directory written
# by save_pretrained, and move the model to the selected device.
model = T5ForConditionalGeneration.from_pretrained('./fine_tuned_t5_full_50epochsdp').to(device)
tokenizer = T5Tokenizer.from_pretrained('./fine_tuned_t5_full_50epochsdp')

# Put the model in evaluation mode before generating predictions.
model.eval()

# Prompts to run inference on. `test_dataset` is created by the data-preparation
# cell; when this cell is used as the restart point on a fresh kernel that name
# does not exist, so rebuild the same prompt strings straight from the raw file
# instead of failing with a NameError.
if 'test_dataset' in globals():
    test_data = test_dataset['input_text']
else:
    import os  # local imports: the notebook's first import cell may not have run
    import pandas as pd

    _data_dir = os.environ.get('LLM_DATA_DIR', 'C:/Users/Administrator.DUCS-GPU/Desktop/LLM_data')
    _test_frame = pd.read_json(f'{_data_dir}/attribute_test.data', lines=True)
    # Must stay identical to the input_text template used in preprocess_data.
    test_data = [
        f"title: {row['title']} store: {row['store']} details_Manufacturer: {row['details_Manufacturer']}"
        for _, row in _test_frame.iterrows()
    ]
#test_label = test_dataset['target_text']

def generate_text(inputs):
    """Generate one cleaned output string for each prompt in ``inputs``.

    The prompts are tokenized as a padded batch (truncated to 352 tokens), moved
    to ``device`` and decoded with ``model.generate`` (up to 128 output tokens)
    without gradient tracking. Special tokens are dropped from the decoded text,
    which is then passed through ``clean_repeated_patterns``.

    NOTE(review): training tokenized inputs with max_length=128, while this uses
    352 — confirm the longer inference window is intended.

    Args:
        inputs: List of prompt strings.

    Returns:
        List of cleaned generated strings, in the same order as ``inputs``.
    """
    encoded = tokenizer.batch_encode_plus(
        inputs,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=352,
    )
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        generated_ids = model.generate(**on_device, max_length=128)

    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return list(map(clean_repeated_patterns, decoded))

def extract_details(text):
    """Parse a generated string into (brand, L0, L1, L2, L3, L4) values.

    A string that contains all six labels in order is parsed exactly as before.
    If that strict match fails (for example the output was truncated or a label
    is missing), each field is looked up on its own so the values that are
    present are still returned; only the missing or empty ones become ``'na'``.

    Args:
        text: Generated text such as
            ``'details_Brand: X L0_category: A ... L4_category: E'``.

    Returns:
        Tuple of six strings in the order details_Brand, L0_category,
        L1_category, L2_category, L3_category, L4_category.
    """
    pattern = r'details_Brand: (.*?) L0_category: (.*?) L1_category: (.*?) L2_category: (.*?) L3_category: (.*?) L4_category: (.*)'
    match = re.match(pattern, text)
    if match:
        return match.groups()

    # Fallback: a field's value runs until the next label or the end of the text.
    any_label = r'(?:details_Brand|L[0-4]_category):'
    fields = ('details_Brand', 'L0_category', 'L1_category', 'L2_category', 'L3_category', 'L4_category')
    values = []
    for field in fields:
        found = re.search(rf'{field}: (.*?)(?= {any_label}|$)', text)
        value = found.group(1).strip() if found else ''
        values.append(value or 'na')
    return tuple(values)

def clean_repeated_patterns(text):
    """Drop anything the model repeats after the first L4_category value.

    The first ``' L4_category'`` label and its value are kept; the text is cut
    at the next field label that follows, which marks the start of a repeated
    pattern. Cutting at the first ``' L4_category'`` itself would remove the L4
    field and leave nothing for ``extract_details`` to match.

    Args:
        text: Decoded model output.

    Returns:
        The text up to the end of the first L4_category value, or ``text``
        unchanged when it contains no ``' L4_category'`` label.
    """
    marker = ' L4_category'
    head, found, rest = text.partition(marker)
    if not found:
        return text

    # Any label showing up after the L4 value means the output started over.
    restart_labels = (
        ' details_Brand:',
        ' L0_category:',
        ' L1_category:',
        ' L2_category:',
        ' L3_category:',
        marker,
    )
    cut = len(rest)
    for label in restart_labels:
        position = rest.find(label)
        if position != -1:
            cut = min(cut, position)
    cleaned_data = head + found + rest[:cut]
    return cleaned_data




In [None]:
# Run generation over the test prompts in fixed-size batches and collect one
# parsed (brand, L0..L4) tuple per prompt, in input order.
batch_size = 2048
generated_details = []
target_details = []

batch_starts = range(0, len(test_data), batch_size)
for start in tqdm(batch_starts, desc="Processing test data"):
    batch_inputs = test_data[start:start + batch_size]
    #batch_labels = test_label[start:start + batch_size] #you are not going to have this

    generated_texts = generate_text(batch_inputs)
    generated_details.extend(extract_details(text) for text in generated_texts)

print('Generated info extracted.............')

In [None]:
import json
categories = ['details_Brand', 'L0_category', 'L1_category', 'L2_category', 'L3_category', 'L4_category']

# Write one JSON object per line: the row id followed by the six predicted fields.
# NOTE(review): the position in generated_details is used as indoml_id — this
# presumably matches the ids in the test file (0-based, in file order); confirm.
with open('full_50epochs.predict', 'w') as predict_file:
    for indoml_id, details in enumerate(generated_details):
        result = {"indoml_id": indoml_id, **dict(zip(categories, details))}
        predict_file.write(json.dumps(result) + '\n')

In [None]:
import zipfile

# Package the prediction file into a zip archive for submission.
file_to_zip = 'full_50epochs.predict'
zip_file_name = 'full_50epochs.zip'

# arcname keeps the entry at the archive root under its plain file name.
with zipfile.ZipFile(zip_file_name, 'w') as zipf:
    zipf.write(file_to_zip, arcname=file_to_zip)