In [None]:
!pip install transformers[torch]
!pip install accelerate -U

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch.utils.data import Dataset



In [None]:
# Read the sample CSV into a DataFrame.
# Point `file_path` at your own copy of the dataset if it lives elsewhere.
file_path = 'LLM-Sample-Input-File.csv'
dataset = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Function to transform dataset to question-answer format
def transform_to_qa_format(df):
    """Turn every revenue row of ``df`` into two extractive-QA records.

    Args:
        df: DataFrame providing the columns 'Company Name', 'Sub Cat',
            'Period' and 'Value - Randomized'. Any other column is ignored.

    Returns:
        A list of dicts, two per input row and in row order, each holding the
        keys 'question', 'context' and 'answer'. The answer is the row's value
        rendered through an f-string and occurs verbatim inside the context.

    Raises:
        KeyError: if one of the four required columns is missing.
    """
    qa_pairs = []
    for _, row in df.iterrows():
        company = row['Company Name']
        sub_cat = row['Sub Cat']
        period = row['Period']
        value = row['Value - Randomized']

        # Both phrasings ask for the same figure, so they share one answer string.
        answer = f"{value}"

        qa_pairs.append({
            'question': f"How much revenue did {company} make from {sub_cat} in {period}?",
            'context': f"{company} made {value} revenue from {sub_cat} in {period}.",
            'answer': answer,
        })
        qa_pairs.append({
            'question': f"What was the revenue of {company} in {sub_cat} during {period}?",
            'context': f"In {period}, {company}'s revenue from {sub_cat} was {value}.",
            'answer': answer,
        })

    return qa_pairs


In [None]:
# Tokenization: load the tokenizer that belongs to the chosen checkpoint.
model_name = "distilbert-base-uncased"  # Example model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Transforming the data: expand every dataset row into QA records.
transformed_data = transform_to_qa_format(dataset)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Your custom dataset class
class CustomDataset(Dataset):
    """Torch dataset that tokenizes QA records and labels the answer span.

    Every record is a dict with 'question', 'context' and 'answer' keys. An
    item is a dict of the tokenizer's tensors (batch axis removed) plus scalar
    'start_positions' / 'end_positions' tensors holding the *token* indices of
    the answer inside the encoded ``question + context`` sequence.

    The tokenizer's encoding must support ``char_to_token`` (Hugging Face
    "fast" tokenizers do); a slow tokenizer will raise when an item is read.
    """

    def __init__(self, tokenizer, data):
        """Keep references to the tokenizer and the list of QA records."""
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        """Return the number of QA records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and attach token-level answer labels."""
        item = self.data[idx]
        context = item['context']
        question = item['question']
        answer = item['answer']

        # Tokenize the question and context as one padded sequence pair.
        encoding = self.tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        start_token, end_token = self._answer_token_span(encoding, context, answer)

        # return_tensors='pt' adds a leading batch axis of size 1; drop it so
        # the default collator can stack items into a batch.
        features = {key: encoding[key].squeeze(0) for key in encoding}
        features['start_positions'] = torch.tensor(start_token)
        features['end_positions'] = torch.tensor(end_token)
        return features

    @staticmethod
    def _answer_token_span(encoding, context, answer):
        """Map the answer's character span in ``context`` to token indices.

        Returns ``(0, 0)`` when the answer is empty, is not found in the
        context, or its characters have no token (for example because that
        part of the context was truncated away). Index 0 is the first token of
        the sequence, which is the leading special token for BERT-style
        tokenizers.
        """
        start_char = context.find(answer) if answer else -1
        if start_char < 0:
            return 0, 0
        end_char = start_char + len(answer) - 1

        # sequence_index=1 selects the context (second sequence of the pair);
        # the character offsets are relative to the context string itself.
        start_token = encoding.char_to_token(0, start_char, sequence_index=1)
        end_token = encoding.char_to_token(0, end_char, sequence_index=1)
        if start_token is None or end_token is None:
            return 0, 0
        return start_token, end_token

In [None]:
import accelerate
import transformers

# Print the installed version of each library, one per line.
print(
    *(
        f"{label} {lib.__version__}"
        for label, lib in (
            ("Accelerate version:", accelerate),
            ("Transformers version:", transformers),
        )
    ),
    sep="\n",
)


Accelerate version: 0.24.1
Transformers version: 4.35.2


In [None]:
# Load the model onto the GPU when one is present, otherwise stay on the CPU
# (same guard as the fp16 flag in the training arguments below).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)

# Splitting the data into training and evaluation sets:
# the first 300 QA records are used for training, the remainder is held out.
train_data = transformed_data[:300]
eval_data = transformed_data[300:]

# Prepare the datasets
train_dataset = CustomDataset(tokenizer, train_data)
eval_dataset = CustomDataset(tokenizer, eval_data)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # 300 examples at batch size 16 for 3 epochs is only 57 optimizer steps on
    # one device, so a fixed 500-step warmup would never complete and the
    # learning rate would stay far below its target. Warm up over the first
    # 10% of the run instead.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="none",  # Disable logging integrations (e.g. W&B, TensorBoard)
    fp16=torch.cuda.is_available(),  # Use mixed precision if CUDA is available
)

NameError: ignored

In [None]:
# Build the Trainer from the model, its training arguments and both data splits.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,  # held-out split
    train_dataset=train_dataset,
)

In [None]:
# Sanity check: show the first generated QA record (question, context, answer).
print(transformed_data[0])

{'question': 'How much revenue did Potato Inc. make from Americas in 2015 Q1?', 'context': 'Potato Inc. made 183000000000.0 revenue from Americas in 2015 Q1.', 'answer': '183000000000.0'}


In [None]:
# Train the model: runs fine-tuning as configured by `training_args`; with the
# per-epoch evaluation strategy a validation loss is reported after each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss
1,6.2783,6.198213
2,6.1709,5.970425
3,5.8438,5.38187


TrainOutput(global_step=57, training_loss=6.049180081016139, metrics={'train_runtime': 40.4538, 'train_samples_per_second': 22.248, 'train_steps_per_second': 1.409, 'total_flos': 117587790028800.0, 'train_loss': 6.049180081016139, 'epoch': 3.0})

In [None]:
# Write the fine-tuned model and the tokenizer files into the same directory.
# The tokenizer call stays last: its return value (the written file paths) is
# what the cell displays.
model_path = "./fine_tuned_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [None]:

# Reload both artifacts from the directory written by save_pretrained(), so the
# tokenizer used for inference is exactly the one stored next to the model.
model_path = "./fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

In [None]:
def ask_question(context, question):
    """Extract the answer to ``question`` from ``context``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        context: Passage that is expected to contain the answer.
        question: Natural-language question about the passage.

    Returns:
        The decoded text of the predicted answer span. The span always
        contains at least one token because the end is searched from the
        predicted start onwards.
    """
    # truncation=True keeps over-long inputs within the tokenizer's maximum
    # length instead of letting the model fail on them.
    inputs = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,
        truncation=True,
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"][0].tolist()

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    start_scores = outputs.start_logits[0]
    end_scores = outputs.end_logits[0]

    # Most likely start token, then the most likely end token at or after it.
    # Picking the two independently can yield end < start and an empty answer.
    answer_start = int(torch.argmax(start_scores))
    answer_end = answer_start + int(torch.argmax(end_scores[answer_start:])) + 1

    # Convert the tokens to the answer string
    tokens = tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    return tokenizer.convert_tokens_to_string(tokens)

In [None]:
# Try the reloaded model on one hand-written question/context pair.
question = "How much revenue does Potato Inc. make from selling Smartphones?"
context = "Potato Inc. made 200 million in revenue from smartphones in 2021."  # Example context

answer = ask_question(context, question)
print("Answer:", answer)

Answer: potato inc. made 200 million in revenue from
