In [1]:


import torch
import transformers
import pandas as pd

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM




# Step 4: Fine-tuning Setup
from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.utils.data import Dataset

In [2]:
# pip install transformers

In [3]:
df = pd.read_csv('/content/code_of_conduct.csv')

In [4]:
df.head()

Unnamed: 0,Questions,Answers
0,What is the primary importance of honesty and ...,The primary importance of honesty and integrit...
1,What should police officers avoid in their dea...,Police officers should avoid being improperly ...
2,What responsibility do police officers have re...,Police officers have a responsibility to act w...
3,How does police behavior affect the image of t...,Police behavior reflects the image of the enti...
4,What is the stance on corruption for police of...,Police officials should not commit any act of ...


In [5]:

# Step 3: Choose a Pre-trained LLM (e.g., GPT-2 from Hugging Face)
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to('cuda')  # Use .to('cuda') to move the model to GPU

In [6]:

import string

def preprocess_text(text):
    # Remove punctuation and lowercase text
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    return text

In [7]:

df['Questions'] = df['Questions'].apply(preprocess_text)
df['Answers'] = df['Answers'].apply(preprocess_text)

In [8]:

qa_pairs = [
    f"Q: {question} A: {answer}"
    for question, answer in zip(df['Questions'], df['Answers'])
]

In [9]:


# Define a padding token and add it to the tokenizer
padding_token = "[PAD]"
tokenizer.pad_token = padding_token

# Tokenize the question-answer pairs with padding
tokenized_data = tokenizer(
    qa_pairs,
    truncation=True,
    padding=True,
    return_tensors="pt"
)

In [10]:


class QADataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx]
        }

In [11]:

qa_dataset = QADataset(tokenized_data)


In [12]:

# Fine-tuning Process
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    num_train_epochs=300,
    logging_dir='./logs',
    logging_steps=100,
    output_dir='./model'
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=qa_dataset
)

In [13]:

trainer.train()



Step,Training Loss
100,0.9628
200,0.1752
300,0.1448
400,0.131
500,0.1258
600,0.1217
700,0.1207
800,0.1187
900,0.1163
1000,0.1162


TrainOutput(global_step=2100, training_loss=0.16062691279820032, metrics={'train_runtime': 242.1047, 'train_samples_per_second': 32.217, 'train_steps_per_second': 8.674, 'total_flos': 155244211200000.0, 'train_loss': 0.16062691279820032, 'epoch': 300.0})

In [14]:

from transformers import pipeline

chatbot = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)


question = "What is the typical dress code or appearance expected of police officers while they are on duty?"
input_text = "Q: " + question

input_ids = tokenizer.encode(input_text, return_tensors="pt").to('cuda')  # Move input to GPU

input_str = tokenizer.decode(input_ids[0], skip_special_tokens=True)
response = chatbot(input_str, max_length=50, num_return_sequences=1)

generated_text = response[0]['generated_text']
print(generated_text)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What is the typical dress code or appearance expected of police officers while they are on duty? A: police officers unless dictated otherwise by their duties should always be well turned out in the performance of their duties with courtesy and respect for the highest standards


In [15]:
df['Answers'][0]

'the primary importance of honesty and integrity for police officers is to ensure that the public'

In [16]:
# !pip install datasets transformers==4.28.0