In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained model and tokenizer from Hugging Face
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

print("Environment setup complete.")


  from .autonotebook import tqdm as notebook_tqdm


Environment setup complete.


In [2]:


# Step 2: Import Libraries
from datasets import load_dataset
import json

# Step 3: Load Emergency Response Protocols
with open('ics_protocols.json', 'r') as f:
    ics_protocols_data = json.load(f)

with open('emd_protocols.json', 'r') as f:
    emd_protocols_data = json.load(f)

# Combine both protocols into one list
emergency_protocols = ics_protocols_data['ics_protocols'] + emd_protocols_data['emd_protocols']

# Step 4: Load General Conversational Dataset
convai2_dataset = load_dataset('conv_ai_2', split='train', trust_remote_code=True)

# Step 5: Preprocess the Data
def preprocess_conversations(dataset):
    conversations = []
    for data in dataset:
        dialog = data['dialog']
        if len(dialog) > 1:
            # Use the last user message as input and the last bot message as response
            conversation = {
                "input": dialog[-2]['text'],  # Last user message
                "response": dialog[-1]['text']  # Last bot message
            }
            conversations.append(conversation)
    return conversations

def preprocess_protocols(protocols):
    processed_protocols = []
    for protocol in protocols:
        scenario = protocol.get("scenario", "unknown scenario")
        steps = protocol.get("steps", [])
        if steps:
            processed_protocols.append({
                "scenario": scenario,
                "steps": steps
            })
    return processed_protocols

# Preprocess the ConvAI2 dataset
preprocessed_conversations = preprocess_conversations(convai2_dataset)

# Preprocess the emergency protocols
preprocessed_protocols = preprocess_protocols(emergency_protocols)

# Save preprocessed conversations to a JSON file
with open('conversations.json', 'w') as f:
    json.dump(preprocessed_conversations, f)

# Save preprocessed emergency protocols to a JSON file
with open('processed_emergency_protocols.json', 'w') as f:
    json.dump(preprocessed_protocols, f)

print("Data collection and preprocessing complete.")


Data collection and preprocessing complete.


In [4]:
# File: model_selection_and_fine_tuning.ipynb

# Step 1: Ensure Proper Installation
!pip install accelerate -U
!pip install transformers[torch]
!pip install pandas

# Verify the installations
import accelerate
import transformers

print("Accelerate version:", accelerate.__version__)
print("Transformers version:", transformers.__version__)

# Step 2: Import Libraries
import json
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

# Step 3: Load Preprocessed Data
with open('conversations.json', 'r') as f:
    conversations_data = json.load(f)

with open('processed_emergency_protocols.json', 'r') as f:
    protocols_data = json.load(f)

# Load the CSV file
csv_file_path = '911.csv'
csv_data = pd.read_csv(csv_file_path)

# Display the first few rows of the CSV data
print(csv_data.head())

# Combine relevant data from CSV into our training data
def extract_csv_data(row):
    description = row['desc']
    title = row['title']
    timestamp = row['timeStamp']
    township = row['twp']
    address = row['addr']
    return f"Emergency: {title}\nDescription: {description}\nTimestamp: {timestamp}\nTownship: {township}\nAddress: {address}"

csv_texts = csv_data.apply(extract_csv_data, axis=1).tolist()

# Step 4: Combine Datasets and Reduce Size
combined_data = conversations_data[:1000] + protocols_data[:200] + [{"input": text, "response": ""} for text in csv_texts[:200]]

# Step 5: Prepare Dataset for Training
def prepare_data(data):
    texts = []
    for item in data:
        if 'input' in item and 'response' in item:
            # Prepare conversation data
            texts.append(f"User: {item['input']}\nAI: {item['response']}")
        elif 'scenario' in item and 'steps' in item:
            # Prepare protocol data
            steps_text = "\n".join(item['steps'])
            texts.append(f"Scenario: {item['scenario']}\nSteps:\n{steps_text}")
    return texts

# Prepare the text data
training_texts = prepare_data(combined_data)

# Create a dataset from the text data
dataset = Dataset.from_dict({"text": training_texts})

# Tokenize the data
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the padding token to the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    # Encode the inputs and add padding and truncation
    inputs = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)  # Reduce max_length to 128
    inputs['labels'] = inputs['input_ids'].copy()  # Copy input_ids to labels
    return inputs

tokenized_data = dataset.map(tokenize_function, batched=True)

# Step 6: Fine-Tune the Model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # Reduce number of epochs to 1
    per_device_train_batch_size=1,  # Keep batch size to 1
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=500,
    evaluation_strategy="no",
    disable_tqdm=False,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
)

# Train the model
trainer.train()

# Step 7: Save the Fine-Tuned Model
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

print("Model fine-tuning complete and saved to './fine_tuned_model'.")


Accelerate version: 0.32.1
Transformers version: 4.42.3

         lat        lng                                               desc  \
0  40.297876 -75.581294  REINDEER CT & DEAD END;  NEW HANOVER; Station ...   
1  40.258061 -75.264680  BRIAR PATH & WHITEMARSH LN;  HATFIELD TOWNSHIP...   
2  40.121182 -75.351975  HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-St...   
3  40.116153 -75.343513  AIRY ST & SWEDE ST;  NORRISTOWN; Station 308A;...   
4  40.251492 -75.603350  CHERRYWOOD CT & DEAD END;  LOWER POTTSGROVE; S...   

       zip                    title            timeStamp                twp  \
0  19525.0   EMS: BACK PAINS/INJURY  2015-12-10 17:10:52        NEW HANOVER   
1  19446.0  EMS: DIABETIC EMERGENCY  2015-12-10 17:29:21  HATFIELD TOWNSHIP   
2  19401.0      Fire: GAS-ODOR/LEAK  2015-12-10 14:39:21         NORRISTOWN   
3  19401.0   EMS: CARDIAC EMERGENCY  2015-12-10 16:47:36         NORRISTOWN   
4      NaN           EMS: DIZZINESS  2015-12-10 16:56:52   LOWER POTTSGROVE   


Map: 100%|██████████| 700/700 [00:00<00:00, 2625.23 examples/s]
 71%|███████▏  | 500/700 [20:26<07:51,  2.36s/it]

{'loss': 0.6794, 'grad_norm': 10.718127250671387, 'learning_rate': 1.4285714285714285e-05, 'epoch': 0.71}


100%|██████████| 700/700 [28:41<00:00,  2.46s/it]


{'train_runtime': 1721.2671, 'train_samples_per_second': 0.407, 'train_steps_per_second': 0.407, 'train_loss': 0.6416927664620535, 'epoch': 1.0}
Model fine-tuning complete and saved to './fine_tuned_model'.


In [7]:
# File: test_fine_tuned_model.ipynb

# Step 1: Import Necessary Libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Step 2: Load the Fine-Tuned Model and Tokenizer
model_path = './fine_tuned_model'
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Ensure the padding token is set
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Step 3: Generate a Response for a Sample Prompt
def generate_response(prompt, max_length=100):
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.pad_token_id)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Sample Prompt
sample_prompt = "What to do during an heart attack?"

# Generate and Print the Response
response = generate_response(sample_prompt)
print("Prompt:", sample_prompt)
print("Response:", response)


Prompt: What to do during an heart attack?
Response: What to do during an heart attack?
Steps:
Call 911 immediately.
Keep the person calm and still.
Keep the person calm and still.
Keep the person calm and still.
Monitor the person's breathing and be prepared to perform CPR if necessary.
Monitor the person's pulse and be prepared to perform CPR if necessary.
Monitor the person's breathing and be prepared to perform CPR if necessary.
Monitor the person's pulse and be prepared to perform CPR if necessary
