In [8]:
import json

def load_json(file_path):
    """Parse the JSON document stored at ``file_path``.

    The file is opened read-only as UTF-8 text and is closed again before
    the parsed value is handed back.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever object ``json.load`` produces for the file's contents.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as indented UTF-8 JSON.

    The data is serialised to a string *before* the destination is opened.
    Opening a file in ``'w'`` mode truncates it immediately, so serialising
    first means that data which cannot be encoded raises without wiping or
    half-writing a file that already exists at ``file_path``.

    Args:
        data: JSON-serialisable object to store.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If ``data`` contains a value ``json.dumps`` cannot encode.
    """
    # ensure_ascii=False keeps non-ASCII text as-is instead of \uXXXX escapes.
    serialized = json.dumps(data, indent=4, ensure_ascii=False)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(serialized)

def prepare_cross_encoder_dataset(json_data):
    """Flatten QA records into labelled (query, context) samples.

    For every record, the ``'Answer'`` followed by each entry of
    ``'Similar_answers'`` is emitted with score 1, then each entry of
    ``'Poor_answers'`` is emitted with score 0. Records are processed in
    input order.

    Args:
        json_data: Iterable of dicts with the keys ``'Question'``,
            ``'Answer'``, ``'Similar_answers'`` and ``'Poor_answers'``.

    Returns:
        A list of dicts, each with the keys ``'query'``, ``'context'`` and
        ``'score'``.
    """
    def _labelled(query, contexts, score):
        # One sample per context, all sharing the same query and label.
        return [
            {'query': query, 'context': ctx, 'score': score}
            for ctx in contexts
        ]

    dataset_samples = []
    for record in json_data:
        query = record['Question']

        # Relevant contexts: the reference answer first, then its paraphrases.
        relevant = [record['Answer']] + record['Similar_answers']
        dataset_samples.extend(_labelled(query, relevant, 1))

        # Irrelevant contexts.
        dataset_samples.extend(_labelled(query, record['Poor_answers'], 0))

    return dataset_samples

# Input QA pairs and the output location for the flattened samples.
qa_pairs_path = '../data/Final_QA_pairs.json'
ce_dataset_path = '../data/Cross_Encoder_Finetuning_Dataset.json'

# Read the raw QA records.
json_data = load_json(qa_pairs_path)

# Turn them into labelled (query, context, score) samples.
cross_encoder_dataset = prepare_cross_encoder_dataset(json_data)

# Write the samples back to disk.
save_json(cross_encoder_dataset, ce_dataset_path)

print("Dataset prepared and saved successfully!")


Dataset prepared and saved successfully!


In [28]:
#load from csv
import numpy as np
import pandas as pd

# Fixed seed so the same rows are held out every time this cell is run.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

data = pd.read_csv('../data/finetuning/ce_finetuning_dataset(41-60).csv', header=None, names=['query', 'context', 'score'])

# Prepare the test and train datasets.
# Per-query pieces are collected in lists and concatenated once at the end;
# calling pd.concat on a growing frame inside the loop re-copies every
# earlier row on each iteration.
test_parts = []
train_parts = []

# Iterate over each query's rows, separating test and train samples
for _, group in data.groupby('query'):
    positives = group[group['score'] == 1]
    negatives = group[group['score'] == 0]

    if positives.empty or negatives.empty:
        # There is no positive/negative pair to hold out for this query.
        # Keep all of its rows for training instead of dropping them from
        # both splits.
        train_parts.append(group)
        continue

    # Hold out exactly one positive and one negative per query for testing.
    held_out = pd.concat([
        positives.sample(n=1, random_state=split_rng),
        negatives.sample(n=1, random_state=split_rng),
    ])
    test_parts.append(held_out)

    # Every row of the query that was not held out goes to training.
    train_parts.append(group.drop(index=held_out.index))

# ignore_index=True gives both frames a clean 0..n-1 index. pd.concat raises
# on an empty list, so fall back to an empty frame with the same columns.
test_dataset = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame(columns=data.columns)
train_dataset = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame(columns=data.columns)

print("Test dataset sample size:", len(test_dataset))
print("Training dataset sample size:", len(train_dataset))
# Save test and train datasets to CSV files
test_dataset.to_csv('../data/finetuning/test_dataset.csv', index=False)
train_dataset.to_csv('../data/finetuning/train_dataset.csv', index=False)

print("Datasets saved successfully.")
print("Test dataset preview:")
print(test_dataset.head())
print("Training dataset preview:")
print(train_dataset.head())

Test dataset sample size: 40
Training dataset sample size: 1560
Datasets saved successfully.
Test dataset preview:
                                               query  \
0  A nurse is assessing a patient who has been re...   
1  A nurse is assessing a patient who has been re...   
2  A nurse is responding to a code blue in the em...   
3  A nurse is responding to a code blue in the em...   
4  A patient has been electrocuted and is now unr...   

                                             context  score  
0  (C) Chest Compressions Site of chest compressi...      1  
1  Placement of Defibrillation Pads for Children/...      0  
2  1.6:  OTHER  COMMON  CAUSES  OF  CARDIAC ARRES...      1  
3  3.2: AUTOMATED EXTERNAL DEFIBRILLATORS (AEDs):...      0  
4  (A) Ask someone to get an Automated External D...      1  
Training dataset preview:
                                               query  \
0  A nurse is assessing a patient who has been re...   
1  A nurse is assessing a patient who 

In [2]:
# Use the %pip magic for every install: it targets the environment of the
# running kernel, whereas `!pip` runs whichever pip is first on the shell PATH.
%pip install llama-index-legacy
%pip install llama-index-finetuning
%pip install llama-index-llms-openai
%pip install huggingface_hub --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
# Training split CSV (the same path the train/test split cell writes to).
train_csv_path = '../data/finetuning/train_dataset.csv'
train_dataset = pd.read_csv(train_csv_path)
class CrossEncoderFinetuningDatasetSample:
    """One labelled (query, context) pair for cross-encoder fine-tuning.

    Attributes:
        query: The query text.
        context: The passage paired with the query.
        score: The label for the pair, stored exactly as given.
    """

    def __init__(self, query, context, score):
        # Keep the three fields unchanged; no validation or conversion.
        self.query, self.context, self.score = query, context, score
# Wrap every training row in a sample object. Columns are read by position:
# 0, 1 and 2 are passed as query, context and score respectively.
finetuning_dataset = []
for row in train_dataset.to_numpy():
    finetuning_dataset.append(
        CrossEncoderFinetuningDatasetSample(row[0], row[1], row[2])
    )
print(len(finetuning_dataset))

from llama_index.legacy.finetuning.cross_encoders.cross_encoder import CrossEncoderFinetuneEngine

# Build the fine-tuning engine over the wrapped samples.
finetuning_engine = CrossEncoderFinetuneEngine(
    dataset=finetuning_dataset,
    epochs=2,
    batch_size=8,
)

# Run the fine-tuning.
finetuning_engine.finetune()

1560




Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/195 [00:00<?, ?it/s]

Iteration:   0%|          | 0/195 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [22]:
from huggingface_hub import notebook_login

# Show the Hugging Face login prompt inside the notebook.
# NOTE(review): presumably required before the later push_to_hub call — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
# Hugging Face Hub repository that receives the fine-tuned model.
hub_repo_id = "ethan-cyj/Cross-Encoder-Finetuned"
finetuning_engine.push_to_hub(repo_id=hub_repo_id)

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

In [26]:
from sentence_transformers import CrossEncoder

# The two checkpoints to compare: the public MS MARCO cross-encoder and the
# model stored under the repo id used for the hub upload.
base_model_id = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
finetuned_model_id = 'ethan-cyj/Cross-Encoder-Finetuned'

base_model = CrossEncoder(base_model_id)
finetuned_model = CrossEncoder(finetuned_model_id)




config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_csv('../data/finetuning/test_dataset.csv')

# Score every (query, context) pair with one batched predict call per model
# rather than one call per row inside an iterrows loop. The scores are the
# same up to floating-point noise from batching; only the number of model
# invocations changes.
pairs = list(zip(df['query'], df['context']))
base_scores = base_model.predict(pairs)
finetuned_scores = finetuned_model.predict(pairs)

# One record per test row, holding both models' scores next to the label.
results = [
    {
        'query': query,
        'context': context,
        'label': label,
        'base_score': base_score,
        'finetuned_score': finetuned_score,
    }
    for (query, context), label, base_score, finetuned_score
    in zip(pairs, df['score'], base_scores, finetuned_scores)
]

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Assume a threshold for classification
# NOTE(review): the threshold is applied to whatever predict() returns.
# Whether that is a probability or a raw logit depends on each model's
# configured activation — confirm that 0.5 is meaningful for both models.
threshold = 0.5
results_df['base_pred'] = (results_df['base_score'] >= threshold).astype(int)
results_df['finetuned_pred'] = (results_df['finetuned_score'] >= threshold).astype(int)


def _classification_metrics(labels, preds, prefix):
    """Return accuracy/precision/recall/F1 for one model, keyed '<prefix> <Metric>'."""
    # zero_division=0 reports 0 without a warning when a model predicts no
    # positives (or the labels contain none), instead of warning.
    return {
        f'{prefix} Accuracy': accuracy_score(labels, preds),
        f'{prefix} Precision': precision_score(labels, preds, zero_division=0),
        f'{prefix} Recall': recall_score(labels, preds, zero_division=0),
        f'{prefix} F1': f1_score(labels, preds, zero_division=0),
    }


# Calculate metrics (same keys and order as before: Base first, then Finetuned)
metrics = {
    **_classification_metrics(results_df['label'], results_df['base_pred'], 'Base'),
    **_classification_metrics(results_df['label'], results_df['finetuned_pred'], 'Finetuned'),
}

# Display metrics
print(metrics)



{'Base Accuracy': 0.725, 'Base Precision': 1.0, 'Base Recall': 0.45, 'Base F1': 0.6206896551724138, 'Finetuned Accuracy': 0.875, 'Finetuned Precision': 0.9411764705882353, 'Finetuned Recall': 0.8, 'Finetuned F1': 0.8648648648648649}
