# Testing the GPT Fine-Tuned Model

## Connect to GDrive

In [None]:
# Mount Google Drive at /content/drive so the project files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Project folder, relative to the root of the mounted Drive
path = '_NLP/Project'

# Switch the working directory to the project folder, then echo it as the cell output
project_dir = f'/content/drive/MyDrive/{path}'
os.chdir(project_dir)
os.getcwd()

'/content/drive/MyDrive/_NLP/Project'

## Install requirements

In [None]:
# This notebook calls `openai.Completion.create`, the legacy (pre-1.0) SDK interface,
# so any preinstalled openai package is removed and version 0.28 is pinned instead.
!pip uninstall -y openai
!pip install openai==0.28
# Remaining dependencies are installed unpinned: dataset loading, metrics/embeddings, tokenization.
!pip install datasets
!pip install scikit-learn sentence-transformers
!pip install nltk

[0mCollecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.0


In [None]:
import json
import os
from getpass import getpass

import nltk
import numpy as np
import openai
import pandas as pd
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

## Add API key

In [None]:
# Never store a real key in the notebook: it would be saved (and shared) with the file.
# The key is read from the OPENAI_API_KEY environment variable when set; otherwise the
# user is prompted for it without the input being echoed.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
openai.api_key = api_key

## Split dataset

In [None]:
# Read the local Arrow file; its rows are exposed under the 'train' split
dataset = load_dataset('arrow', data_files='data-00000-of-00001.arrow')

# Convert that split to a pandas DataFrame and drop its last column
df = dataset['train'].to_pandas().iloc[:, :-1]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Preview the first five held-out rows
test_data.head(n=5)

Unnamed: 0,input,output
27911,What are some physical signs that may indicate...,What are some physical signs that may indicate...
7251,What is the name of the amino acid that serves...,Arginine is the amino acid that acts as the pr...
32050,Do high or low potency typical antipsychotics ...,High potency typical antipsychotics are more l...
7969,Which type of heart valves are commonly affect...,Viridans streptococci infection is typically s...
6904,"Among all bugs, which one is the most frequent...",Staphylococcus aureus is the bug that is the m...


## Generate answers from the model

In [None]:
def generate_answer(
    question,
    model='ft:davinci-002:personal::9KLi6nKN',
    max_tokens=100,
    top_p=0.9,
    frequency_penalty=2,
    presence_penalty=1,
):
    """Ask the fine-tuned completion model to answer a single question.

    The prompt is the question followed by the " ->" separator, and generation
    stops at the first newline.

    Args:
        question: Question text to send to the model.
        model: Identifier of the completion model to query.
        max_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling probability mass.
        frequency_penalty: Value forwarded as the API's `frequency_penalty`.
        presence_penalty: Value forwarded as the API's `presence_penalty`.

    Returns:
        The text of the first completion choice, with leading and trailing
        whitespace removed.
    """
    prompt = question + " ->"
    response = openai.Completion.create(
        model=model,
        prompt=prompt,
        max_tokens=max_tokens,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        stop=["\n"],
    )
    # The raw completion starts with the space that follows the " ->" separator;
    # strip it so string-based comparisons with the references are not thrown off.
    return response.choices[0].text.strip()

# Number of test questions sent to the model (one API call per question).
N_EVAL_QUESTIONS = 500

# Only the first N_EVAL_QUESTIONS rows are evaluated, so progress is reported
# against that subset rather than against the whole test set.
eval_questions = test_data["input"].head(N_EVAL_QUESTIONS)
total_questions = len(eval_questions)

predictions = []
count = 0
for count, question in enumerate(eval_questions, start=1):
    predictions.append(generate_answer(question))
    # Report only after the answer has actually been generated
    print(f"Processed question {count} out of {total_questions}")

Processed question 1 out of 6791
Processed question 2 out of 6791
Processed question 3 out of 6791
Processed question 4 out of 6791
Processed question 5 out of 6791
Processed question 6 out of 6791
Processed question 7 out of 6791
Processed question 8 out of 6791
Processed question 9 out of 6791
Processed question 10 out of 6791
Processed question 11 out of 6791
Processed question 12 out of 6791
Processed question 13 out of 6791
Processed question 14 out of 6791
Processed question 15 out of 6791
Processed question 16 out of 6791
Processed question 17 out of 6791
Processed question 18 out of 6791
Processed question 19 out of 6791
Processed question 20 out of 6791
Processed question 21 out of 6791
Processed question 22 out of 6791
Processed question 23 out of 6791
Processed question 24 out of 6791
Processed question 25 out of 6791
Processed question 26 out of 6791
Processed question 27 out of 6791
Processed question 28 out of 6791
Processed question 29 out of 6791
Processed question 30 o

## Compute some metrics

In [None]:
# Get the reference answers from the dataset.
# Take exactly as many rows as there are predictions (same row order), so the two
# lists stay aligned pair-by-pair even if generation covered fewer rows than planned.
references = test_data["output"].head(len(predictions)).tolist()

In [None]:
# Pre-trained sentence-embedding model used to compare answers
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Embed the reference answers first, then the model predictions
reference_embeddings, prediction_embeddings = (
    model.encode(texts) for texts in (references, predictions)
)



In [None]:
# Debug aid: both arrays should report the same shape
print(f"Reference embeddings shape: {np.shape(reference_embeddings)}")
print(f"Prediction embeddings shape: {np.shape(prediction_embeddings)}")

Reference embeddings shape: (500, 384)
Prediction embeddings shape: (500, 384)


### Cosine Similarity

In [None]:
# One similarity value per (reference, prediction) embedding pair;
# cosine_similarity returns a 1x1 matrix here, so take its single entry
cosine_similarities = [
    cosine_similarity([ref_emb], [pred_emb])[0][0]
    for ref_emb, pred_emb in zip(reference_embeddings, prediction_embeddings)
]

In [None]:
# Mean of the per-pair similarities
average_cosine_similarity = np.asarray(cosine_similarities).mean()
print(f'Average Cosine Similarity: {average_cosine_similarity:.2f}')

Average Cosine Similarity: 0.78


### BLEU Score

In [None]:
def calculate_bleu(reference, prediction):
    """Score one predicted answer against one reference answer with sentence-level BLEU.

    Both strings are tokenized with `nltk.word_tokenize`; the reference is passed
    as the only entry of the reference list.

    Args:
        reference: Ground-truth answer text.
        prediction: Generated answer text.

    Returns:
        The value returned by `sentence_bleu` for this pair.
    """
    hypothesis = nltk.word_tokenize(prediction)
    reference_list = [nltk.word_tokenize(reference)]
    # method1 smoothing avoids zero scores for short sequences
    return sentence_bleu(
        reference_list,
        hypothesis,
        smoothing_function=SmoothingFunction().method1,
    )

In [None]:
# Ensure the tokenizer data needed by nltk.word_tokenize is downloaded.
# NLTK is installed unpinned: older releases load 'punkt', while recent releases
# look up 'punkt_tab' instead, so both are fetched.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# BLEU for every (reference, prediction) pair, in order
bleu_scores = list(map(calculate_bleu, references, predictions))

In [None]:
# Mean BLEU over all scored pairs
bleu_total = sum(bleu_scores)
average_bleu_score = bleu_total / len(bleu_scores)
print(f'Average BLEU Score: {average_bleu_score:.2f}')

Average BLEU Score: 0.15


### Exact Match

In [None]:
def exact_match(prediction, ground_truth):
    """Tell whether a predicted answer equals the reference answer.

    Whitespace is normalized before comparing (leading/trailing whitespace is
    dropped and internal runs are collapsed to one space). Without this, a
    completion that begins with a space can never match its reference, even
    when the wording is identical. The comparison stays case- and
    punctuation-sensitive.

    Args:
        prediction: Generated answer text.
        ground_truth: Reference answer text.

    Returns:
        True if both texts are identical after whitespace normalization.
    """
    def _normalize(text):
        # str.split() with no argument splits on any whitespace run and drops the ends
        return " ".join(text.split())

    return _normalize(prediction) == _normalize(ground_truth)

# Share of predictions that exactly match their reference
# (predictions and references are assumed to be parallel lists of answers)
em_score = sum(map(exact_match, predictions, references)) / len(predictions)
print(f'Exact Match Score: {em_score:.2f}')

Exact Match Score: 0.00


### F1 Score

In [None]:
from collections import Counter

def f1_score(prediction, ground_truth):
    """Token-level F1 between a predicted answer and its reference.

    Both texts are split on whitespace; the overlap counts each shared token
    as many times as it appears in both texts.

    Args:
        prediction: Generated answer text.
        ground_truth: Reference answer text.

    Returns:
        The harmonic mean of token precision and recall, or 0.0 when the two
        texts share no token.
    """
    predicted = prediction.split()
    expected = ground_truth.split()

    # Multiset intersection keeps the minimum count of every shared token
    overlap = sum((Counter(predicted) & Counter(expected)).values())
    if not overlap:
        return 0.0

    precision, recall = overlap / len(predicted), overlap / len(expected)
    return 2 * (precision * recall) / (precision + recall)

# Token-level F1 for every (prediction, reference) pair, then the mean
f1_scores = list(map(f1_score, predictions, references))
f1_total = sum(f1_scores)
average_f1_score = f1_total / len(f1_scores)
print(f'Average F1 Score: {average_f1_score:.2f}')

Average F1 Score: 0.36


# Manual check with a random question from the test dataset

In [None]:
import random

# Pick one row of test_data (assumed to be a pandas DataFrame) at a random position
random_index = random.randrange(len(test_data))
random_row = test_data.iloc[random_index]
actual_answer = random_row["output"]
prompt = random_row["input"] + " ->"

# Query the fine-tuned model for this single prompt; generation stops at the first newline
request_params = dict(
    model='ft:davinci-002:personal::9KLi6nKN',
    prompt=prompt,
    max_tokens=100,
    top_p=1,
    frequency_penalty=1,
    presence_penalty=1,
    stop=["\n"],
)
response = openai.Completion.create(**request_params)
bot_answer = response.choices[0].text

# Show the prompt, the reference answer and the model's answer for comparison
print('*************************************')
print('Question: ', prompt)
print('Actual Answer:', actual_answer)
print('Bot Answer: ', bot_answer)


*************************************
Question:  How can low doses of dopamine impact renal blood flow (RBF)? ->
Actual Answer: Low doses of dopamine can cause an increase in RBF.
Bot Answer:   Low doses of dopamine can have either an increase or no effect on RBF..
