# Set LLM API

In [None]:
!pip install langchain
!pip install openai

In [None]:
!pip install pipreqs
!pip install pipreqsnb

In [None]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
import pandas as pd
import openai
import os
from difflib import SequenceMatcher
import time
import re
import math
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import ranksums
import json
from collections import defaultdict

In [None]:
# OpenAI API key configuration.
# The key is read from the OPENAI_API_KEY environment variable so that it is never
# committed with the notebook and so that an already-exported key is not overwritten
# with an empty string. To set it interactively, assign `openai_key` here instead.
openai_key = os.environ.get('OPENAI_API_KEY', '')
os.environ['OPENAI_API_KEY'] = openai_key
openai.api_key = openai_key

In [None]:
def get_completion(prompt, model="gpt-3.5-turbo", temperature=0.2):
    """Send a single-turn chat prompt to the OpenAI API and return the reply text.

    Args:
        prompt: The user message to send.
        model: Name of the chat model to query.
        temperature: Degree of randomness of the model's output (defaults to the
            value that was previously hard-coded, 0.2).

    Returns:
        The content of the first choice's message.
    """
    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
    }

    if hasattr(openai, "OpenAI"):
        # openai>=1.0 removed `openai.ChatCompletion`; requests go through a client
        # object and the reply is an object with attributes rather than a mapping.
        # Passing api_key=None lets the client fall back to OPENAI_API_KEY.
        client = openai.OpenAI(api_key=getattr(openai, "api_key", None))
        response = client.chat.completions.create(**request)
        return response.choices[0].message.content

    # Legacy (openai<1.0) module-level interface.
    response = openai.ChatCompletion.create(**request)
    return response.choices[0].message["content"]

DataFrame

In [None]:
# Load the annotation workbook. The context manager closes the underlying file
# handle once both sheets have been parsed (the bare ExcelFile was never closed).
with pd.ExcelFile('Data/Annotation_1.xlsx') as df1:
    # Get the sheet names
    sheet_names = df1.sheet_names

    # Create dataframes with specific names; NaN cells are replaced with 0.
    df_sentences = df1.parse(sheet_names[0]).fillna(0)   # Assuming the first sheet contains sentences
    df_paragraphs = df1.parse(sheet_names[1]).fillna(0)  # Assuming the second sheet contains paragraphs

In [None]:
def response_to_dataframe(response):
    """Parse a raw model response into a DataFrame of (sentence, label) rows.

    Each line is expected to look like ``sentence ::: labels``. The line is split
    on the first ``:::`` only; lines without the separator are reported and skipped.
    Surrounding spaces and double quotes are removed from the sentence, and
    surrounding whitespace from the labels.

    Args:
        response: The full text returned by the model.

    Returns:
        A DataFrame with the columns "sentence" and "label".
    """
    rows = []
    for raw_line in response.strip().split('\n'):
        sentence, separator, labels = raw_line.partition(':::')
        if not separator:
            # No ':::' on this line, so it cannot be a sentence/label pair.
            print(f"Skipped line (improper format): {raw_line}")  # Log the issue for visibility
            continue

        sentence = sentence.strip(' "')
        print("sentence     :",sentence)
        labels = labels.strip()
        print("labels     :",labels)
        rows.append((sentence, labels))

    return pd.DataFrame(rows, columns=["sentence", "label"])

# Query the model once per paragraph and collect both the raw responses and the
# parsed (sentence, label) rows.
#
# NOTE(review): the original loop iterated over `df`, which is not defined anywhere
# in this notebook. `df_paragraphs` is the frame loaded above for paragraphs, so it
# is used here -- confirm that it carries the 'Section' and 'paragraph_ids' columns.
#
# Results are accumulated in plain lists and turned into DataFrames once at the end
# (growing a DataFrame inside the loop is quadratic). If the loop is interrupted,
# the partial results remain available in `response_records` / `parsed_frames`.
response_records = []  # (response, paragraph_id) per paragraph
parsed_frames = []     # one parsed DataFrame per paragraph

for index, row in df_paragraphs.iterrows():
    paragraph = row['Section']
    paragraph_id = row['paragraph_ids']

    # Construct the prompt
    prompt = f"""
    We are processing a regulatory document on food safety. The following concepts are of interest to us and ONLY these concepts should be used in your labels:
    - Data: any information used to convey knowledge, provide assurance or perform analysis;
    - Label Data: information that a food-product package or container must bear.
    - Non-label Data: any food-safety-relevant data other than label data that needs to be collected and/or retained for inclusion in documents such as certificates, reports, guarantees and letters.
    - Measurement: Association of numbers with physical quantities;
    - Colour: (self-evident);
    - Firmness: degree of resistance to deformation;
    - Mass: amount of substance by weight or volume;
    - Pathogen: a microorganism that causes disease;
    - Size: dimension (e.g., length or thickness) or surface area.
    - Temperature: (self-evident).
    - Water Content: humidity or moisture.
    - Time Constraint: A temporal restriction, in our context, is expressed using intervals, deadlines or periodicity.
    Not all concepts are necessarily present. Extract relevant text segments and their associated concepts from the paragraph below and write nothing else!
    The output should be in the following format: 'sentence ::: labels'.
    Paragraph: ```{paragraph}```
    """
    response = get_completion(prompt)
    response_records.append((response, paragraph_id))

    # Convert the response to a dataframe and tag it with its paragraph id
    current_df = response_to_dataframe(response)
    current_df["paragraph_id"] = paragraph_id
    parsed_frames.append(current_df)

# Raw responses, one row per paragraph
response_df = pd.DataFrame(response_records, columns=["response", "paragraph_id"])

# All parsed sentences; fall back to an empty, correctly-shaped frame when there
# were no paragraphs to process.
if parsed_frames:
    final_df = pd.concat(parsed_frames, ignore_index=True)
else:
    final_df = pd.DataFrame(columns=["sentence", "label", "paragraph_id"])

1. Prepare the ground truth data.
You have a ground truth DataFrame with columns "sentence", "label", and "paragraph_id".

2. Iterate over each extracted sentence, find the best matching ground truth sentence, and compare labels.



In [None]:
# Give the two data sub-type columns the same spelling as the label names used in the prompt.
ground_truth_column_names = {
    'LabelData': 'label data',
    'Non-labelData': 'non-label data',
}
df_sentences = df_sentences.rename(columns=ground_truth_column_names)

In [None]:
# Cast the paragraph id column of each frame to int so the two can be compared.
# Note the differing column names: 'paragraph_ids' in final_df, 'paragraph_id' in df_sentences.
for frame, id_column in ((final_df, 'paragraph_ids'), (df_sentences, 'paragraph_id')):
    frame[id_column] = frame[id_column].astype(int)

In [None]:
# Lower-case the ground-truth column names and the label names so they line up.
df_sentences.columns = df_sentences.columns.map(str.lower)
labels_list = [
    name.lower()
    for name in (
        "Data", "Label Data", "Non-label Data", "Measurement", "Colour", "Firmness",
        "Mass", "Pathogen", "Size", "Temperature", "Water Content", "Time Constraint",
        "Overall",
    )
]

In [None]:
# Cast every ground-truth label column to int; columns that still contain NaN are
# reported and left untouched.
for label in labels_list:
    label_column = df_sentences[label]
    if label_column.isna().any():
        print(f"NaN values detected in '{label}' column!")
        continue
    df_sentences[label] = label_column.astype(int)

In [None]:
# Drop every literal occurrence of the instruction text that the model sometimes echoes back.
echoed_instruction = "The output should be: "
final_df['sentence'] = final_df['sentence'].str.replace(echoed_instruction, "", regex=False)

In [None]:
!pip install python-Levenshtein

In [None]:
import Levenshtein

A function to lowercase the first letter of a string

In [None]:
def lowercase_first_letter(s):
    """Return ``s`` with its first character lower-cased.

    Falsy input (for example an empty string) is returned unchanged.
    """
    return s[0].lower() + s[1:] if s else s

# Quick demonstration of the helper
string = "Hello, World!"
result = lowercase_first_letter(string)
print(result)  # Outputs: hello, World!

N-gram Jaccard similarity

In [None]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Duplicates are ignored (both inputs are converted to sets).

    Args:
        list1: First collection of hashable items.
        list2: Second collection of hashable items.

    Returns:
        A float in [0, 1]. When both collections are empty the ratio is
        undefined (0/0); 0.0 is returned instead of raising ZeroDivisionError.
        This happens when both texts are shorter than the n-gram size.
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        return 0.0
    return len(s1 & s2) / len(union)

def ngrams(input, n):
    """Return the word n-grams of a text, in order of appearance.

    The text is split on single spaces; each n-gram is the space-joined run of
    ``n`` consecutive tokens. A text with fewer than ``n`` tokens yields an empty list.
    """
    tokens = input.split(' ')
    window_count = len(tokens) - n + 1
    return [' '.join(tokens[start:start + n]) for start in range(window_count)]

In [None]:
def get_best_match(sentence, indexed_sentences_list,paragraph_id,Levenshtein_threshold,ngram_threshold,ngram_n=3):
    """Find the ground-truth sentence that an extracted sentence corresponds to.

    The extracted sentence is first cleaned (surrounding quotes, a leading
    hyphen with the first double quote after it, and a trailing ';and' / ';or'
    are removed). The following strategies are then tried in order; the first
    hit wins:

    1. the cleaned sentence is a substring of a candidate;
    2. the same check with the sentence's first letter lower-cased;
    3. a candidate is a substring of the cleaned sentence;
    4. the candidate with the smallest Levenshtein distance, if its normalised
       similarity is strictly greater than ``Levenshtein_threshold``;
    5. substring match after removing every character that is neither
       alphanumeric nor whitespace;
    6. the candidate with the highest word n-gram Jaccard similarity, if that
       similarity is at least ``ngram_threshold``.

    When nothing matches, diagnostics are printed and appended to 'output.txt'.

    Args:
        sentence: Extracted sentence text.
        indexed_sentences_list: Iterable of ``(index, ground_truth_sentence)`` pairs.
        paragraph_id: Only used in the diagnostics for unmatched sentences.
        Levenshtein_threshold: Minimum (exclusive) similarity for strategy 4.
        ngram_threshold: Minimum (inclusive) similarity for strategy 6.
        ngram_n: Size of the word n-grams used in strategy 6.

    Returns:
        ``(index, ground_truth_sentence)`` of the match, or ``(None, None)``.
        A tuple is always returned: previously a similarity exactly equal to
        ``Levenshtein_threshold`` fell through both branches and returned a
        bare ``None``, and an empty candidate list raised a TypeError.
    """
    # Materialise once: the candidates are scanned several times below.
    candidates = list(indexed_sentences_list)

    # Strip single/double quotes from the beginning and end of the sentence
    sentence = sentence.strip("'\"")

    if sentence.startswith('-'):
        sentence = sentence[1:].lstrip()  # Remove the hyphen and any leading spaces
        sentence = sentence.replace('"', '', 1)  # Remove the first occurrence of "

    # Remove ;and or ;or from the end of the sentence
    if sentence.endswith(';and'):
        sentence = sentence[:-4]
    elif sentence.endswith(';or'):
        sentence = sentence[:-3]

    # 1. extracted sentence contained in a ground-truth sentence
    for idx, s in candidates:
        if sentence in s:
            return idx, s

    # 2. same, ignoring the capitalisation of the first letter
    lowered_sentence = lowercase_first_letter(sentence)
    for idx, s in candidates:
        if lowered_sentence in s:
            return idx, s

    # 3. ground-truth sentence contained in the extracted sentence
    for idx, s in candidates:
        if s in sentence:
            return idx, s

    # 4. closest candidate by Levenshtein distance (first minimum wins)
    best_match = None
    best_Levenshtein_match_idx = None
    min_distance = float('inf')
    for idx, s in candidates:
        distance = Levenshtein.distance(lowered_sentence, s)
        if distance < min_distance:
            min_distance = distance
            best_match = s
            best_Levenshtein_match_idx = idx

    similarity_ratio = 0.0  # reported as-is when there are no candidates at all
    if best_match is not None:
        # Convert the distance to a similarity ratio
        longest = max(len(lowered_sentence), len(best_match))
        similarity_ratio = 1 - min_distance / longest if longest else 1.0
        if similarity_ratio > Levenshtein_threshold:
            return best_Levenshtein_match_idx, best_match

    # 5. substring match on punctuation-free text
    def _alnum_and_spaces(text):
        return ''.join(ch for ch in text if ch.isalnum() or ch.isspace()).strip()

    clean_sentence = _alnum_and_spaces(sentence)
    for idx, s in candidates:
        if clean_sentence in _alnum_and_spaces(s):
            return idx, s

    # 6. best word n-gram Jaccard similarity
    best_ngram_similarity = 0
    best_ngram_match = None
    best_ngram_match_idx = None

    sentence_ngrams = ngrams(sentence, ngram_n)
    for idx, s in candidates:
        s_ngrams = ngrams(s, ngram_n)
        if not sentence_ngrams and not s_ngrams:
            # Both texts are shorter than ngram_n words: the Jaccard ratio would be 0/0.
            continue
        similarity = jaccard_similarity(sentence_ngrams, s_ngrams)
        if similarity > best_ngram_similarity:
            best_ngram_similarity = similarity
            best_ngram_match = s
            best_ngram_match_idx = idx

    if best_ngram_match is not None and best_ngram_similarity >= ngram_threshold:
        return best_ngram_match_idx, best_ngram_match

    # Nothing matched: report to the console and to the log file.
    print(f"Paragraph ID: {paragraph_id}")
    print(f"Unmatched sentence: {sentence}")
    print(f"Best Levenshtein match with similarity ratio: {similarity_ratio} was: {best_match}")
    print(f"Best n-gram match with similarity: {best_ngram_similarity} was: {best_ngram_match}")
    print("-----")

    with open('output.txt', 'a') as file:
        file.write(f"Paragraph ID: {paragraph_id}\n")
        file.write(f"Unmatched sentence: {sentence}\n")
        file.write(f"Best match with similarity ratio: {similarity_ratio} was: {best_match}\n")
        file.write("-----\n")
    return None, None


In [None]:
# Per-label confusion counters filled by the evaluation loop below.
counters = {label: {"TP": 0, "FP": 0, "FN": 0} for label in labels_list}

# For every ground-truth sentence, the labels that have not been found yet.
# Keys are (row index, statement) pairs; matched labels are removed as they are found,
# so whatever is left at the end counts as a false negative.
missing_labels_dict = {
    (index, statement): [label for label in labels_list if df_sentences.loc[index, label] == 1]
    for index, statement in df_sentences['statement'].items()
}

# Sanity check: the dictionary must hold exactly the positives of each label column.
for label in labels_list:
    init_count = sum(1 for labels in missing_labels_dict.values() if label in labels)
    assert init_count == df_sentences[label].sum(), f"Mismatch for {label} during initialization"

for index, row in final_df.iterrows():
    extracted_sentence = row['sentence']
    paragraph_id = row['paragraph_ids']

    # Filter the ground truth dataframe by the current paragraph_id
    filtered_gt_df = df_sentences[df_sentences['paragraph_id'] == paragraph_id]

    # Prepare a list of tuples (index, statement)
    indexed_statements = list(filtered_gt_df[['statement']].itertuples(index=True, name=None))

    # Find the best matching ground truth sentence and its index
    matched_index, matched_sentence = get_best_match(extracted_sentence, indexed_statements, paragraph_id,Levenshtein_threshold=0.90,ngram_threshold=0.90)

    if matched_index is None or matched_sentence is None:
        # No ground-truth counterpart: every predicted label is a false positive.
        for label in labels_list:
            if row[label] == 1:
                counters[label]["FP"] += 1
        continue

    # Look the row up by the matched index rather than by statement text: when a
    # paragraph contains the same statement twice, a text lookup with .iloc[0]
    # always returned the first occurrence, not the row that was matched.
    ground_truth_row = filtered_gt_df.loc[matched_index]
    still_missing = missing_labels_dict[(matched_index, matched_sentence)]

    for label in labels_list:
        if row[label] != 1:
            continue

        if ground_truth_row[label] == 1:
            # True positive, counted once per ground-truth label: a label that an
            # earlier extracted sentence already claimed is not counted again.
            if label in still_missing:
                counters[label]["TP"] += 1
                still_missing.remove(label)
        elif ground_truth_row[label] == 0:
            # Extracted is 1, Ground Truth is 0: False Positive
            counters[label]["FP"] += 1

# FN count is incremented for missing labels in the gt sentences.
for gt_key, missing_labels in missing_labels_dict.items():
    for label in missing_labels:
        counters[label]["FN"] += 1

def compute_scores(tp, fp, fn):
    """Compute precision, recall and F-score from confusion counts.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        A ``(precision, recall, fscore)`` tuple. Any metric whose denominator
        is not positive is reported as 0.
    """
    predicted_positives = tp + fp
    actual_positives = tp + fn

    precision = 0
    if predicted_positives > 0:
        precision = tp / predicted_positives

    recall = 0
    if actual_positives > 0:
        recall = tp / actual_positives

    fscore = 0
    if precision + recall > 0:
        fscore = 2 * (precision * recall) / (precision + recall)

    return precision, recall, fscore

# Precision / recall / F-score per label, derived from the confusion counters.
scores = {}
for label, counts in counters.items():
    precision, recall, fscore = compute_scores(counts["TP"], counts["FP"], counts["FN"])
    scores[label] = dict(zip(("Precision", "Recall", "F-score"), (precision, recall, fscore)))

In [None]:
# Append the raw counters and the derived scores to the log, each followed by a
# blank line, then a separator line.
with open('output.txt', 'a') as f:
    f.write(f"{counters}\n\n{scores}\n\n-----\n")

In [None]:
# Report every ground-truth sentence that still has unmatched labels, both on the
# console and in the log file. The log is opened once for the whole loop instead
# of being reopened for every sentence.
with open('output.txt', 'a') as f:
    for (idx, sentence), labels in missing_labels_dict.items():
        if not labels:  # nothing is missing for this sentence
            continue

        try:
            # Retrieve the paragraph_id from df_sentences using the index
            paragraph_id = df_sentences.loc[idx, 'paragraph_id']
        except KeyError:
            print(f"ERROR: Index not found in df_sentences: {idx}")
            continue

        report = "\n".join([
            f"Paragraph ID: {paragraph_id}",
            f"Sentence: {sentence}",
            f"Missing Labels: {labels}",
            "-----",
        ])
        f.write(report + "\n")
        print(report)


In [None]:
# Number of labels still missing for each ground-truth sentence
missing_label_counts = [len(labels) for labels in missing_labels_dict.values()]

if missing_label_counts:
    # Range and mean of the per-sentence counts
    min_missing = min(missing_label_counts)
    max_missing = max(missing_label_counts)
    avg_missing = sum(missing_label_counts) / len(missing_label_counts)

    print(f"Range of missing labels: {min_missing} to {max_missing}")
    print(f"Average missing labels per sentence: {avg_missing:.2f}")
else:
    # min()/max() raise on an empty sequence and the mean would divide by zero.
    print("No ground-truth sentences available; nothing to summarise.")


#### Test the created Response and final_df files

In [None]:
# Saved model responses: one text file, records separated by "++++++".
# NOTE(review): the file name is spelled 'Resposne.txt' in the code -- confirm that
# this matches the file on disk.
with open('Resposne.txt', 'r') as file:
    content_response = file.read()
responses = [chunk.strip() for chunk in content_response.split('++++++')]
response_df = pd.DataFrame({'Response': responses})

# Matching paragraph ids, stored the same way in a second file.
with open('paragraphid.txt', 'r') as file:
    content_paragraph_id = file.read()
paragraph_ids = [chunk.strip() for chunk in content_paragraph_id.split('++++++')]
paragraph_id_df = pd.DataFrame({'Paragraph_ID': paragraph_ids})

# Put responses and ids side by side, aligned on row position.
combined_df = pd.concat([response_df, paragraph_id_df], axis=1)

print(combined_df)

In [None]:
def response_to_dataframe(df_input):
    """Parse saved model responses into one row per extracted sentence.

    NOTE: this re-defines ``response_to_dataframe`` from earlier in the notebook
    with a different input (a DataFrame instead of a single response string);
    whichever cell ran last is the active definition.

    Each response is split into lines of the form ``sentence ::: labels``.
    Lines that do not split into exactly two parts on ``:::`` are reported and
    skipped. Sentence and labels are both stripped of surrounding whitespace,
    so the space in front of ``:::`` does not stay attached to the sentence.

    Args:
        df_input: DataFrame with the columns 'Response' (raw model output) and
            'Paragraph_ID'.

    Returns:
        A DataFrame with the columns "sentence", "labels" and "paragraph_ids".
    """
    extracted_data = []

    for response, paragraph_id in zip(df_input['Response'], df_input['Paragraph_ID']):
        for line in response.strip().split('\n'):  # Process all lines
            parts = line.strip().split(':::')
            if len(parts) != 2:
                # Either no ':::' at all or more than one.
                print(f"Skipped line (improper format): {line}")  # Log the issue for visibility
                continue
            sentence, labels = parts
            extracted_data.append((sentence.strip(), labels.strip(), paragraph_id))

    return pd.DataFrame(extracted_data, columns=["sentence", "labels", "paragraph_ids"])

In [None]:
# Parse the saved responses into one row per extracted sentence.
final_df=response_to_dataframe(combined_df)

Read data from output final_df and create its dataframe here:

In [None]:
def _read_delimited_records(path, delimiter='++++++'):
    """Read a text file of records separated by ``delimiter`` lines.

    Records are split on the delimiter followed by a newline and stripped of
    surrounding whitespace. A delimiter left glued to the final record (when
    the file ends with the delimiter) is removed as well.
    """
    with open(path, 'r') as file:
        content = file.read()
    records = [chunk.strip() for chunk in content.strip().split(delimiter + '\n')]
    return [record.replace('\n' + delimiter, '') for record in records]


# Sentences, labels and paragraph ids are stored in three parallel files.
sentences = _read_delimited_records('final_df.txt')
labels = _read_delimited_records('final_df_labels.txt')
paragraph_ids = _read_delimited_records('final_df_paragraphid.txt')

# Check the lengths before building the frame, so that a mismatch is reported
# with this message rather than as a pandas length error.
if not (len(sentences) == len(labels) == len(paragraph_ids)):
    raise ValueError("The files have inconsistent numbers of rows")

# Create a dataframe from the lists
final_df = pd.DataFrame({
    'sentence': sentences,
    'labels': labels,
    'paragraph_ids': paragraph_ids
})

# Display the dataframe
final_df

In [None]:
# Normalise the raw label strings: drop periods and quote characters, trim,
# lower-case, and remove the space that follows a comma.
cleaned_labels = final_df['labels'].str.replace("""[.'"]""", "", regex=True)
cleaned_labels = cleaned_labels.str.strip().str.lower()
final_df['labels'] = cleaned_labels.str.replace(", ", ",", regex=False)

# Display the cleaned dataframe
final_df

In [None]:
# Rows whose label string is empty once surrounding whitespace is removed
has_blank_label = final_df['labels'].str.strip() == ""
only_whitespace_rows = final_df[has_blank_label]

# Show the sentence and label columns of these rows
print(only_whitespace_rows[['sentence', 'labels']])

# Their positions in the main dataframe
row_indices = only_whitespace_rows.index.tolist()

print(row_indices)

In [None]:
# Turn the free-text 'labels' column of final_df into one 0/1 column per known label.
# NOTE: this reassigns `labels_list` without the "overall" entry.
labels_list = ["Data", "Label Data", "Non-label Data", "Measurement", "Colour", "Firmness", "Mass", "Pathogen", "Size", "Temperature", "Water Content", "Time Constraint"]

# Convert the labels list to lowercase for case-insensitivity (optional)
labels_list = [label.lower() for label in labels_list]

# Initialize all label columns with 0
for label in labels_list:
    final_df[label] = 0


for index, row in final_df.iterrows():
    # Split the labels on commas that are not inside parentheses
    labels = [label.strip().lower() for label in re.split(r',(?![^\(]*\))', row['labels'])]

    for label in labels:
        # Case 1: the label is one of the known labels as-is
        if label in labels_list:
            final_df.at[index, label] = 1


        # Case 2: "main (sub, sub ...)" -- a main label followed by parenthesised subtypes
        elif '(' in label:
            # Check for main label (the text in front of the first parenthesis)
            main_label = re.split(r'[(]', label)[0].strip()
            if not main_label:
              # Nothing in front of the parenthesis; only reported, processing continues
              print("cases like (...:...) happend", label, index)

            if main_label and main_label in labels_list:
                final_df.at[index, main_label] = 1

            # Extract subtypes within parentheses
            sub_labels = re.findall(r'\((.*?)\)', label)
            for sub_group in sub_labels:
                # Split the parenthesised text on commas (colons are not split here)
                subtypes = [sub.strip() for sub in sub_group.split(',') if sub]
                for subtype in subtypes:
                    if subtype in labels_list:
                        final_df.at[index, subtype] = 1
                    else:
                        # Split by spaces to check individual words
                        for word in subtype.split():
                            if word in labels_list:
                                final_df.at[index, word] = 1
                            else:
                                print("sub word in the () not matched: ",word ,index)
        # Case 3: "main: sub" -- only the first two colon-separated parts are used
        elif ':' in label:
          main_label = re.split(r'[:]', label)[0].strip()
          sub_label = re.split(r'[:]', label)[1].strip()

          if main_label in labels_list:
            final_df.at[index, main_label] = 1
          if sub_label in labels_list:
            final_df.at[index, sub_label] = 1
        # Case 4: nothing recognisable
        else:
            print("label not matched: ",label, "at the index: ",index)

final_df

In [None]:
# Derive the aggregate 'overall' column and propagate subtype hits to their
# parent labels, using column-wise boolean masks instead of a row-by-row
# iterrows()/.at loop.
final_df['overall'] = 0

labels_list = ["Data", "Label Data", "Non-label Data", "Measurement", "Colour", "Firmness", "Mass", "Pathogen", "Size", "Temperature", "Water Content", "Time Constraint"]
measurement_subtypes = ["Colour", "Firmness", "Mass", "Size", "Temperature", "Water Content","Pathogen"]
data_subtypes = ["Label Data", "Non-label Data"]

# Lower-case everything so the names match the label columns of final_df
labels_list = [label.lower() for label in labels_list]
measurement_subtypes = [label.lower() for label in measurement_subtypes]
data_subtypes = [label.lower() for label in data_subtypes]

# 'overall' is 1 when any label column is set
has_any_label = final_df[labels_list].eq(1).any(axis=1)
final_df.loc[has_any_label, 'overall'] = 1

# 'measurement' is implied by any of its subtypes
has_measurement_subtype = final_df[measurement_subtypes].eq(1).any(axis=1)
final_df.loc[has_measurement_subtype, 'measurement'] = 1

# 'data' is implied by any of its subtypes
has_data_subtype = final_df[data_subtypes].eq(1).any(axis=1)
final_df.loc[has_data_subtype, 'data'] = 1

final_df

# Analysis of Results

Zero-shot Results

In [None]:
# Hard-coded zero-shot evaluation results. Each dict appended below maps a
# label name (plus 'overall') to its 'Precision', 'Recall' and 'F-score' values.
experiments_data = []
experiments_data.append({'data': {'Precision': 0.19672131147540983, 'Recall': 0.8450704225352113, 'F-score': 0.3191489361702128}, 'label data': {'Precision': 0.2556818181818182, 'Recall': 0.8333333333333334, 'F-score': 0.3913043478260869}, 'non-label data': {'Precision': 0.1308411214953271, 'Recall': 0.7368421052631579, 'F-score': 0.22222222222222218}, 'measurement': {'Precision': 0.5979381443298969, 'Recall': 0.4027777777777778, 'F-score': 0.48132780082987553}, 'colour': {'Precision': 0.38461538461538464, 'Recall': 0.4166666666666667, 'F-score': 0.4}, 'firmness': {'Precision': 0.16666666666666666, 'Recall': 0.5, 'F-score': 0.25}, 'mass': {'Precision': 0.6923076923076923, 'Recall': 0.19148936170212766, 'F-score': 0.30000000000000004}, 'pathogen': {'Precision': 0.18181818181818182, 'Recall': 1.0, 'F-score': 0.3076923076923077}, 'size': {'Precision': 0.3333333333333333, 'Recall': 0.2777777777777778, 'F-score': 0.303030303030303}, 'temperature': {'Precision': 0.8235294117647058, 'Recall': 1.0, 'F-score': 0.9032258064516129}, 'water content': {'Precision': 0.4, 'Recall': 0.5, 'F-score': 0.4444444444444445}, 'time constraint': {'Precision': 0.5, 'Recall': 0.17647058823529413, 'F-score': 0.2608695652173913}, 'overall': {'Precision': 0.4968152866242038, 'Recall': 0.7428571428571429, 'F-score': 0.5954198473282444}})
experiments_data.append({'data': {'Precision': 0.21140939597315436, 'Recall': 0.8873239436619719, 'F-score': 0.34146341463414637}, 'label data': {'Precision': 0.25842696629213485, 'Recall': 0.8518518518518519, 'F-score': 0.3965517241379311}, 'non-label data': {'Precision': 0.16483516483516483, 'Recall': 0.7894736842105263, 'F-score': 0.2727272727272727}, 'measurement': {'Precision': 0.5729166666666666, 'Recall': 0.3819444444444444, 'F-score': 0.4583333333333333}, 'colour': {'Precision': 0.4166666666666667, 'Recall': 0.4166666666666667, 'F-score': 0.4166666666666667}, 'firmness': {'Precision': 0.16666666666666666, 'Recall': 0.5, 'F-score': 0.25}, 'mass': {'Precision': 0.7272727272727273, 'Recall': 0.1702127659574468, 'F-score': 0.27586206896551724}, 'pathogen': {'Precision': 0.19047619047619047, 'Recall': 1.0, 'F-score': 0.32}, 'size': {'Precision': 0.2903225806451613, 'Recall': 0.25, 'F-score': 0.2686567164179105}, 'temperature': {'Precision': 0.875, 'Recall': 1.0, 'F-score': 0.9333333333333333}, 'water content': {'Precision': 1.0, 'Recall': 0.5, 'F-score': 0.6666666666666666}, 'time constraint': {'Precision': 0.6, 'Recall': 0.17647058823529413, 'F-score': 0.2727272727272727}, 'overall': {'Precision': 0.5031847133757962, 'Recall': 0.7523809523809524, 'F-score': 0.6030534351145038}})
experiments_data.append({'data': {'Precision': 0.17717717717717718, 'Recall': 0.8309859154929577, 'F-score': 0.2920792079207921}, 'label data': {'Precision': 0.225, 'Recall': 0.8333333333333334, 'F-score': 0.3543307086614173}, 'non-label data': {'Precision': 0.11304347826086956, 'Recall': 0.6842105263157895, 'F-score': 0.19402985074626866}, 'measurement': {'Precision': 0.5754716981132075, 'Recall': 0.4236111111111111, 'F-score': 0.488}, 'colour': {'Precision': 0.35714285714285715, 'Recall': 0.4166666666666667, 'F-score': 0.3846153846153846}, 'firmness': {'Precision': 0.2, 'Recall': 0.5, 'F-score': 0.28571428571428575}, 'mass': {'Precision': 0.8076923076923077, 'Recall': 0.22340425531914893, 'F-score': 0.35}, 'pathogen': {'Precision': 0.2, 'Recall': 1.0, 'F-score': 0.33333333333333337}, 'size': {'Precision': 0.30303030303030304, 'Recall': 0.2777777777777778, 'F-score': 0.28985507246376807}, 'temperature': {'Precision': 0.875, 'Recall': 1.0, 'F-score': 0.9333333333333333}, 'water content': {'Precision': 1.0, 'Recall': 0.5, 'F-score': 0.6666666666666666}, 'time constraint': {'Precision': 0.6, 'Recall': 0.17647058823529413, 'F-score': 0.2727272727272727}, 'overall': {'Precision': 0.46647230320699706, 'Recall': 0.7619047619047619, 'F-score': 0.5786618444846292}})
experiments_data.append({'data': {'Precision': 0.19135802469135801, 'Recall': 0.8732394366197183, 'F-score': 0.31392405063291134}, 'label data': {'Precision': 0.24210526315789474, 'Recall': 0.8518518518518519, 'F-score': 0.3770491803278688}, 'non-label data': {'Precision': 0.11711711711711711, 'Recall': 0.6842105263157895, 'F-score': 0.19999999999999998}, 'measurement': {'Precision': 0.5981308411214953, 'Recall': 0.4444444444444444, 'F-score': 0.5099601593625499}, 'colour': {'Precision': 0.45454545454545453, 'Recall': 0.4166666666666667, 'F-score': 0.43478260869565216}, 'firmness': {'Precision': 0.16666666666666666, 'Recall': 0.5, 'F-score': 0.25}, 'mass': {'Precision': 0.8636363636363636, 'Recall': 0.20212765957446807, 'F-score': 0.3275862068965517}, 'pathogen': {'Precision': 0.2, 'Recall': 1.0, 'F-score': 0.33333333333333337}, 'size': {'Precision': 0.34285714285714286, 'Recall': 0.3333333333333333, 'F-score': 0.3380281690140845}, 'temperature': {'Precision': 0.8235294117647058, 'Recall': 1.0, 'F-score': 0.9032258064516129}, 'water content': {'Precision': 0.75, 'Recall': 0.75, 'F-score': 0.75}, 'time constraint': {'Precision': 0.6, 'Recall': 0.17647058823529413, 'F-score': 0.2727272727272727}, 'overall': {'Precision': 0.4864864864864865, 'Recall': 0.7714285714285715, 'F-score': 0.5966850828729282}})
experiments_data.append({'data': {'Precision': 0.20748299319727892, 'Recall': 0.8591549295774648, 'F-score': 0.3342465753424657}, 'label data': {'Precision': 0.24581005586592178, 'Recall': 0.8148148148148148, 'F-score': 0.37768240343347637}, 'non-label data': {'Precision': 0.14432989690721648, 'Recall': 0.7368421052631579, 'F-score': 0.24137931034482754}, 'measurement': {'Precision': 0.6039603960396039, 'Recall': 0.4236111111111111, 'F-score': 0.4979591836734694}, 'colour': {'Precision': 0.4166666666666667, 'Recall': 0.4166666666666667, 'F-score': 0.4166666666666667}, 'firmness': {'Precision': 0.25, 'Recall': 0.5, 'F-score': 0.3333333333333333}, 'mass': {'Precision': 0.6818181818181818, 'Recall': 0.1595744680851064, 'F-score': 0.25862068965517243}, 'pathogen': {'Precision': 0.2222222222222222, 'Recall': 1.0, 'F-score': 0.3636363636363636}, 'size': {'Precision': 0.35714285714285715, 'Recall': 0.2777777777777778, 'F-score': 0.31250000000000006}, 'temperature': {'Precision': 0.875, 'Recall': 1.0, 'F-score': 0.9333333333333333}, 'water content': {'Precision': 0.5, 'Recall': 0.5, 'F-score': 0.5}, 'time constraint': {'Precision': 0.625, 'Recall': 0.29411764705882354, 'F-score': 0.4}, 'overall': {'Precision': 0.511326860841424, 'Recall': 0.7523809523809524, 'F-score': 0.6088631984585743}})
experiments_data.append({'data': {'Precision': 0.2026578073089701, 'Recall': 0.8591549295774648, 'F-score': 0.32795698924731187}, 'label data': {'Precision': 0.24193548387096775, 'Recall': 0.8333333333333334, 'F-score': 0.37500000000000006}, 'non-label data': {'Precision': 0.13725490196078433, 'Recall': 0.7368421052631579, 'F-score': 0.23140495867768596}, 'measurement': {'Precision': 0.5523809523809524, 'Recall': 0.4027777777777778, 'F-score': 0.46586345381526106}, 'colour': {'Precision': 0.45454545454545453, 'Recall': 0.4166666666666667, 'F-score': 0.43478260869565216}, 'firmness': {'Precision': 0.2, 'Recall': 0.5, 'F-score': 0.28571428571428575}, 'mass': {'Precision': 0.9047619047619048, 'Recall': 0.20212765957446807, 'F-score': 0.3304347826086957}, 'pathogen': {'Precision': 0.18181818181818182, 'Recall': 1.0, 'F-score': 0.3076923076923077}, 'size': {'Precision': 0.26666666666666666, 'Recall': 0.2222222222222222, 'F-score': 0.2424242424242424}, 'temperature': {'Precision': 0.8235294117647058, 'Recall': 1.0, 'F-score': 0.9032258064516129}, 'water content': {'Precision': 0.75, 'Recall': 0.75, 'F-score': 0.75}, 'time constraint': {'Precision': 0.6, 'Recall': 0.17647058823529413, 'F-score': 0.2727272727272727}, 'overall': {'Precision': 0.5032051282051282, 'Recall': 0.7476190476190476, 'F-score': 0.6015325670498084}})
experiments_data.append({'data': {'Precision': 0.18333333333333332, 'Recall': 0.9295774647887324, 'F-score': 0.30626450116009285}, 'label data': {'Precision': 0.24873096446700507, 'Recall': 0.9074074074074074, 'F-score': 0.39043824701195223}, 'non-label data': {'Precision': 0.09395973154362416, 'Recall': 0.7368421052631579, 'F-score': 0.16666666666666669}, 'measurement': {'Precision': 0.6052631578947368, 'Recall': 0.4791666666666667, 'F-score': 0.5348837209302326}, 'colour': {'Precision': 0.5, 'Recall': 0.5, 'F-score': 0.5}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.8260869565217391, 'Recall': 0.20212765957446807, 'F-score': 0.3247863247863248}, 'pathogen': {'Precision': 0.23529411764705882, 'Recall': 1.0, 'F-score': 0.38095238095238093}, 'size': {'Precision': 0.425, 'Recall': 0.4722222222222222, 'F-score': 0.4473684210526316}, 'temperature': {'Precision': 0.8235294117647058, 'Recall': 1.0, 'F-score': 0.9032258064516129}, 'water content': {'Precision': 0.5, 'Recall': 0.5, 'F-score': 0.5}, 'time constraint': {'Precision': 0.4, 'Recall': 0.11764705882352941, 'F-score': 0.1818181818181818}, 'overall': {'Precision': 0.49866666666666665, 'Recall': 0.8904761904761904, 'F-score': 0.6393162393162393}})
experiments_data.append({'data': {'Precision': 0.17366946778711484, 'Recall': 0.8732394366197183, 'F-score': 0.2897196261682243}, 'label data': {'Precision': 0.22727272727272727, 'Recall': 0.8333333333333334, 'F-score': 0.35714285714285715}, 'non-label data': {'Precision': 0.10344827586206896, 'Recall': 0.7894736842105263, 'F-score': 0.18292682926829268}, 'measurement': {'Precision': 0.5779816513761468, 'Recall': 0.4375, 'F-score': 0.4980237154150198}, 'colour': {'Precision': 0.3076923076923077, 'Recall': 0.3333333333333333, 'F-score': 0.32}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.85, 'Recall': 0.18085106382978725, 'F-score': 0.29824561403508776}, 'pathogen': {'Precision': 0.19047619047619047, 'Recall': 1.0, 'F-score': 0.32}, 'size': {'Precision': 0.38461538461538464, 'Recall': 0.4166666666666667, 'F-score': 0.4}, 'temperature': {'Precision': 0.8235294117647058, 'Recall': 1.0, 'F-score': 0.9032258064516129}, 'water content': {'Precision': 1.0, 'Recall': 0.5, 'F-score': 0.6666666666666666}, 'time constraint': {'Precision': 0.3333333333333333, 'Recall': 0.11764705882352941, 'F-score': 0.1739130434782609}, 'overall': {'Precision': 0.4540540540540541, 'Recall': 0.8, 'F-score': 0.5793103448275863}})
experiments_data.append({'data': {'Precision': 0.18155619596541786, 'Recall': 0.8873239436619719, 'F-score': 0.3014354066985646}, 'label data': {'Precision': 0.25, 'Recall': 0.8333333333333334, 'F-score': 0.3846153846153846}, 'non-label data': {'Precision': 0.09375, 'Recall': 0.7894736842105263, 'F-score': 0.16759776536312848}, 'measurement': {'Precision': 0.6272727272727273, 'Recall': 0.4791666666666667, 'F-score': 0.5433070866141733}, 'colour': {'Precision': 0.5, 'Recall': 0.6666666666666666, 'F-score': 0.5714285714285715}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.8095238095238095, 'Recall': 0.18085106382978725, 'F-score': 0.2956521739130435}, 'pathogen': {'Precision': 0.2, 'Recall': 1.0, 'F-score': 0.33333333333333337}, 'size': {'Precision': 0.4722222222222222, 'Recall': 0.4722222222222222, 'F-score': 0.4722222222222222}, 'temperature': {'Precision': 0.8235294117647058, 'Recall': 1.0, 'F-score': 0.9032258064516129}, 'water content': {'Precision': 0.6666666666666666, 'Recall': 0.5, 'F-score': 0.5714285714285715}, 'time constraint': {'Precision': 0.3333333333333333, 'Recall': 0.11764705882352941, 'F-score': 0.1739130434782609}, 'overall': {'Precision': 0.4891304347826087, 'Recall': 0.8571428571428571, 'F-score': 0.6228373702422144}})
experiments_data.append({'data': {'Precision': 0.1745152354570637, 'Recall': 0.8873239436619719, 'F-score': 0.2916666666666667}, 'label data': {'Precision': 0.23115577889447236, 'Recall': 0.8518518518518519, 'F-score': 0.3636363636363636}, 'non-label data': {'Precision': 0.08, 'Recall': 0.631578947368421, 'F-score': 0.14201183431952663}, 'measurement': {'Precision': 0.6339285714285714, 'Recall': 0.4930555555555556, 'F-score': 0.5546875}, 'colour': {'Precision': 0.4375, 'Recall': 0.5833333333333334, 'F-score': 0.5}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.875, 'Recall': 0.22340425531914893, 'F-score': 0.3559322033898305}, 'pathogen': {'Precision': 0.2, 'Recall': 1.0, 'F-score': 0.33333333333333337}, 'size': {'Precision': 0.42105263157894735, 'Recall': 0.4444444444444444, 'F-score': 0.43243243243243246}, 'temperature': {'Precision': 0.875, 'Recall': 1.0, 'F-score': 0.9333333333333333}, 'water content': {'Precision': 1.0, 'Recall': 0.5, 'F-score': 0.6666666666666666}, 'time constraint': {'Precision': 0.3333333333333333, 'Recall': 0.11764705882352941, 'F-score': 0.1739130434782609}, 'overall': {'Precision': 0.4854111405835544, 'Recall': 0.8714285714285714, 'F-score': 0.6235093696763203}})
experiments_data.append({'data': {'Precision': 0.1867816091954023, 'Recall': 0.9154929577464789, 'F-score': 0.31026252983293556}, 'label data': {'Precision': 0.28160919540229884, 'Recall': 0.9074074074074074, 'F-score': 0.4298245614035088}, 'non-label data': {'Precision': 0.09210526315789473, 'Recall': 0.7368421052631579, 'F-score': 0.16374269005847952}, 'measurement': {'Precision': 0.625, 'Recall': 0.4861111111111111, 'F-score': 0.5468749999999999}, 'colour': {'Precision': 0.5294117647058824, 'Recall': 0.75, 'F-score': 0.6206896551724139}, 'firmness': {'Precision': 0.25, 'Recall': 0.5, 'F-score': 0.3333333333333333}, 'mass': {'Precision': 0.75, 'Recall': 0.19148936170212766, 'F-score': 0.3050847457627119}, 'pathogen': {'Precision': 0.21052631578947367, 'Recall': 1.0, 'F-score': 0.34782608695652173}, 'size': {'Precision': 0.4594594594594595, 'Recall': 0.4722222222222222, 'F-score': 0.4657534246575342}, 'temperature': {'Precision': 0.7368421052631579, 'Recall': 1.0, 'F-score': 0.8484848484848484}, 'water content': {'Precision': 1.0, 'Recall': 0.5, 'F-score': 0.6666666666666666}, 'time constraint': {'Precision': 0.3333333333333333, 'Recall': 0.11764705882352941, 'F-score': 0.1739130434782609}, 'overall': {'Precision': 0.5027472527472527, 'Recall': 0.8714285714285714, 'F-score': 0.6376306620209059}})
experiments_data.append({'data': {'Precision': 0.18285714285714286, 'Recall': 0.9014084507042254, 'F-score': 0.30403800475059384}, 'label data': {'Precision': 0.25, 'Recall': 0.8888888888888888, 'F-score': 0.3902439024390244}, 'non-label data': {'Precision': 0.09090909090909091, 'Recall': 0.6842105263157895, 'F-score': 0.16049382716049382}, 'measurement': {'Precision': 0.5862068965517241, 'Recall': 0.4722222222222222, 'F-score': 0.523076923076923}, 'colour': {'Precision': 0.47058823529411764, 'Recall': 0.6666666666666666, 'F-score': 0.5517241379310345}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.875, 'Recall': 0.22340425531914893, 'F-score': 0.3559322033898305}, 'pathogen': {'Precision': 0.25, 'Recall': 1.0, 'F-score': 0.4}, 'size': {'Precision': 0.3170731707317073, 'Recall': 0.3611111111111111, 'F-score': 0.33766233766233766}, 'temperature': {'Precision': 0.8235294117647058, 'Recall': 1.0, 'F-score': 0.9032258064516129}, 'water content': {'Precision': 1.0, 'Recall': 0.5, 'F-score': 0.6666666666666666}, 'time constraint': {'Precision': 0.2, 'Recall': 0.058823529411764705, 'F-score': 0.0909090909090909}, 'overall': {'Precision': 0.4745308310991957, 'Recall': 0.8428571428571429, 'F-score': 0.6072041166380789}})
experiments_data.append({'data': {'Precision': 0.1763085399449036, 'Recall': 0.9014084507042254, 'F-score': 0.2949308755760368}, 'label data': {'Precision': 0.23711340206185566, 'Recall': 0.8518518518518519, 'F-score': 0.3709677419354838}, 'non-label data': {'Precision': 0.08024691358024691, 'Recall': 0.6842105263157895, 'F-score': 0.143646408839779}, 'measurement': {'Precision': 0.6126126126126126, 'Recall': 0.4722222222222222, 'F-score': 0.5333333333333332}, 'colour': {'Precision': 0.5333333333333333, 'Recall': 0.6666666666666666, 'F-score': 0.5925925925925926}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.8571428571428571, 'Recall': 0.19148936170212766, 'F-score': 0.31304347826086953}, 'pathogen': {'Precision': 0.23529411764705882, 'Recall': 1.0, 'F-score': 0.38095238095238093}, 'size': {'Precision': 0.47368421052631576, 'Recall': 0.5, 'F-score': 0.4864864864864865}, 'temperature': {'Precision': 0.8235294117647058, 'Recall': 1.0, 'F-score': 0.9032258064516129}, 'water content': {'Precision': 0.6666666666666666, 'Recall': 0.5, 'F-score': 0.5714285714285715}, 'time constraint': {'Precision': 0.42857142857142855, 'Recall': 0.17647058823529413, 'F-score': 0.25}, 'overall': {'Precision': 0.48021108179419525, 'Recall': 0.8666666666666667, 'F-score': 0.6179966044142615}})
experiments_data.append({'data': {'Precision': 0.1787709497206704, 'Recall': 0.9014084507042254, 'F-score': 0.2983682983682984}, 'label data': {'Precision': 0.26229508196721313, 'Recall': 0.8888888888888888, 'F-score': 0.4050632911392405}, 'non-label data': {'Precision': 0.08024691358024691, 'Recall': 0.6842105263157895, 'F-score': 0.143646408839779}, 'measurement': {'Precision': 0.6637931034482759, 'Recall': 0.5347222222222222, 'F-score': 0.5923076923076923}, 'colour': {'Precision': 0.5714285714285714, 'Recall': 0.6666666666666666, 'F-score': 0.6153846153846153}, 'firmness': {'Precision': 0.16666666666666666, 'Recall': 0.5, 'F-score': 0.25}, 'mass': {'Precision': 0.92, 'Recall': 0.24468085106382978, 'F-score': 0.3865546218487395}, 'pathogen': {'Precision': 0.23529411764705882, 'Recall': 1.0, 'F-score': 0.38095238095238093}, 'size': {'Precision': 0.5277777777777778, 'Recall': 0.5277777777777778, 'F-score': 0.5277777777777778}, 'temperature': {'Precision': 0.8235294117647058, 'Recall': 1.0, 'F-score': 0.9032258064516129}, 'water content': {'Precision': 1.0, 'Recall': 0.5, 'F-score': 0.6666666666666666}, 'time constraint': {'Precision': 0.2, 'Recall': 0.058823529411764705, 'F-score': 0.0909090909090909}, 'overall': {'Precision': 0.49318801089918257, 'Recall': 0.861904761904762, 'F-score': 0.6273830155979202}})
experiments_data.append({'data': {'Precision': 0.17403314917127072, 'Recall': 0.8873239436619719, 'F-score': 0.2909930715935335}, 'label data': {'Precision': 0.24607329842931938, 'Recall': 0.8703703703703703, 'F-score': 0.3836734693877551}, 'non-label data': {'Precision': 0.07878787878787878, 'Recall': 0.6842105263157895, 'F-score': 0.14130434782608695}, 'measurement': {'Precision': 0.6548672566371682, 'Recall': 0.5138888888888888, 'F-score': 0.5758754863813228}, 'colour': {'Precision': 0.6, 'Recall': 0.5, 'F-score': 0.5454545454545454}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.8, 'Recall': 0.2127659574468085, 'F-score': 0.33613445378151263}, 'pathogen': {'Precision': 0.2222222222222222, 'Recall': 1.0, 'F-score': 0.3636363636363636}, 'size': {'Precision': 0.4473684210526316, 'Recall': 0.4722222222222222, 'F-score': 0.4594594594594595}, 'temperature': {'Precision': 0.8235294117647058, 'Recall': 1.0, 'F-score': 0.9032258064516129}, 'water content': {'Precision': 0.6666666666666666, 'Recall': 0.5, 'F-score': 0.5714285714285715}, 'time constraint': {'Precision': 0.3333333333333333, 'Recall': 0.11764705882352941, 'F-score': 0.1739130434782609}, 'overall': {'Precision': 0.49595687331536387, 'Recall': 0.8761904761904762, 'F-score': 0.6333907056798623}})
experiments_data.append({'data': {'Precision': 0.18181818181818182, 'Recall': 0.9014084507042254, 'F-score': 0.3026004728132388}, 'label data': {'Precision': 0.2422680412371134, 'Recall': 0.8703703703703703, 'F-score': 0.3790322580645161}, 'non-label data': {'Precision': 0.10344827586206896, 'Recall': 0.7894736842105263, 'F-score': 0.18292682926829268}, 'measurement': {'Precision': 0.6213592233009708, 'Recall': 0.4444444444444444, 'F-score': 0.5182186234817814}, 'colour': {'Precision': 0.6666666666666666, 'Recall': 0.6666666666666666, 'F-score': 0.6666666666666666}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.8636363636363636, 'Recall': 0.20212765957446807, 'F-score': 0.3275862068965517}, 'pathogen': {'Precision': 0.21052631578947367, 'Recall': 1.0, 'F-score': 0.34782608695652173}, 'size': {'Precision': 0.4, 'Recall': 0.3333333333333333, 'F-score': 0.3636363636363636}, 'temperature': {'Precision': 0.7777777777777778, 'Recall': 1.0, 'F-score': 0.8750000000000001}, 'water content': {'Precision': 0.6666666666666666, 'Recall': 0.5, 'F-score': 0.5714285714285715}, 'time constraint': {'Precision': 0.3333333333333333, 'Recall': 0.11764705882352941, 'F-score': 0.1739130434782609}, 'overall': {'Precision': 0.49725274725274726, 'Recall': 0.861904761904762, 'F-score': 0.6306620209059234}})
experiments_data.append({'data': {'Precision': 0.17008797653958943, 'Recall': 0.8169014084507042, 'F-score': 0.2815533980582524}, 'label data': {'Precision': 0.22105263157894736, 'Recall': 0.7777777777777778, 'F-score': 0.3442622950819672}, 'non-label data': {'Precision': 0.09722222222222222, 'Recall': 0.7368421052631579, 'F-score': 0.17177914110429449}, 'measurement': {'Precision': 0.59375, 'Recall': 0.3958333333333333, 'F-score': 0.47500000000000003}, 'colour': {'Precision': 0.4666666666666667, 'Recall': 0.5833333333333334, 'F-score': 0.5185185185185186}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.8235294117647058, 'Recall': 0.14893617021276595, 'F-score': 0.25225225225225223}, 'pathogen': {'Precision': 0.21052631578947367, 'Recall': 1.0, 'F-score': 0.34782608695652173}, 'size': {'Precision': 0.37037037037037035, 'Recall': 0.2777777777777778, 'F-score': 0.3174603174603175}, 'temperature': {'Precision': 0.7222222222222222, 'Recall': 0.9285714285714286, 'F-score': 0.8125000000000001}, 'water content': {'Precision': 1.0, 'Recall': 0.5, 'F-score': 0.6666666666666666}, 'time constraint': {'Precision': 0.4, 'Recall': 0.11764705882352941, 'F-score': 0.1818181818181818}, 'overall': {'Precision': 0.48295454545454547, 'Recall': 0.8095238095238095, 'F-score': 0.604982206405694}})
experiments_data.append({'data': {'Precision': 0.1791907514450867, 'Recall': 0.8732394366197183, 'F-score': 0.2973621103117505}, 'label data': {'Precision': 0.225, 'Recall': 0.8333333333333334, 'F-score': 0.3543307086614173}, 'non-label data': {'Precision': 0.09701492537313433, 'Recall': 0.6842105263157895, 'F-score': 0.1699346405228758}, 'measurement': {'Precision': 0.6371681415929203, 'Recall': 0.5, 'F-score': 0.5603112840466925}, 'colour': {'Precision': 0.6153846153846154, 'Recall': 0.6666666666666666, 'F-score': 0.64}, 'firmness': {'Precision': 0.25, 'Recall': 0.5, 'F-score': 0.3333333333333333}, 'mass': {'Precision': 0.8823529411764706, 'Recall': 0.1595744680851064, 'F-score': 0.2702702702702703}, 'pathogen': {'Precision': 0.23529411764705882, 'Recall': 1.0, 'F-score': 0.38095238095238093}, 'size': {'Precision': 0.48717948717948717, 'Recall': 0.5277777777777778, 'F-score': 0.5066666666666667}, 'temperature': {'Precision': 0.875, 'Recall': 1.0, 'F-score': 0.9333333333333333}, 'water content': {'Precision': 1.0, 'Recall': 0.5, 'F-score': 0.6666666666666666}, 'time constraint': {'Precision': 0.25, 'Recall': 0.058823529411764705, 'F-score': 0.09523809523809523}, 'overall': {'Precision': 0.4905149051490515, 'Recall': 0.861904761904762, 'F-score': 0.6252158894645942}})
experiments_data.append({'data': {'Precision': 0.18361581920903955, 'Recall': 0.9154929577464789, 'F-score': 0.3058823529411765}, 'label data': {'Precision': 0.24102564102564103, 'Recall': 0.8703703703703703, 'F-score': 0.3775100401606426}, 'non-label data': {'Precision': 0.09859154929577464, 'Recall': 0.7368421052631579, 'F-score': 0.17391304347826084}, 'measurement': {'Precision': 0.6330275229357798, 'Recall': 0.4791666666666667, 'F-score': 0.5454545454545454}, 'colour': {'Precision': 0.5833333333333334, 'Recall': 0.5833333333333334, 'F-score': 0.5833333333333334}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.8333333333333334, 'Recall': 0.2127659574468085, 'F-score': 0.3389830508474576}, 'pathogen': {'Precision': 0.23529411764705882, 'Recall': 1.0, 'F-score': 0.38095238095238093}, 'size': {'Precision': 0.4864864864864865, 'Recall': 0.5, 'F-score': 0.4931506849315069}, 'temperature': {'Precision': 0.8235294117647058, 'Recall': 1.0, 'F-score': 0.9032258064516129}, 'water content': {'Precision': 0.6666666666666666, 'Recall': 0.5, 'F-score': 0.5714285714285715}, 'time constraint': {'Precision': 0.25, 'Recall': 0.058823529411764705, 'F-score': 0.09523809523809523}, 'overall': {'Precision': 0.49175824175824173, 'Recall': 0.8523809523809524, 'F-score': 0.6236933797909407}})
experiments_data.append({'data': {'Precision': 0.1662269129287599, 'Recall': 0.8873239436619719, 'F-score': 0.28}, 'label data': {'Precision': 0.21904761904761905, 'Recall': 0.8518518518518519, 'F-score': 0.3484848484848485}, 'non-label data': {'Precision': 0.07975460122699386, 'Recall': 0.6842105263157895, 'F-score': 0.14285714285714288}, 'measurement': {'Precision': 0.5575221238938053, 'Recall': 0.4375, 'F-score': 0.49027237354085607}, 'colour': {'Precision': 0.5, 'Recall': 0.5833333333333334, 'F-score': 0.5384615384615384}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.8333333333333334, 'Recall': 0.1595744680851064, 'F-score': 0.2678571428571429}, 'pathogen': {'Precision': 0.23529411764705882, 'Recall': 1.0, 'F-score': 0.38095238095238093}, 'size': {'Precision': 0.38095238095238093, 'Recall': 0.4444444444444444, 'F-score': 0.41025641025641024}, 'temperature': {'Precision': 0.8235294117647058, 'Recall': 1.0, 'F-score': 0.9032258064516129}, 'water content': {'Precision': 0.75, 'Recall': 0.75, 'F-score': 0.75}, 'time constraint': {'Precision': 0.2, 'Recall': 0.058823529411764705, 'F-score': 0.0909090909090909}, 'overall': {'Precision': 0.4740932642487047, 'Recall': 0.8714285714285714, 'F-score': 0.6140939597315437}})

Fine-tuning Results

In [None]:
# Hard-coded fine-tuning evaluation results. Each dict appended below maps a
# label name (plus 'overall') to its 'Precision', 'Recall' and 'F-score' values.
# NOTE(review): this rebinds `experiments_data`, so the zero-shot list built in
# the earlier cell is discarded once this cell runs. The keys here are spelled
# 'labeldata' / 'non-labeldata' (no space), unlike 'label data' /
# 'non-label data' in the zero-shot cell -- confirm downstream code handles both.
experiments_data = []
experiments_data.append({'overall': {'Precision': 0.9067853457172342, 'Recall': 0.8500000000000001, 'F-score': 0.8771980676328504}, 'data': {'Precision': 0.7650243040212108, 'Recall': 0.7253521126760563, 'F-score': 0.743421052631579}, 'labeldata': {'Precision': 0.7763636363636364, 'Recall': 0.6759259259259259, 'F-score': 0.7219387755102042}, 'non-labeldata': {'Precision': 0.7427536231884058, 'Recall': 0.7894736842105263, 'F-score': 0.7625482625482626}, 'measurement': {'Precision': 0.9121212121212121, 'Recall': 0.8298611111111112, 'F-score': 0.8690362847773194}, 'colour': {'Precision': 0.9444444444444444, 'Recall': 0.6666666666666666, 'F-score': 0.7809523809523811}, 'firmness': {'Precision': 0.3333333333333333, 'Recall': 0.75, 'F-score': 0.45}, 'mass': {'Precision': 0.8822801674914351, 'Recall': 0.6808510638297872, 'F-score': 0.7685064935064935}, 'pathogen': {'Precision': 0.3125, 'Recall': 0.5, 'F-score': 0.375}, 'size': {'Precision': 0.8768939393939394, 'Recall': 0.7916666666666667, 'F-score': 0.8320545609548168}, 'temperature': {'Precision': 0.9615384615384616, 'Recall': 0.8571428571428571, 'F-score': 0.905982905982906}, 'water content': {'Precision': 0.5, 'Recall': 0.125, 'F-score': 0.2}, 'time constraint': {'Precision': 1.0, 'Recall': 0.2647058823529412, 'F-score': 0.41774891774891776}})
experiments_data.append({'overall': {'Precision': 0.8998130179907013, 'Recall': 0.8523809523809525, 'F-score': 0.8753168795140384}, 'data': {'Precision': 0.7650243040212108, 'Recall': 0.7253521126760563, 'F-score': 0.743421052631579}, 'labeldata': {'Precision': 0.7763636363636364, 'Recall': 0.6759259259259259, 'F-score': 0.7219387755102042}, 'non-labeldata': {'Precision': 0.7427536231884058, 'Recall': 0.7894736842105263, 'F-score': 0.7625482625482626}, 'measurement': {'Precision': 0.9022058823529412, 'Recall': 0.8333333333333333, 'F-score': 0.8662930135557874}, 'colour': {'Precision': 0.8888888888888888, 'Recall': 0.6666666666666666, 'F-score': 0.761904761904762}, 'firmness': {'Precision': 0.3333333333333333, 'Recall': 0.75, 'F-score': 0.45}, 'mass': {'Precision': 0.876244131455399, 'Recall': 0.6808510638297872, 'F-score': 0.7661466738389815}, 'pathogen': {'Precision': 0.25, 'Recall': 0.375, 'F-score': 0.29166666666666663}, 'size': {'Precision': 0.8787878787878788, 'Recall': 0.8055555555555556, 'F-score': 0.8405797101449276}, 'temperature': {'Precision': 0.9615384615384616, 'Recall': 0.8571428571428571, 'F-score': 0.905982905982906}, 'water content': {'Precision': 0.5, 'Recall': 0.125, 'F-score': 0.2}, 'time constraint': {'Precision': 1.0, 'Recall': 0.2647058823529412, 'F-score': 0.41774891774891776}})
experiments_data.append({'overall': {'Precision': 0.8998130179907013, 'Recall': 0.7571428571428571, 'F-score': 0.8302872062663186}, 'data': {'Precision': 0.9183673469387755, 'Recall': 0.6338028169014085, 'F-score': 0.7500000000000001}, 'labeldata': {'Precision': 0.9354838709677419, 'Recall': 0.5370370370370371, 'F-score': 0.6823529411764706}, 'non-labeldata': {'Precision': 0.8888888888888888, 'Recall': 0.8421052631578947, 'F-score': 0.8648648648648649}, 'measurement': {'Precision': 0.88, 'Recall': 0.7638888888888888, 'F-score': 0.8178438661710037}, 'colour': {'Precision': 0.8888888888888888, 'Recall': 0.6666666666666666, 'F-score': 0.761904761904762}, 'firmness': {'Precision': 0.5, 'Recall': 0.5, 'F-score': 0.5}, 'mass': {'Precision': 0.8793103448275862, 'Recall': 0.5425531914893617, 'F-score': 0.6710526315789473}, 'pathogen': {'Precision': 0.2222222222222222, 'Recall': 0.5, 'F-score': 0.30769230769230765}, 'size': {'Precision': 0.75, 'Recall': 0.75, 'F-score': 0.75}, 'temperature': {'Precision': 1.0, 'Recall': 0.8571428571428571, 'F-score': 0.923076923076923}, 'water content': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'time constraint': {'Precision': 1.0, 'Recall': 0.23529411764705882, 'F-score': 0.38095238095238093}})
experiments_data.append({'overall': {'Precision': 0.8756756756756757, 'Recall': 0.7714285714285715, 'F-score': 0.8202531645569621}, 'data': {'Precision': 0.7678571428571429, 'Recall': 0.6056338028169014, 'F-score': 0.6771653543307087}, 'labeldata': {'Precision': 0.717948717948718, 'Recall': 0.5185185185185185, 'F-score': 0.6021505376344085}, 'non-labeldata': {'Precision': 0.8823529411764706, 'Recall': 0.7894736842105263, 'F-score': 0.8333333333333333}, 'measurement': {'Precision': 0.8721804511278195, 'Recall': 0.8055555555555556, 'F-score': 0.8375451263537906}, 'colour': {'Precision': 0.8888888888888888, 'Recall': 0.6666666666666666, 'F-score': 0.761904761904762}, 'firmness': {'Precision': 0.3333333333333333, 'Recall': 0.5, 'F-score': 0.4}, 'mass': {'Precision': 0.8695652173913043, 'Recall': 0.6382978723404256, 'F-score': 0.736196319018405}, 'pathogen': {'Precision': 0.2857142857142857, 'Recall': 0.5, 'F-score': 0.36363636363636365}, 'size': {'Precision': 0.8, 'Recall': 0.7777777777777778, 'F-score': 0.7887323943661971}, 'temperature': {'Precision': 1.0, 'Recall': 0.8571428571428571, 'F-score': 0.923076923076923}, 'water content': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'time constraint': {'Precision': 0.8, 'Recall': 0.23529411764705882, 'F-score': 0.3636363636363636}})
experiments_data.append({'overall': {'Precision': 0.8804347826086957, 'Recall': 0.7714285714285715, 'F-score': 0.8223350253807107}, 'data': {'Precision': 0.7962962962962963, 'Recall': 0.6056338028169014, 'F-score': 0.688}, 'labeldata': {'Precision': 0.7567567567567568, 'Recall': 0.5185185185185185, 'F-score': 0.6153846153846154}, 'non-labeldata': {'Precision': 0.8823529411764706, 'Recall': 0.7894736842105263, 'F-score': 0.8333333333333333}, 'measurement': {'Precision': 0.8656716417910447, 'Recall': 0.8055555555555556, 'F-score': 0.8345323741007193}, 'colour': {'Precision': 0.8888888888888888, 'Recall': 0.6666666666666666, 'F-score': 0.761904761904762}, 'firmness': {'Precision': 0.3333333333333333, 'Recall': 0.5, 'F-score': 0.4}, 'mass': {'Precision': 0.8714285714285714, 'Recall': 0.648936170212766, 'F-score': 0.7439024390243902}, 'pathogen': {'Precision': 0.2857142857142857, 'Recall': 0.5, 'F-score': 0.36363636363636365}, 'size': {'Precision': 0.7777777777777778, 'Recall': 0.7777777777777778, 'F-score': 0.7777777777777778}, 'temperature': {'Precision': 1.0, 'Recall': 0.7857142857142857, 'F-score': 0.88}, 'water content': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'time constraint': {'Precision': 0.8, 'Recall': 0.23529411764705882, 'F-score': 0.3636363636363636}})
experiments_data.append({'overall': {'Precision': 0.8810810810810811, 'Recall': 0.7761904761904762, 'F-score': 0.8253164556962025}, 'data': {'Precision': 0.8, 'Recall': 0.6197183098591549, 'F-score': 0.6984126984126985}, 'labeldata': {'Precision': 0.7631578947368421, 'Recall': 0.5370370370370371, 'F-score': 0.6304347826086957}, 'non-labeldata': {'Precision': 0.8823529411764706, 'Recall': 0.7894736842105263, 'F-score': 0.8333333333333333}, 'measurement': {'Precision': 0.8656716417910447, 'Recall': 0.8055555555555556, 'F-score': 0.8345323741007193}, 'colour': {'Precision': 0.8888888888888888, 'Recall': 0.6666666666666666, 'F-score': 0.761904761904762}, 'firmness': {'Precision': 0.3333333333333333, 'Recall': 0.5, 'F-score': 0.4}, 'mass': {'Precision': 0.8591549295774648, 'Recall': 0.648936170212766, 'F-score': 0.7393939393939394}, 'pathogen': {'Precision': 0.2857142857142857, 'Recall': 0.5, 'F-score': 0.36363636363636365}, 'size': {'Precision': 0.8, 'Recall': 0.7777777777777778, 'F-score': 0.7887323943661971}, 'temperature': {'Precision': 1.0, 'Recall': 0.7857142857142857, 'F-score': 0.88}, 'water content': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'time constraint': {'Precision': 0.8, 'Recall': 0.23529411764705882, 'F-score': 0.3636363636363636}})
experiments_data.append({'overall': {'Precision': 0.8877551020408163, 'Recall': 0.8285714285714286, 'F-score': 0.8571428571428572}, 'data': {'Precision': 0.7540983606557377, 'Recall': 0.647887323943662, 'F-score': 0.6969696969696969}, 'labeldata': {'Precision': 0.775, 'Recall': 0.5740740740740741, 'F-score': 0.6595744680851063}, 'non-labeldata': {'Precision': 0.7142857142857143, 'Recall': 0.7894736842105263, 'F-score': 0.7500000000000001}, 'measurement': {'Precision': 0.9029850746268657, 'Recall': 0.8402777777777778, 'F-score': 0.8705035971223023}, 'colour': {'Precision': 0.875, 'Recall': 0.5833333333333334, 'F-score': 0.7000000000000001}, 'firmness': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.8947368421052632, 'Recall': 0.723404255319149, 'F-score': 0.8}, 'pathogen': {'Precision': 0.2, 'Recall': 0.25, 'F-score': 0.22222222222222224}, 'size': {'Precision': 0.8235294117647058, 'Recall': 0.7777777777777778, 'F-score': 0.7999999999999999}, 'temperature': {'Precision': 1.0, 'Recall': 0.8571428571428571, 'F-score': 0.923076923076923}, 'water content': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'time constraint': {'Precision': 0.75, 'Recall': 0.35294117647058826, 'F-score': 0.48}})
experiments_data.append({'overall': {'Precision': 0.875, 'Recall': 0.8333333333333334, 'F-score': 0.8536585365853658}, 'data': {'Precision': 0.6666666666666666, 'Recall': 0.647887323943662, 'F-score': 0.6571428571428573}, 'labeldata': {'Precision': 0.6595744680851063, 'Recall': 0.5740740740740741, 'F-score': 0.613861386138614}, 'non-labeldata': {'Precision': 0.6818181818181818, 'Recall': 0.7894736842105263, 'F-score': 0.7317073170731707}, 'measurement': {'Precision': 0.9104477611940298, 'Recall': 0.8472222222222222, 'F-score': 0.8776978417266188}, 'colour': {'Precision': 0.875, 'Recall': 0.5833333333333334, 'F-score': 0.7000000000000001}, 'firmness': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.8947368421052632, 'Recall': 0.723404255319149, 'F-score': 0.8}, 'pathogen': {'Precision': 0.16666666666666666, 'Recall': 0.25, 'F-score': 0.2}, 'size': {'Precision': 0.8235294117647058, 'Recall': 0.7777777777777778, 'F-score': 0.7999999999999999}, 'temperature': {'Precision': 1.0, 'Recall': 0.8571428571428571, 'F-score': 0.923076923076923}, 'water content': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'time constraint': {'Precision': 0.75, 'Recall': 0.35294117647058826, 'F-score': 0.48}})
experiments_data.append({'overall': {'Precision': 0.8923076923076924, 'Recall': 0.8285714285714286, 'F-score': 0.8592592592592593}, 'data': {'Precision': 0.71875, 'Recall': 0.647887323943662, 'F-score': 0.6814814814814815}, 'labeldata': {'Precision': 0.7209302325581395, 'Recall': 0.5740740740740741, 'F-score': 0.6391752577319588}, 'non-labeldata': {'Precision': 0.7142857142857143, 'Recall': 0.7894736842105263, 'F-score': 0.7500000000000001}, 'measurement': {'Precision': 0.9097744360902256, 'Recall': 0.8402777777777778, 'F-score': 0.8736462093862816}, 'colour': {'Precision': 0.875, 'Recall': 0.5833333333333334, 'F-score': 0.7000000000000001}, 'firmness': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.8947368421052632, 'Recall': 0.723404255319149, 'F-score': 0.8}, 'pathogen': {'Precision': 0.2, 'Recall': 0.25, 'F-score': 0.22222222222222224}, 'size': {'Precision': 0.8235294117647058, 'Recall': 0.7777777777777778, 'F-score': 0.7999999999999999}, 'temperature': {'Precision': 1.0, 'Recall': 0.8571428571428571, 'F-score': 0.923076923076923}, 'water content': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'time constraint': {'Precision': 0.6666666666666666, 'Recall': 0.35294117647058826, 'F-score': 0.46153846153846156}})
experiments_data.append({'overall': {'Precision': 0.8872549019607843, 'Recall': 0.861904761904762, 'F-score': 0.8743961352657006}, 'data': {'Precision': 0.7361111111111112, 'Recall': 0.7464788732394366, 'F-score': 0.7412587412587414}, 'labeldata': {'Precision': 0.7755102040816326, 'Recall': 0.7037037037037037, 'F-score': 0.7378640776699029}, 'non-labeldata': {'Precision': 0.6521739130434783, 'Recall': 0.7894736842105263, 'F-score': 0.7142857142857143}, 'measurement': {'Precision': 0.9007633587786259, 'Recall': 0.8194444444444444, 'F-score': 0.858181818181818}, 'colour': {'Precision': 0.8888888888888888, 'Recall': 0.6666666666666666, 'F-score': 0.761904761904762}, 'firmness': {'Precision': 0.3333333333333333, 'Recall': 0.5, 'F-score': 0.4}, 'mass': {'Precision': 0.8714285714285714, 'Recall': 0.648936170212766, 'F-score': 0.7439024390243902}, 'pathogen': {'Precision': 0.25, 'Recall': 0.25, 'F-score': 0.25}, 'size': {'Precision': 0.8823529411764706, 'Recall': 0.8333333333333334, 'F-score': 0.8571428571428571}, 'temperature': {'Precision': 1.0, 'Recall': 0.9285714285714286, 'F-score': 0.962962962962963}, 'water content': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'time constraint': {'Precision': 1.0, 'Recall': 0.23529411764705882, 'F-score': 0.38095238095238093}})
experiments_data.append({'overall': {'Precision': 0.8872549019607843, 'Recall': 0.861904761904762, 'F-score': 0.8743961352657006}, 'data': {'Precision': 0.7397260273972602, 'Recall': 0.7605633802816901, 'F-score': 0.75}, 'labeldata': {'Precision': 0.78, 'Recall': 0.7222222222222222, 'F-score': 0.7500000000000001}, 'non-labeldata': {'Precision': 0.6521739130434783, 'Recall': 0.7894736842105263, 'F-score': 0.7142857142857143}, 'measurement': {'Precision': 0.9, 'Recall': 0.8125, 'F-score': 0.854014598540146}, 'colour': {'Precision': 0.8888888888888888, 'Recall': 0.6666666666666666, 'F-score': 0.761904761904762}, 'firmness': {'Precision': 0.3333333333333333, 'Recall': 0.5, 'F-score': 0.4}, 'mass': {'Precision': 0.8591549295774648, 'Recall': 0.648936170212766, 'F-score': 0.7393939393939394}, 'pathogen': {'Precision': 0.25, 'Recall': 0.25, 'F-score': 0.25}, 'size': {'Precision': 0.8787878787878788, 'Recall': 0.8055555555555556, 'F-score': 0.8405797101449276}, 'temperature': {'Precision': 1.0, 'Recall': 0.8571428571428571, 'F-score': 0.923076923076923}, 'water content': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'time constraint': {'Precision': 1.0, 'Recall': 0.23529411764705882, 'F-score': 0.38095238095238093}})
experiments_data.append({'overall': {'Precision': 0.882051282051282, 'Recall': 0.819047619047619, 'F-score': 0.8493827160493826}, 'data': {'Precision': 0.7361111111111112, 'Recall': 0.7464788732394366, 'F-score': 0.7412587412587414}, 'labeldata': {'Precision': 0.7169811320754716, 'Recall': 0.7037037037037037, 'F-score': 0.7102803738317758}, 'non-labeldata': {'Precision': 0.7894736842105263, 'Recall': 0.7894736842105263, 'F-score': 0.7894736842105263}, 'measurement': {'Precision': 0.8984375, 'Recall': 0.7986111111111112, 'F-score': 0.8455882352941176}, 'colour': {'Precision': 0.8, 'Recall': 0.3333333333333333, 'F-score': 0.47058823529411764}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.88, 'Recall': 0.7021276595744681, 'F-score': 0.7810650887573964}, 'pathogen': {'Precision': 0.25, 'Recall': 0.25, 'F-score': 0.25}, 'size': {'Precision': 0.8484848484848485, 'Recall': 0.7777777777777778, 'F-score': 0.8115942028985507}, 'temperature': {'Precision': 1.0, 'Recall': 0.7857142857142857, 'F-score': 0.88}, 'water content': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'time constraint': {'Precision': 1.0, 'Recall': 0.23529411764705882, 'F-score': 0.38095238095238093}})
experiments_data.append({'overall': {'Precision': 0.8793969849246231, 'Recall': 0.8333333333333334, 'F-score': 0.8557457212713937}, 'data': {'Precision': 0.7361111111111112, 'Recall': 0.7464788732394366, 'F-score': 0.7412587412587414}, 'labeldata': {'Precision': 0.7169811320754716, 'Recall': 0.7037037037037037, 'F-score': 0.7102803738317758}, 'non-labeldata': {'Precision': 0.7894736842105263, 'Recall': 0.7894736842105263, 'F-score': 0.7894736842105263}, 'measurement': {'Precision': 0.8939393939393939, 'Recall': 0.8194444444444444, 'F-score': 0.855072463768116}, 'colour': {'Precision': 0.8333333333333334, 'Recall': 0.4166666666666667, 'F-score': 0.5555555555555556}, 'firmness': {'Precision': 0.3333333333333333, 'Recall': 0.5, 'F-score': 0.4}, 'mass': {'Precision': 0.88, 'Recall': 0.7021276595744681, 'F-score': 0.7810650887573964}, 'pathogen': {'Precision': 0.25, 'Recall': 0.25, 'F-score': 0.25}, 'size': {'Precision': 0.8285714285714286, 'Recall': 0.8055555555555556, 'F-score': 0.8169014084507044}, 'temperature': {'Precision': 1.0, 'Recall': 0.7857142857142857, 'F-score': 0.88}, 'water content': {'Precision': 0, 'Recall': 0.0, 'F-score': 0}, 'time constraint': {'Precision': 1.0, 'Recall': 0.23529411764705882, 'F-score': 0.38095238095238093}})
experiments_data.append({'overall': {'Precision': 0.9263157894736842, 'Recall': 0.8380952380952381, 'F-score': 0.88}, 'data': {'Precision': 0.7903225806451613, 'Recall': 0.6901408450704225, 'F-score': 0.7368421052631579}, 'labeldata': {'Precision': 0.7727272727272727, 'Recall': 0.6296296296296297, 'F-score': 0.6938775510204083}, 'non-labeldata': {'Precision': 0.8333333333333334, 'Recall': 0.7894736842105263, 'F-score': 0.8108108108108109}, 'measurement': {'Precision': 0.9242424242424242, 'Recall': 0.8472222222222222, 'F-score': 0.8840579710144927}, 'colour': {'Precision': 1.0, 'Recall': 0.6666666666666666, 'F-score': 0.8}, 'firmness': {'Precision': 0.3333333333333333, 'Recall': 1.0, 'F-score': 0.5}, 'mass': {'Precision': 0.9054054054054054, 'Recall': 0.7127659574468085, 'F-score': 0.7976190476190476}, 'pathogen': {'Precision': 0.375, 'Recall': 0.75, 'F-score': 0.5}, 'size': {'Precision': 0.875, 'Recall': 0.7777777777777778, 'F-score': 0.823529411764706}, 'temperature': {'Precision': 0.9230769230769231, 'Recall': 0.8571428571428571, 'F-score': 0.888888888888889}, 'water content': {'Precision': 1.0, 'Recall': 0.25, 'F-score': 0.4}, 'time constraint': {'Precision': 1.0, 'Recall': 0.29411764705882354, 'F-score': 0.45454545454545453}})
experiments_data.append({'overall': {'Precision': 0.9123711340206185, 'Recall': 0.8428571428571429, 'F-score': 0.8762376237623761}, 'data': {'Precision': 0.7903225806451613, 'Recall': 0.6901408450704225, 'F-score': 0.7368421052631579}, 'labeldata': {'Precision': 0.7727272727272727, 'Recall': 0.6296296296296297, 'F-score': 0.6938775510204083}, 'non-labeldata': {'Precision': 0.8333333333333334, 'Recall': 0.7894736842105263, 'F-score': 0.8108108108108109}, 'measurement': {'Precision': 0.9044117647058824, 'Recall': 0.8541666666666666, 'F-score': 0.8785714285714286}, 'colour': {'Precision': 0.8888888888888888, 'Recall': 0.6666666666666666, 'F-score': 0.761904761904762}, 'firmness': {'Precision': 0.3333333333333333, 'Recall': 1.0, 'F-score': 0.5}, 'mass': {'Precision': 0.8933333333333333, 'Recall': 0.7127659574468085, 'F-score': 0.7928994082840236}, 'pathogen': {'Precision': 0.25, 'Recall': 0.5, 'F-score': 0.3333333333333333}, 'size': {'Precision': 0.8787878787878788, 'Recall': 0.8055555555555556, 'F-score': 0.8405797101449276}, 'temperature': {'Precision': 0.9230769230769231, 'Recall': 0.8571428571428571, 'F-score': 0.888888888888889}, 'water content': {'Precision': 1.0, 'Recall': 0.25, 'F-score': 0.4}, 'time constraint': {'Precision': 1.0, 'Recall': 0.29411764705882354, 'F-score': 0.45454545454545453}})
experiments_data.append({'overall': {'Precision': 0.90625, 'Recall': 0.8285714285714286, 'F-score': 0.8656716417910447}, 'data': {'Precision': 0.8103448275862069, 'Recall': 0.6619718309859155, 'F-score': 0.7286821705426356}, 'labeldata': {'Precision': 0.8648648648648649, 'Recall': 0.5925925925925926, 'F-score': 0.7032967032967032}, 'non-labeldata': {'Precision': 0.7142857142857143, 'Recall': 0.7894736842105263, 'F-score': 0.7500000000000001}, 'measurement': {'Precision': 0.8947368421052632, 'Recall': 0.8263888888888888, 'F-score': 0.8592057761732851}, 'colour': {'Precision': 0.8888888888888888, 'Recall': 0.6666666666666666, 'F-score': 0.761904761904762}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.9014084507042254, 'Recall': 0.6808510638297872, 'F-score': 0.7757575757575758}, 'pathogen': {'Precision': 0.3076923076923077, 'Recall': 1.0, 'F-score': 0.47058823529411764}, 'size': {'Precision': 0.8484848484848485, 'Recall': 0.7777777777777778, 'F-score': 0.8115942028985507}, 'temperature': {'Precision': 1.0, 'Recall': 0.7857142857142857, 'F-score': 0.88}, 'water content': {'Precision': 1.0, 'Recall': 0.25, 'F-score': 0.4}, 'time constraint': {'Precision': 1.0, 'Recall': 0.29411764705882354, 'F-score': 0.45454545454545453}})
experiments_data.append({'overall': {'Precision': 0.9047619047619048, 'Recall': 0.8142857142857143, 'F-score': 0.857142857142857}, 'data': {'Precision': 0.8518518518518519, 'Recall': 0.647887323943662, 'F-score': 0.7360000000000001}, 'labeldata': {'Precision': 0.9117647058823529, 'Recall': 0.5740740740740741, 'F-score': 0.7045454545454545}, 'non-labeldata': {'Precision': 0.75, 'Recall': 0.7894736842105263, 'F-score': 0.7692307692307692}, 'measurement': {'Precision': 0.8731343283582089, 'Recall': 0.8125, 'F-score': 0.841726618705036}, 'colour': {'Precision': 0.8888888888888888, 'Recall': 0.6666666666666666, 'F-score': 0.761904761904762}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.863013698630137, 'Recall': 0.6702127659574468, 'F-score': 0.7544910179640719}, 'pathogen': {'Precision': 0.3333333333333333, 'Recall': 1.0, 'F-score': 0.5}, 'size': {'Precision': 0.8529411764705882, 'Recall': 0.8055555555555556, 'F-score': 0.8285714285714286}, 'temperature': {'Precision': 1.0, 'Recall': 0.7857142857142857, 'F-score': 0.88}, 'water content': {'Precision': 1.0, 'Recall': 0.25, 'F-score': 0.4}, 'time constraint': {'Precision': 1.0, 'Recall': 0.29411764705882354, 'F-score': 0.45454545454545453}})
experiments_data.append({'overall': {'Precision': 0.863849765258216, 'Recall': 0.8761904761904762, 'F-score': 0.8699763593380615}, 'data': {'Precision': 0.7571428571428571, 'Recall': 0.7464788732394366, 'F-score': 0.75177304964539}, 'labeldata': {'Precision': 0.8085106382978723, 'Recall': 0.7037037037037037, 'F-score': 0.7524752475247524}, 'non-labeldata': {'Precision': 0.6521739130434783, 'Recall': 0.7894736842105263, 'F-score': 0.7142857142857143}, 'measurement': {'Precision': 0.8278145695364238, 'Recall': 0.8680555555555556, 'F-score': 0.847457627118644}, 'colour': {'Precision': 0.6666666666666666, 'Recall': 0.6666666666666666, 'F-score': 0.6666666666666666}, 'firmness': {'Precision': 0.5, 'Recall': 0.5, 'F-score': 0.5}, 'mass': {'Precision': 0.8333333333333334, 'Recall': 0.7446808510638298, 'F-score': 0.7865168539325842}, 'pathogen': {'Precision': 0.14285714285714285, 'Recall': 0.25, 'F-score': 0.18181818181818182}, 'size': {'Precision': 0.8, 'Recall': 0.8888888888888888, 'F-score': 0.8421052631578948}, 'temperature': {'Precision': 0.9166666666666666, 'Recall': 0.7857142857142857, 'F-score': 0.8461538461538461}, 'water content': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'time constraint': {'Precision': 0.8, 'Recall': 0.23529411764705882, 'F-score': 0.3636363636363636}})
experiments_data.append({'overall': {'Precision': 0.8591549295774648, 'Recall': 0.8714285714285714, 'F-score': 0.8652482269503546}, 'data': {'Precision': 0.828125, 'Recall': 0.7464788732394366, 'F-score': 0.7851851851851853}, 'labeldata': {'Precision': 0.926829268292683, 'Recall': 0.7037037037037037, 'F-score': 0.8000000000000002}, 'non-labeldata': {'Precision': 0.6521739130434783, 'Recall': 0.7894736842105263, 'F-score': 0.7142857142857143}, 'measurement': {'Precision': 0.8266666666666667, 'Recall': 0.8611111111111112, 'F-score': 0.8435374149659863}, 'colour': {'Precision': 0.7272727272727273, 'Recall': 0.6666666666666666, 'F-score': 0.6956521739130435}, 'firmness': {'Precision': 0.3333333333333333, 'Recall': 0.5, 'F-score': 0.4}, 'mass': {'Precision': 0.8352941176470589, 'Recall': 0.7553191489361702, 'F-score': 0.7932960893854748}, 'pathogen': {'Precision': 0.125, 'Recall': 0.25, 'F-score': 0.16666666666666666}, 'size': {'Precision': 0.7948717948717948, 'Recall': 0.8611111111111112, 'F-score': 0.8266666666666667}, 'temperature': {'Precision': 0.8461538461538461, 'Recall': 0.7857142857142857, 'F-score': 0.8148148148148148}, 'water content': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'time constraint': {'Precision': 0.8, 'Recall': 0.23529411764705882, 'F-score': 0.3636363636363636}})
experiments_data.append({'overall': {'Precision': 0.8984, 'Recall':0.8 , 'F-score': 0.8463}, 'data': {'Precision': 0.8771929824561403, 'Recall': 0.704225352112676, 'F-score': 0.7812499999999999}, 'labeldata': {'Precision': 0.9444444444444444, 'Recall': 0.6296296296296297, 'F-score': 0.7555555555555556}, 'non-labeldata': {'Precision': 0.8, 'Recall': 0.8421052631578947, 'F-score': 0.8205128205128205}, 'measurement': {'Precision': 0.875, 'Recall': 0.7777777777777778, 'F-score': 0.823529411764706}, 'colour': {'Precision': 0.9, 'Recall': 0.75, 'F-score': 0.8181818181818182}, 'firmness': {'Precision': 0.0, 'Recall': 0.0, 'F-score': 0}, 'mass': {'Precision': 0.90625, 'Recall': 0.6170212765957447, 'F-score': 0.7341772151898734}, 'pathogen': {'Precision': 0.5, 'Recall': 0.5, 'F-score': 0.5}, 'size': {'Precision': 0.8787878787878788, 'Recall': 0.8055555555555556, 'F-score': 0.8405797101449276}, 'temperature': {'Precision': 1.0, 'Recall': 0.7857142857142857, 'F-score': 0.88}, 'water content': {'Precision': 1.0, 'Recall': 0.25, 'F-score': 0.4}, 'time constraint': {'Precision': 1.0,'Recall': 0.2941,'F-score': 0.4545}})

In [None]:
# Labels whose metrics are tracked across experiments ('overall' is listed last).
label_names = [
    'data', 'labeldata', 'non-labeldata', 'measurement',
    'colour', 'firmness', 'mass', 'pathogen', 'size',
    'temperature', 'water content', 'time constraint',
    'overall',
]
# One independent, zero-initialised Precision/Recall/F-score record per label.
accumulated_metrics = {label: dict.fromkeys(('Precision', 'Recall', 'F-score'), 0) for label in label_names}
averages = {label: dict.fromkeys(('Precision', 'Recall', 'F-score'), 0) for label in label_names}

In [None]:
# Add every experiment's Precision/Recall/F-score to the running per-label totals.
for experiment in experiments_data:
    for label, metrics in experiment.items():
        totals = accumulated_metrics[label]
        for metric_name in ('Precision', 'Recall', 'F-score'):
            totals[metric_name] += metrics[metric_name]

In [None]:
# Turn the running sums into means. The divisor is the number of experiments
# actually present in experiments_data rather than a hard-coded 20, so the
# averages stay correct if experiments are added or removed.
# NOTE: the division happens in place, so this cell must run exactly once
# after the accumulation cell; re-running it divides the values again.
n_experiments = len(experiments_data)
for label, metrics in accumulated_metrics.items():
    metrics['Precision'] /= n_experiments
    metrics['Recall'] /= n_experiments
    metrics['F-score'] /= n_experiments

In [None]:
# Report the averaged metrics, one five-line section per label.
for label, metrics in accumulated_metrics.items():
    print(
        f"Label: {label}",
        f"Average Precision: {metrics['Precision']}",
        f"Average Recall: {metrics['Recall']}",
        f"Average F-score: {metrics['F-score']}",
        "------",
        sep="\n",
    )

In [None]:
# Rebuild accumulated_metrics so that every label maps each metric to the list
# of raw values, one entry per experiment (labels are taken from the first experiment).
accumulated_metrics = {
    label: {metric_name: [] for metric_name in ('Precision', 'Recall', 'F-score')}
    for label in experiments_data[0].keys()
}

# Append every experiment's values to the matching per-label lists.
for experiment in experiments_data:
    for label, metrics in experiment.items():
        per_label = accumulated_metrics[label]
        for metric_name in ('Precision', 'Recall', 'F-score'):
            per_label[metric_name].append(metrics[metric_name])

In [None]:
# Print the collected precision values, one list per label.
for metrics in accumulated_metrics.values():
    print(metrics['Precision'])

Visualization

In [None]:
def visualization(metrics_data, metric_name):
    """
    Visualize a given metric as horizontal box plots, one box per label.

    Each box is annotated with the arithmetic mean of its values. The figure is
    saved as 'GPT {metric_name}.pdf' in the working directory and then shown.

    Args:
    - metrics_data (dict): A dictionary containing lists of metric values for each label.
      It is read with the lower-case keys 'overall', 'measurement', 'time constraint',
      'data', 'temperature', 'mass', 'size', 'non-labeldata' and 'labeldata';
      any other keys are ignored.
    - metric_name (str): Name of the metric being visualized (e.g., "Precision", "Recall").

    Raises:
    - KeyError: if one of the plotted labels is missing from metrics_data.
    - ZeroDivisionError: if the list of values of a plotted label is empty.
    """

    ticks = ['Overall', 'Measurement', 'Time\nConstraint', 'Data',
             'Temperature', 'Mass', 'Size',
             'Non-label\nData', 'Label\nData']

    # Display names whose dictionary key cannot be derived by lower-casing
    # the tick text and dropping the line break.
    formatted_ticks = {
        'Time\nConstraint': 'time constraint',
        'Non-label\nData': 'non-labeldata',
        'Label\nData': 'labeldata'
    }

    data = [metrics_data[formatted_ticks.get(tick, tick.lower().replace('\n', ''))] for tick in ticks]
    means = [sum(values) / len(values) for values in data]

    widths = 0.4
    box_color = 'blue'

    fig, ax = plt.subplots(figsize=(7, 7))

    for i, (values, mean) in enumerate(zip(data, means)):
        bp = ax.boxplot(values, positions=[i], widths=widths, showmeans=True, vert=False, sym='r+')

        # Apply one colour and line width to every artist group of this box.
        for artists in bp.values():
            plt.setp(artists, color=box_color, linewidth=2)

        # Write the mean (two decimals) just above the mean marker.
        for line in bp['means']:
            x, y = line.get_xydata()[0]
            ax.annotate('{:.2f}'.format(mean), xy=(x - 0.01, y + 0.21), fontsize=15, fontweight='bold')

    # Set the title and other plot details
    ax.set_title(f"GPT {metric_name}", fontsize=15, fontweight='bold')
    ax.set_yticks(np.arange(len(ticks)))
    ax.set_yticklabels(ticks, fontsize=15, fontweight='bold')
    # Fix the tick locations first and style the labels afterwards, so the
    # font settings apply to the ticks that are actually drawn.
    ax.set_xticks(np.arange(0, 1.1, 0.2))
    plt.setp(ax.get_xticklabels(), fontsize=15, fontweight='bold')
    fig.savefig(f'GPT {metric_name}.pdf', bbox_inches='tight')
    plt.show()

# Arithmetic-mean helper.
def Average(lst):
    """Return the arithmetic mean of lst; raises ZeroDivisionError when lst is empty."""
    total = sum(lst)
    count = len(lst)
    return total / count


In [None]:
def extract_metric_data(accumulated_metrics, metric_name):
    """Return a {label: values} dict holding only the metric_name entry of every label.

    The values are the same objects stored in accumulated_metrics (no copy is made).
    Raises KeyError if a label has no entry for metric_name.
    """
    selected = {}
    for label in accumulated_metrics:
        selected[label] = accumulated_metrics[label][metric_name]
    return selected

# One {label: values} mapping per metric; these are the inputs of the box plots.
(
    accumulated_metrics_for_precision,
    accumulated_metrics_for_recall,
    accumulated_metrics_for_fscore,
) = (
    extract_metric_data(accumulated_metrics, metric_name)
    for metric_name in ('Precision', 'Recall', 'F-score')
)

In [None]:
# Box plot of the precision values per label.
visualization(metrics_data=accumulated_metrics_for_precision, metric_name='Precision')

In [None]:
# Box plot of the recall values per label.
visualization(metrics_data=accumulated_metrics_for_recall, metric_name='Recall')

In [None]:
# Box plot of the F-score values per label.
visualization(metrics_data=accumulated_metrics_for_fscore, metric_name='F-score')

Statistical significance tests

In [None]:
# Load the BERT-base box-plot results. The statistics cell below reads this
# frame one column per label, taking rows 0-19 as precision, 20-39 as recall
# and 40-59 as F-score.
bert_df = pd.read_csv("Evaluation Results/RQ1/dfboxplots/df_boxplotBERTbase.csv")

In [None]:
# Preview the first 20 rows.
bert_df.iloc[:20]

In [None]:
# Show the column names; the statistics cell below indexes bert_df by these labels.
bert_df.columns

In [None]:
# Wilcoxon rank-sum significance test
def wilcoxcon(lst1, lst2):
    """Return the p-value of scipy.stats.ranksums applied to the two samples.

    Only the p-value is returned; the test statistic is discarded.
    """
    return ranksums(lst1, lst2).pvalue

In [None]:
# Vargha-Delaney A12 effect-size measure
def Average(lst):
    """Return the arithmetic mean of lst (same definition as in the visualization cell)."""
    return sum(lst) / len(lst)


def a12(lst1, lst2, rev=True):
    """Compute the Vargha-Delaney A12 effect size of lst1 versus lst2.

    A12 is the fraction of (x, y) pairs with x from lst1 and y from lst2 in
    which x > y, counting ties as half a pair. 0.5 means no difference, values
    above 0.5 favour lst1 and values below 0.5 favour lst2.

    The magnitude label is derived from the distance of A12 from 0.5 in either
    direction, i.e. from max(A12, 1 - A12):
        > 0.71 'Large', > 0.64 'Medium', > 0.56 'Small', otherwise 'negligible'.
    Previously the direction to test was guessed from the sample means (or
    forced through rev), so a strong effect pointing the other way was
    reported as 'negligible'.

    Args:
        lst1: first sample, a sized iterable of comparable numbers.
        lst2: second sample, a sized iterable of comparable numbers.
        rev: retained for backward compatibility only; it no longer
            influences the result.

    Returns:
        A tuple (A12 value, magnitude description).

    Raises:
        ZeroDivisionError: if either sample is empty.
    """
    greater = ties = 0
    for x in lst1:
        for y in lst2:
            if x == y:
                ties += 1
            elif x > y:
                greater += 1
    res = (greater + 0.5 * ties) / (len(lst1) * len(lst2))

    # Classify symmetrically so effects in favour of either sample get the same label.
    magnitude = max(res, 1 - res)
    if magnitude > 0.71:
        description = 'Large'
    elif magnitude > 0.64:
        description = 'Medium'
    elif magnitude > 0.56:
        description = 'Small'
    else:
        description = 'negligible'
    return res, description

In [None]:
# BERT result column name -> key of the same label in the GPT experiment dictionaries.
mapping = dict([
    ('Measurement', 'measurement'),
    ('Temperature', 'temperature'),
    ('Mass', 'mass'),
    ('Size', 'size'),
    ('Time Constraint', 'time constraint'),
    ('Non-labelData', 'non-labeldata'),
    ('LabelData', 'labeldata'),
    ('Data', 'data'),
    ('Overall', 'overall'),
])

In [None]:
# Collect one record per (label, metric) and build the dataframe once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every call, so the rows are gathered in a plain list instead.
stat_rows = []

for bert_label, gpt_label in mapping.items():
    # Step 1: Extract BERT data
    # NOTE(review): assumes the CSV stores 20 precision rows, then 20 recall
    # rows, then 20 F-score rows for every label column - confirm against the file.
    precision_bert_data = bert_df[bert_label][:20].tolist()
    recall_bert_data = bert_df[bert_label][20:40].tolist()
    fscore_bert_data = bert_df[bert_label][40:60].tolist()

    # Step 2: Extract GPT data (using gpt_label from the mapping)
    precision_gpt_data = [experiment[gpt_label]['Precision'] for experiment in experiments_data]
    recall_gpt_data = [experiment[gpt_label]['Recall'] for experiment in experiments_data]
    fscore_gpt_data = [experiment[gpt_label]['F-score'] for experiment in experiments_data]

    # Step 3: Compute the A12 statistic (each result is a (value, description) tuple)
    precision_a12_result = a12(precision_bert_data, precision_gpt_data)
    recall_a12_result = a12(recall_bert_data, recall_gpt_data)
    fscore_a12_result = a12(fscore_bert_data, fscore_gpt_data)

    # Step 4: Compute the Wilcoxon rank-sum test p-values
    precision_p_value = wilcoxcon(precision_bert_data, precision_gpt_data)
    recall_p_value = wilcoxcon(recall_bert_data, recall_gpt_data)
    fscore_p_value = wilcoxcon(fscore_bert_data, fscore_gpt_data)

    # Step 5: Record one row per metric for this label
    stat_rows.extend([
        {
            'Label': bert_label,
            'Metric': 'Precision',
            'A12': precision_a12_result,
            'P-value': precision_p_value
        },
        {
            'Label': bert_label,
            'Metric': 'Recall',
            'A12': recall_a12_result,
            'P-value': recall_p_value
        },
        {
            'Label': bert_label,
            'Metric': 'F-score',
            'A12': fscore_a12_result,
            'P-value': fscore_p_value
        },
    ])

    print(f"Label: {gpt_label}")
    print(f"Precision A12 result: {precision_a12_result} | P-value: {precision_p_value}")
    print(f"Recall A12 result: {recall_a12_result} | P-value: {recall_p_value}")
    print(f"F-score A12 result: {fscore_a12_result} | P-value: {fscore_p_value}")
    print("============================================")

# Passing the column list keeps the column order and yields an empty,
# correctly shaped frame when there are no rows.
df_statistic = pd.DataFrame(stat_rows, columns=['Label', 'Metric', 'A12', 'P-value'])

In [None]:
# Write the BERT-base vs GPT statistics to a CSV file in the working directory.
df_statistic.to_csv(path_or_buf="statBERTbase-GPT.csv")

Fine-tuning GPT-3.5



1.   prepare data
2.   upload data
3.   create fine tune job
4.   use fine-tuned model



In [None]:
# Open the workbook and list its sheets.
xlsx_file = pd.ExcelFile('Data/SFCR_1.xlsx')
sheet_names = xlsx_file.sheet_names

# Sentences: assumed to be on the first sheet; missing values become 0.
df_sentences = xlsx_file.parse(sheet_names[0]).fillna(0)

# Paragraphs: assumed to be on the second sheet; rows without a paragraph_id
# are dropped and the index is renumbered from 0.
df_paragraphs = (
    xlsx_file.parse(sheet_names[1])
    .dropna(subset=['paragraph_id'])
    .reset_index(drop=True)
)

Prepare data

In [None]:
delimiter = "%%"

# Label columns, in the order their flags are written after each sentence.
specified_columns = ['Overall', 'Data', 'LabelData', 'Non-labelData', 'Measurement', 'Temperature', 'Size', 'Mass', 'Water Content', 'Pathogen', 'Firmness', 'Colour', 'Time Constraint']

# paragraph_id -> paragraph text
paragraphs = {}
for _, row in df_paragraphs.iterrows():
    paragraphs[int(row['paragraph_id'])] = row['Statement']

# paragraph_id -> list of (sentence text, label flags as strings), in sheet order
sentences = {}
for _, row in df_sentences.iterrows():
    labels = [str(int(float(row[col]))) for col in specified_columns]
    sentences.setdefault(int(row['paragraph_id']), []).append((row['Statement'], labels))

# The system prompt is identical for every example, so it is built once.
system_prompt = f"""You are an asssitsant tasked with extracting relevant text segments and their labels from the provided food safety paragraph. The entire paragraph is delimited within {delimiter} characters. Use only the provided labels in this exact order: 'Overall', 'Data', 'Label Data', 'Non-label Data', 'Measurement', 'Temperature', 'Size', 'Mass', 'Water Content', 'Pathogen', 'Firmness', 'Colour', 'Time Constraint'.
-Data: any information used to convey knowledge, provide assurance, or perform analysis. This includes 'Label Data' and 'Non-label Data'.-Label Data: a subtype of 'Data' that includes information that a food-product package or container must bear.-Non-label Data: a subtype of 'Data' that includes any food-safety-relevant data other than \
label data that needs to be collected and/or retained for inclusion in documents such as certificates, reports, guarantees, and letters.-Measurement: Association of numbers with physical quantities. This includes measurements of 'Colour', 'Firmness', 'Mass', 'Pathogen', 'Size', 'Temperature', and 'Water Content'.-Colour: a subtype of 'Measurement' \
that is self-evident.-Firmness: a subtype of 'Measurement' that refers to the degree of resistance to deformation.-Mass: a subtype of 'Measurement' that refers to the amount of substance by weight or volume.-Pathogen: a subtype of 'Measurement' that refers to a microorganism that causes disease.-Size: a subtype of 'Measurement' that refers to dimension \
(e.g., length or thickness) or surface area.-Temperature: a subtype of 'Measurement' that is self-evident.-Water Content: a subtype of 'Measurement' that refers to humidity or moisture.- Time Constraint: A temporal restriction, in our context, is expressed using intervals, deadlines or periodicity.-Overall: requirements-related provisions that include all the introduced concepts."""

# One chat example per paragraph: system prompt, delimited paragraph, and the
# expected answer listing every sentence of the paragraph with its labels.
test_data = []
for pid, section in paragraphs.items():
    labelled_sentences = [f"{sentence} ::: {', '.join(labels)}" for sentence, labels in sentences[pid]]
    test_data.append({
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"{delimiter}{section}{delimiter}"},
            {"role": "assistant", "content": ' #### '.join(labelled_sentences)},
        ]
    })

# Save to JSON file
with open('test_data.json', 'w') as out_file:
    json.dump(test_data, out_file, indent=4)

print("JSON data generated and saved to test_data.json")

JSON data generated and saved to test_data.json


In [None]:
# Read the fine-tuning examples from disk into `data`.
with open('fine_tuning_data.json') as json_file:
    data = json.load(json_file)

Number of examples for fine-tuning data

In [None]:
# Dataset size, followed by the messages of the first example (one per line).
print(f"Num examples: {len(data)}")
print("First example:")
first_example = data[0]
for message in first_example["messages"]:
    print(message)

In [None]:
# Count structural problems in the fine-tuning examples, keyed by error type.
format_errors = defaultdict(int)

for ex in data:
    # Every example must be a dict ...
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # ... carrying a non-empty "messages" list.
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        # Both mandatory keys must be present.
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        # Only role / content / name are recognised keys.
        if not all(k in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1

        role = message.get("role", None)
        if role not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        # Content has to be a non-empty string.
        content = message.get("content", None)
        if not (content and isinstance(content, str)):
            format_errors["missing_content"] += 1

    # Each example needs at least one assistant message.
    if all(message.get("role", None) != "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")

In [None]:
!pip install tiktoken

In [None]:
import tiktoken
# Token counting functions
# Shared tiktoken encoding ("cl100k_base"); the token-counting helpers defined
# in this cell read it as a module-level global.
encoding = tiktoken.get_encoding("cl100k_base")

# Approximation only - the result is not exact.
# Simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Estimate the number of tokens consumed by a list of chat messages.

    Args:
        messages: iterable of message dicts; every value of every message is
            encoded with the module-level ``encoding``.
        tokens_per_message: fixed overhead added once per message.
        tokens_per_name: extra overhead added for each ``"name"`` key.

    Returns:
        The estimated token count, including a constant 3 tokens that are
        added once per conversation.
    """
    total = 3  # constant added once per conversation
    for msg in messages:
        total += tokens_per_message
        for field, text in msg.items():
            total += len(encoding.encode(text))
            if field == "name":
                total += tokens_per_name
    return total

def num_assistant_tokens_from_messages(messages):
    """Count the tokens in the content of all assistant messages.

    Args:
        messages: iterable of message dicts with ``"role"`` and ``"content"``.

    Returns:
        Sum of the encoded content lengths of the messages whose role is
        ``"assistant"`` (0 when there are none).
    """
    return sum(
        len(encoding.encode(msg["content"]))
        for msg in messages
        if msg["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics of a numeric sequence.

    Prints min/max, mean/median and the 5th/95th percentiles of ``values``
    under a heading that contains ``name``.

    Args:
        values: non-empty sequence of numbers.
        name: label used in the printed heading.
    """
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The label says p5 / p95, so use the 0.05 / 0.95 quantiles
    # (the previous 0.1 / 0.9 were the 10th / 90th percentiles).
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

In [None]:
# Per-example warnings (missing system/user turns) and token statistics
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in data:
    messages = ex["messages"]
    # A bool adds as 0/1, so each example without the role bumps the counter by one.
    n_missing_system += all(message["role"] != "system" for message in messages)
    n_missing_user += all(message["role"] != "user" for message in messages)
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
for stat_values, stat_name in (
    (n_messages, "num_messages_per_example"),
    (convo_lens, "num_total_tokens_per_example"),
    (assistant_message_lens, "num_assistant_tokens_per_example"),
):
    print_distribution(stat_values, stat_name)

# Examples longer than 4096 tokens are reported as candidates for truncation.
n_too_long = len([length for length in convo_lens if length > 4096])
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Total number of tokens for fine-tuning data with 3 epochs

In [None]:
# Estimate the number of billed training tokens for a 3-epoch target
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_train_examples = len(data)
examples_seen = n_train_examples * TARGET_EPOCHS
if examples_seen < MIN_TARGET_EXAMPLES:
    # Too few examples: raise the epoch count, capped at MAX_DEFAULT_EPOCHS.
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif examples_seen > MAX_TARGET_EXAMPLES:
    # Too many examples: lower the epoch count, floored at MIN_DEFAULT_EPOCHS.
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example contributes at most MAX_TOKENS_PER_EXAMPLE tokens to the bill.
n_billing_tokens_in_dataset = sum(
    length if length < MAX_TOKENS_PER_EXAMPLE else MAX_TOKENS_PER_EXAMPLE
    for length in convo_lens
)
print(
    f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training",
    f"By default, you'll train for {n_epochs} epochs on this dataset",
    f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens",
    "See pricing page to estimate total costs",
    sep="\n",
)


Total number of tokens for fine-tuning data with 2 epochs

In [None]:
# Estimate the number of billed training tokens for a 2-epoch target
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 2
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_train_examples = len(data)
examples_seen = n_train_examples * TARGET_EPOCHS
if examples_seen < MIN_TARGET_EXAMPLES:
    # Too few examples: raise the epoch count, capped at MAX_DEFAULT_EPOCHS.
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif examples_seen > MAX_TARGET_EXAMPLES:
    # Too many examples: lower the epoch count, floored at MIN_DEFAULT_EPOCHS.
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example contributes at most MAX_TOKENS_PER_EXAMPLE tokens to the bill.
n_billing_tokens_in_dataset = sum(
    length if length < MAX_TOKENS_PER_EXAMPLE else MAX_TOKENS_PER_EXAMPLE
    for length in convo_lens
)
print(
    f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training",
    f"By default, you'll train for {n_epochs} epochs on this dataset",
    f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens",
    "See pricing page to estimate total costs",
    sep="\n",
)

Prepare Data

In [None]:
def save_to_jsonl(conversations, file_path):
    """Write conversations to ``file_path`` in JSON Lines format.

    Args:
        conversations: iterable of JSON-serializable objects; each one becomes
            a single line of the output file.
        file_path: destination path; an existing file is overwritten.
    """
    with open(file_path, 'w') as out:
        out.writelines(json.dumps(entry) + '\n' for entry in conversations)

In [None]:
# Read the fine-tuning examples from disk into `sampledata`
with open('fine_tuning_data.json', 'r') as file:
    sampledata = json.loads(file.read())

In [None]:
# Write the loaded examples as JSON Lines (one conversation per line)
save_to_jsonl(sampledata,'fine_tuning.jsonl')

Upload Data

In [None]:
# File name of the JSONL training set (same path the save_to_jsonl call writes to)
training_file_name = 'fine_tuning.jsonl'

In [None]:
# Upload the training file to OpenAI and remember the ID assigned to it.
# The path comes from `training_file_name` rather than a second hard-coded
# copy, and the file is opened in binary mode as the OpenAI file-upload
# examples do, so the bytes are sent unmodified.
with open(training_file_name, "rb") as file:
    response = openai.File.create(
        file=file,
        purpose='fine-tune'
    )
file_id = response['id']
print(f"file uploaded successfully with ID:{file_id}")

Create fine-tuning job

In [None]:
suffix_name = "RE2024-test"

# Start the fine-tuning job on the file uploaded in the upload step.
# `file_id` is the ID returned by that upload; a hard-coded ID would keep
# pointing at an older upload after the notebook is re-run.
response = openai.FineTuningJob.create(
    training_file=file_id,
    model="gpt-3.5-turbo",
    suffix=suffix_name,
    hyperparameters={"n_epochs":3}
)

job_id = response["id"]
print(f"Fine-tuning job created successfully with ID:{job_id}")

In [None]:
# Fetch the job's current state from the API before printing it; without the
# retrieve call this cell only re-prints the response captured at creation time.
response = openai.FineTuningJob.retrieve(job_id)
print(response)

In [None]:
# Fetch up to 50 events of the fine-tuning job identified by `job_id`
response = openai.FineTuningJob.list_events(id=job_id, limit=50)

events = response["data"]
# Reverse in place before printing
# NOTE(review): presumably the API returns newest events first, so this yields chronological order - confirm
events.reverse()

# Print only the human-readable message of each event
for event in events:
    print(event["message"])


Use fine-tuned model

In [None]:
# Open the annotation workbook and list its sheets
df1 = pd.ExcelFile('Data/Annotation_1.xlsx')
sheet_names = df1.sheet_names

# The first sheet is assumed to hold the sentences and the second one the
# paragraphs; missing cells (NaN) are replaced with 0 in both frames.
df_sentences = df1.parse(sheet_names[0]).fillna(0)
df_paragraphs = df1.parse(sheet_names[1]).fillna(0)

In [None]:
def get_completion_from_messages(messages,
                                 model="ft:gpt-3.5-turbo-0613:university-of-ottawa:re2024-test:8CbY4mSJ",
                                 temperature=0.4,
                                 max_tokens=None):
    """Send chat messages to the model and return the text of its reply.

    Args:
        messages: list of chat message dicts passed to the API as-is.
        model: model identifier; defaults to the fine-tuned model.
        temperature: sampling temperature forwarded to the API.
        max_tokens: completion token limit. When ``None`` it is set to 4096
            minus the estimate from ``num_tokens_from_messages(messages)``.

    Returns:
        The content of the first choice's message.
    """
    if max_tokens is not None:
        completion_budget = max_tokens
    else:
        # Give the completion whatever is left of 4096 after the prompt.
        completion_budget = 4096 - num_tokens_from_messages(messages)

    completion = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=completion_budget,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]


In [None]:
# System prompt sent with every request to the fine-tuned model: it lists the
# allowed labels and defines each concept.
# NOTE(review): presumably this must stay identical to the system prompt used when building the fine-tuning data - confirm before editing the text.
system_message=f"""Your task is to extract sentences from the provided food safety paragraph and label them based on the concepts they contain. The paragraph is enclosed within %% characters. Use only the following labels: 'Overall', 'Data', 'Label Data', 'Non-label Data', 'Measurement', 'Temperature', 'Size', 'Mass', 'Water Content', 'Pathogen', 'Firmness', 'Colour', 'Time Constraint'. If a concept is present in a given sentence, label the sentence accordingly. The concepts are defined as follows: -Data: any information used to convey knowledge, provide assurance, or perform analysis. This includes 'Label Data' and 'Non-label Data'. -Label Data: a subtype of 'Data' that includes information that a food-product package or container must bear. -Non-label Data: a subtype of 'Data' that includes any food-safety-relevant data other than label data that needs to be collected and/or retained for inclusion in documents such as certificates, reports, guarantees, and letters. -Measurement: Association of numbers with physical quantities. This includes measurements of 'Colour', 'Firmness', 'Mass', 'Pathogen', 'Size', 'Temperature', and 'Water Content'. -Colour: a subtype of 'Measurement' that is self-evident.-Firmness: a subtype of 'Measurement' that refers to the degree of resistance to deformation. -Mass: a subtype of 'Measurement' that refers to the amount of substance by weight or volume. -Pathogen: a subtype of 'Measurement' that refers to a microorganism that causes disease. -Size: a subtype of 'Measurement' that refers to dimension (e.g., length or thickness) or surface area. -Temperature: a subtype of 'Measurement' that is self-evident.-Water Content: a subtype of 'Measurement' that refers to humidity or moisture. -Time Constraint: A temporal restriction, in our context, is expressed using intervals, deadlines or periodicity. -Overall: requirements-related provisions that include all the introduced concepts."""
# Marker placed around the paragraph text (the prompt above refers to it as %%)
delimiter="%%"

Code for getting responses from the fine-tuned model on the whole test set

In [None]:
def response_to_dataframe(response):
    """Parse a raw model response into a DataFrame of sentences and labels.

    The response is split into records on ``'####'``. Each record is expected
    to look like ``<sentence> ::: <labels>``; only the first ``':::'`` acts as
    the separator. Blank records are skipped. A record without ``':::'`` is
    kept as a sentence with a ``None`` label and reported on stdout.

    Args:
        response: the text returned by the model.

    Returns:
        A DataFrame with the columns ``"sentence"`` and ``"label"``, one row
        per non-blank record. Sentences have surrounding spaces and double
        quotes removed; labels are stripped of surrounding whitespace.
    """
    extracted_data = []
    for record in response.strip().split('####'):  # records are '####'-separated, not line-based
        record = record.strip()
        if not record:  # skip blank records left over from the split
            continue

        # partition() always yields exactly three parts, so the only two
        # possible cases are "separator found" and "separator missing".
        sentence, separator, labels = record.partition(':::')
        if separator:
            labels = labels.strip()
        else:
            print(f"Only sentence found (no label): {record}")  # Log the issue for visibility
            labels = None

        extracted_data.append((sentence.strip(' "'), labels))

    return pd.DataFrame(extracted_data, columns=["sentence", "label"])


In [None]:
# Query the fine-tuned model once per paragraph. Results are collected in
# plain lists and turned into DataFrames once after the loop, instead of
# re-copying a growing DataFrame on every iteration.
response_records = []   # (raw response, paragraph_id) per paragraph
sentence_frames = []    # parsed sentence/label frame per paragraph

# Iterate over the dataframe to get model's responses
for index, row in df_paragraphs.iterrows():
    time.sleep(5)  # pause between API calls
    paragraph = row['Statement']
    paragraph_id = row['paragraph_id']
    # Wrap the paragraph in the shared delimiter rather than a second hard-coded "%%"
    user_message = f"{delimiter}{paragraph}{delimiter}"
    messages = [
        {'role': 'system', 'content': system_message},
        {'role': 'user', 'content': user_message}
    ]
    response = get_completion_from_messages(messages)

    # Keep the raw response together with its paragraph id
    response_records.append((response, paragraph_id))

    # Convert the response to a dataframe and tag it with the paragraph id
    current_df = response_to_dataframe(response)
    current_df["paragraph_id"] = paragraph_id
    sentence_frames.append(current_df)

response_df = pd.DataFrame(response_records, columns=["response", "paragraph_id"])

if sentence_frames:
    final_df = pd.concat(sentence_frames, ignore_index=True)
else:
    # No paragraphs processed: keep the expected (empty) schema
    final_df = pd.DataFrame(columns=["sentence", "label", "paragraph_id"])


In [None]:
# Save the raw model responses; the 'utf-8-sig' codec writes UTF-8 with a leading byte-order mark
response_df.to_csv('response.csv', encoding='utf-8-sig')

In [None]:
# Save the parsed sentence/label rows; the 'utf-8-sig' codec writes UTF-8 with a leading byte-order mark
final_df.to_csv('finaldf.csv', encoding='utf-8-sig')

In [None]:
def get_best_match(sentence, indexed_sentences_list,paragraph_id,threshold,ngram_threshold, ngram_n=3):
    """Find the candidate sentence that best matches an extracted sentence.

    The strategies are tried in this order and the first hit is returned:

    1. ``sentence`` (with surrounding quotes removed) is a substring of a candidate.
    2. ``lowercase_first_letter(sentence)`` is a substring of a candidate.
    3. A candidate is a substring of ``sentence``.
    4. The candidate with the smallest Levenshtein distance, if its similarity
       ratio is strictly greater than ``threshold``.
    5. Substring match after dropping every character that is neither
       alphanumeric nor whitespace.
    6. The candidate with the highest n-gram Jaccard similarity, if it is at
       least ``ngram_threshold``.

    Relies on ``lowercase_first_letter``, ``Levenshtein``, ``ngrams`` and
    ``jaccard_similarity`` being defined elsewhere in the notebook.

    Args:
        sentence: extracted sentence to look up.
        indexed_sentences_list: list of ``(index, sentence)`` candidate tuples.
        paragraph_id: not used by the matching logic.
        threshold: minimum (exclusive) Levenshtein similarity ratio.
        ngram_threshold: minimum (inclusive) n-gram Jaccard similarity.
        ngram_n: n-gram size passed to ``ngrams``.

    Returns:
        ``(index, candidate)`` for the first strategy that matches, otherwise
        ``(None, None)``. Always a 2-tuple, so callers can unpack it.
    """
    # Nothing to match against; the Levenshtein step would otherwise call len(None).
    if not indexed_sentences_list:
        return None, None

    # Strip single and double quotes from the beginning and end of the sentence
    sentence = sentence.strip("'\"")

    # 1. Extracted sentence contained in a candidate
    for idx, s in indexed_sentences_list:
        if sentence in s:
            return idx, s

    # 2. Same check with the first letter lowercased
    lowered_sentence = lowercase_first_letter(sentence)
    for idx, s in indexed_sentences_list:
        if lowered_sentence in s:
            return idx, s

    # 3. Candidate contained in the extracted sentence
    for idx, s in indexed_sentences_list:
        if s in sentence:
            return idx, s

    # 4. Closest candidate by Levenshtein distance
    best_match = None
    best_levenshtein_match_idx = 0
    min_distance = float('inf')  # Initialize to a large value
    for idx, s in indexed_sentences_list:
        distance = Levenshtein.distance(lowered_sentence, s)
        if distance < min_distance:
            min_distance = distance
            best_match = s
            best_levenshtein_match_idx = idx

    # Convert the distance to a similarity ratio
    similarity_ratio = 1 - min_distance / max(len(lowered_sentence), len(best_match))
    if similarity_ratio > threshold:
        return best_levenshtein_match_idx, best_match

    # From here on the ratio is <= threshold. A ratio exactly equal to the
    # threshold used to skip both branches and return a bare None, which
    # breaks tuple unpacking in the caller; it now takes the fallbacks below.

    # 5. Substring match ignoring punctuation
    clean_sentence = ''.join(e for e in sentence if e.isalnum() or e.isspace()).strip()
    for idx, s in indexed_sentences_list:
        clean_s = ''.join(e for e in s if e.isalnum() or e.isspace()).strip()
        if clean_sentence in clean_s:
            return idx, s

    # 6. Best n-gram Jaccard similarity
    best_ngram_similarity = 0
    best_ngram_match = None
    best_ngram_match_idx = 0
    sentence_ngrams = ngrams(sentence, ngram_n)
    for idx, s in indexed_sentences_list:
        similarity = jaccard_similarity(sentence_ngrams, ngrams(s, ngram_n))
        if similarity > best_ngram_similarity:
            best_ngram_similarity = similarity
            best_ngram_match = s
            best_ngram_match_idx = idx

    if best_ngram_similarity >= ngram_threshold:
        return best_ngram_match_idx, best_ngram_match

    return None, None

In [None]:
# Precision, recall and F-score for a single label
def compute_scores(tp, fp, fn):
    """Compute precision, recall and F-score from raw counts.

    Args:
        tp: number of true positives.
        fp: number of false positives.
        fn: number of false negatives.

    Returns:
        A ``(precision, recall, fscore)`` tuple. A metric whose denominator
        is not positive is reported as 0.
    """
    predicted = tp + fp
    actual = tp + fn

    precision = 0
    if predicted > 0:
        precision = tp / predicted

    recall = 0
    if actual > 0:
        recall = tp / actual

    fscore = 0
    if precision + recall > 0:
        fscore = 2 * (precision * recall) / (precision + recall)

    return precision, recall, fscore

In [None]:
labels_list = ["Overall","Data", "LabelData", "Non-labelData", "Measurement", "Colour", "Firmness", "Mass", "Pathogen", "Size", "Temperature", "Water Content", "Time Constraint"]

# Lowercase the label names; they double as column names from here on
labels_list = [label.lower() for label in labels_list]

# Lookup that ignores spaces, so that the spellings requested in the system
# prompt ('Label Data', 'Non-label Data') resolve to the 'labeldata' /
# 'non-labeldata' columns instead of being reported as unrecognized.
label_lookup = {label.replace(" ", ""): label for label in labels_list}

# Initialize all label columns with 0
for label in labels_list:
    final_df[label] = 0

# Set the matching label column to 1 for every label named in the 'label' column
for index, row in final_df.iterrows():

    # Rows without a textual label (e.g. None) keep all zeros
    if not isinstance(row['label'], str):
        continue

    for raw_label in row['label'].split(','):
        label = raw_label.strip().lower()  # Stripping whitespace and converting to lowercase
        column = label_lookup.get(label.replace(" ", ""))
        if column is not None:
            final_df.at[index, column] = 1
        else:
            print(f"Unrecognized label: {label} in row {index}")
            print(row['sentence'])
            print("+++++++")

In [None]:
# Lowercased subtype names; they are used as column names of final_df
measurement_subtypes = [
    name.lower()
    for name in ("Colour", "Firmness", "Mass", "Size", "Temperature", "Water Content", "Pathogen")
]
data_subtypes = [name.lower() for name in ("LabelData", "Non-labelData")]

# Row masks, all computed from the frame as it is before any update below
has_any_label = final_df[labels_list].eq(1).any(axis=1)
has_measurement_subtype = final_df[measurement_subtypes].eq(1).any(axis=1)
has_data_subtype = final_df[data_subtypes].eq(1).any(axis=1)

# 'overall' is implied by any label, 'measurement' and 'data' by any of their subtypes
final_df.loc[has_any_label, 'overall'] = 1
final_df.loc[has_measurement_subtype, 'measurement'] = 1
final_df.loc[has_data_subtype, 'data'] = 1


In [None]:
# Per-label confusion counters
counters = {label: dict(TP=0, FP=0, FN=0) for label in labels_list}

# Ground-truth columns are addressed by lowercase name from here on
df_sentences.columns = list(map(str.lower, df_sentences.columns))

# Ground-truth sentences not matched yet, keyed by row index
unmatched_gt_sentences = dict(df_sentences['statement'].items())
# Extracted sentences grouped by the ground-truth sentence they matched
matched_extracted_sentences = defaultdict(list)

# For every ground-truth sentence, the labels that are annotated with 1
# (i.e. the labels that still have to be found for it)
missing_labels_dict = {}
for gt_index, gt_statement in df_sentences['statement'].items():
    missing_labels_dict[(gt_index, gt_statement)] = [
        label for label in labels_list if df_sentences.loc[gt_index, label] == 1
    ]

# Sanity check: the per-label totals must agree with the column sums
for label in labels_list:
    init_count = sum(label in labels for labels in missing_labels_dict.values())
    assert init_count == df_sentences[label].sum(), f"Mismatch for {label} during initialization"

# Match every extracted sentence against the ground truth of its paragraph and
# update the per-label TP / FP counters.
for index, row in final_df.iterrows():
    extracted_sentence = row['sentence']
    paragraph_id = row['paragraph_id']

    # Filter the ground truth dataframe by the current paragraph_id
    filtered_gt_df = df_sentences[df_sentences['paragraph_id'] == paragraph_id]

    # Prepare a list of tuples (index, statement)
    indexed_statements = list(filtered_gt_df[['statement']].itertuples(index=True, name=None))

    # Find the best matching ground truth sentence and its index
    matched_index, matched_sentence = get_best_match(extracted_sentence, indexed_statements, paragraph_id,threshold=0.90,ngram_threshold=0.90)

    # Explicit None checks: index 0 is a valid match but is falsy
    if matched_index is not None and matched_sentence is not None:

        matched_extracted_sentences[(matched_index, matched_sentence)].append(extracted_sentence)
        # Remove the matched sentence from unmatched_gt_sentences
        unmatched_gt_sentences.pop(matched_index, None)

        # Look the ground-truth row up by the matched index, i.e. the same key
        # that missing_labels_dict uses. Looking it up by statement text picks
        # the first of several identical statements in a paragraph.
        ground_truth_row = filtered_gt_df.loc[matched_index]

        for label in labels_list:
            # Extracted is 1, Ground Truth is 1: True Positive
            if row[label] == 1 and ground_truth_row[label] == 1:
                # Count each ground-truth label only once, even if several
                # extracted sentences match the same ground-truth sentence
                if label in missing_labels_dict[(matched_index, matched_sentence)]:
                    counters[label]["TP"] += 1
                    missing_labels_dict[(matched_index, matched_sentence)].remove(label)  # no longer missing

            # Extracted is 1, Ground Truth is 0: False Positive
            elif row[label] == 1 and ground_truth_row[label] == 0:
                counters[label]["FP"] += 1
    else:
        # No ground-truth counterpart: every predicted label is a false positive
        for label in labels_list:
            if row[label] == 1:
                counters[label]["FP"] += 1

# Every label still listed as missing for a ground-truth sentence is a false negative
for missing_labels in missing_labels_dict.values():
    for label in missing_labels:
        counters[label]["FN"] += 1


# Turn the raw counts into precision / recall / F-score per label
scores = {}
for label, counts in counters.items():
    precision, recall, fscore = compute_scores(counts["TP"], counts["FP"], counts["FN"])
    scores[label] = dict(zip(("Precision", "Recall", "F-score"), (precision, recall, fscore)))

In [None]:
# Append the raw counters and the derived scores to the results log,
# each followed by a blank line
with open('output.txt', 'a') as f:
    f.write(f"{counters}\n\n{scores}\n\n")