# 3. Data Preparation:

This phase covers all activities to construct the final dataset (data that will be fed into the modeling tools) from the initial raw data. Tasks usually include table, record, and attribute selection as well as transformation and cleaning of data for modeling tools.

Text Cleaning and Normalization

1. Remove special characters and numbers: We'll use regex to remove any non-alphabetic characters, keeping only letters and spaces.
2. Convert to lowercase: This step ensures consistency across all text entries.
3. Remove accents: Spanish text often contains accented characters, which we'll normalize to their non-accented equivalents.
4. Remove extra whitespaces: Trim leading and trailing spaces and replace multiple spaces with a single space.

!python -m spacy download es_core_news_sm

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Example: Word frequency analysis for insights into common terms in "motivos"
from collections import Counter
from wordcloud import WordCloud

from collections import Counter
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import re
import unicodedata
import spacy
from collections import defaultdict
import pandas as pd 
import ast

## 3.1 Null or NaN fixing

In [2]:
# Load the dataset
df = pd.read_csv('../files/intent.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6679 entries, 0 to 6678
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   motivos  6679 non-null   object 
 1   crec     6679 non-null   int64  
 2   cred     6679 non-null   int64  
 3   equ      6679 non-null   int64  
 4   inic     6679 non-null   int64  
 5   inv      6679 non-null   int64  
 6   mkt      6679 non-null   int64  
 7   no       6679 non-null   int64  
 8   renta    6679 non-null   int64  
 9   sueldo   6679 non-null   int64  
 10  temp     6673 non-null   float64
dtypes: float64(1), int64(9), object(1)
memory usage: 574.1+ KB


In [4]:
# As we saw in the data analysis phase, the only value that doesn't have any category and is a nan is the row 1130, so lets see what are the "motivos" here
df.loc[1130]['motivos']

'Capital de trabajo\n,0,0,0,0,0,0,0,1,0,0\nRe invertir en materiales de manufacturacion,0,0,0,0,1,0,0,0,0,0\nPARA COMPRAR MAS PRODUCTOS  ASI COMO REFRIGERADORES.,0,0,1,0,1,0,0,0,0,0\nPARA COMPRA DE EQUIPO DE COMPUTO Y CONSUMIBLES,0,0,1,0,0,0,0,0,0,0\nInversión en desarrollo departamentos y compra de herramientas para la empresa,0,0,1,0,0,0,0,0,0,0\nRealizar ampliación de mi negocio independiente,1,0,0,0,0,0,0,0,0,0\nPara materias primas herramientas insumos y ampliación,0,0,0,0,1,0,0,0,0,0\nIncrementar mi negocio comprando suministros y manejar mas créditos,0,0,0,0,1,0,0,0,0,0\nInvertir en equipo de computo y tecnologias de la información,0,0,1,0,0,0,0,0,0,0\nInversion en productos y pago de facturas,0,1,0,0,0,0,0,0,0,0\nRe invertir en materiales de manufacturacion,0,0,1,0,0,0,0,0,0,0\nPAGAR CREDITOS CAROS E IMPUESTOS PENDIENTES,0,1,0,0,0,0,0,0,0,0\nExpancion y remodelacion de mobiliario del negocio,0,0,0,0,0,0,0,1,0,0\nCompra y diversificación de productos.El negocio es virtual y con 

In [5]:
# We see that we have rows inside ths motivo, probably due to a bad query or something went wrong. We need to fix it.

# The problematic row
motivo_text = df.loc[1130]['motivos']

# Split the text into lines
lines = motivo_text.strip().split('\n')

# Define the target labels
target_columns = ['crec', 'cred', 'equ', 'inic', 'inv', 'mkt', 'no', 'renta', 'sueldo', 'temp']

# Prepare a list to store parsed data
data = []

# Parse each line, skipping the first motive and attaching it to the second row
first_motivo_skipped = False
for line in lines:
    parts = line.split(',')
    motivo = parts[0].strip()  # Extract the motivo text
    values = list(map(int, parts[1:]))  # Convert the values to integers
    data.append([motivo] + values)  # Combine motivo and values into a single list

# Move the first motivo to the second row
if len(data) > 1:
    data[1][0] = data[0][0]  # Copy motivo from the first row to the second row
    data.pop(0)  # Remove the first row

# Create a new DataFrame with the parsed data
parsed_df = pd.DataFrame(data, columns=['motivos'] + target_columns)


# Display the resulting DataFrame
display(parsed_df)

Unnamed: 0,motivos,crec,cred,equ,inic,inv,mkt,no,renta,sueldo,temp
0,Capital de trabajo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,Re invertir en materiales de manufacturacion,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,PARA COMPRAR MAS PRODUCTOS ASI COMO REFRIGERA...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,PARA COMPRA DE EQUIPO DE COMPUTO Y CONSUMIBLES,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Inversión en desarrollo departamentos y compra...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Realizar ampliación de mi negocio independiente,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Para materias primas herramientas insumos y am...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,Incrementar mi negocio comprando suministros y...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,Invertir en equipo de computo y tecnologias de...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Inversion en productos y pago de facturas,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Delete the last row 
parsed_df = parsed_df.iloc[:-1]

# Reset the index
parsed_df.reset_index(drop=True, inplace=True)

It appears that the descriptions of the "motivos" fit the assigned labels. Now, we must concatenate the dataframe with the main one.

In [7]:
df = pd.concat([df, parsed_df], axis=0)

Now we lets fill with cero the rest of the nans, since they already have a category.

In [8]:
# Fill NaN values with 0 d
df = df.fillna(0)

## 3.2 Data type transformation.

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6701 entries, 0 to 21
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   motivos  6701 non-null   object 
 1   crec     6701 non-null   float64
 2   cred     6701 non-null   float64
 3   equ      6701 non-null   float64
 4   inic     6701 non-null   float64
 5   inv      6701 non-null   float64
 6   mkt      6701 non-null   float64
 7   no       6701 non-null   float64
 8   renta    6701 non-null   float64
 9   sueldo   6701 non-null   float64
 10  temp     6701 non-null   float64
dtypes: float64(10), object(1)
memory usage: 628.2+ KB


We see that originaly all the categories where int, and now they are float, which is wrong. Lets convert them again into int.

In [10]:
# Convert the target columns to int type
df[target_columns] = df[target_columns].astype(int)

# Display the DataFrame to confirm the changes
print(df[target_columns].dtypes)

crec      int64
cred      int64
equ       int64
inic      int64
inv       int64
mkt       int64
no        int64
renta     int64
sueldo    int64
temp      int64
dtype: object


## 3.3 Text Normalization

In [11]:
# Text cleaning function
def clean_text(text):
    text = re.sub(r'[^a-zA-ZáéíóúÁÉÍÓÚñÑ\s]', '', text)
    text = text.lower()
    text = ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')
    text = ' '.join(text.split())
    return text

# Apply text cleaning
df['cleaned_text'] = df['motivos'].apply(clean_text)

Text Preprocessing

1. Tokenization: Split the text into individual words or tokens.
2. Remove stopwords: Eliminate common Spanish words that don't carry significant meaning for our classification task.
3. Lemmatization: Reduce words to their base or dictionary form. This is particularly important for Spanish, which has rich verb conjugations and noun/adjective agreements.

In [12]:
# Load Spanish language model for spaCy
nlp = spacy.load('es_core_news_sm')

# Text preprocessing function
def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)

# Apply text preprocessing
df['preprocessed_text'] = df['cleaned_text'].apply(preprocess_text)

In [13]:
df.head()

Unnamed: 0,motivos,crec,cred,equ,inic,inv,mkt,no,renta,sueldo,temp,cleaned_text,preprocessed_text
0,Crear un departamento de ventas e inversión a ...,0,0,0,0,0,1,0,0,0,0,crear un departamento de ventas e inversion a ...,crear departamento venta inversion publicidad
1,establecerme en un local y agregar materia pri...,0,0,0,0,1,0,0,1,0,0,establecerme en un local y agregar materia pri...,establecerme local agregar materia prima stock
2,Compra de equipo e incrementar inventario,0,0,1,0,1,0,0,0,0,0,compra de equipo e incrementar inventario,compra equipo incrementar inventario
3,Invertir en crecimiento de flotilla de unidade...,0,0,1,0,0,0,0,0,0,0,invertir en crecimiento de flotilla de unidade...,invertir crecimiento flotilla unidad carga seg...
4,Para comprar mercancía y comprar lonas nuevas,0,0,0,0,1,0,0,0,0,0,para comprar mercancia y comprar lonas nuevas,comprar mercancia comprar lona


## 3.4 Chat GPT Json Preparation

In [14]:
import json
import pandas as pd
import json
from sklearn.model_selection import train_test_split


# Prepare the dataset for chat model format
def prepare_data(df):
    prepared_data = []
    for _, row in df.iterrows():
        entry = {
            'messages': [
                {'role': 'system', 'content': 'This model processes financial assistance requests.'},
                {'role': 'user', 'content': row['cleaned_text']},
                {'role': 'assistant', 'content': ','.join(map(str, row[1:11].values))}
            ]
        }
        prepared_data.append(entry)
    return prepared_data

# Prepare the dataset
prepared_dataset = prepare_data(df)

In [15]:
prepared_dataset

[{'messages': [{'role': 'system',
    'content': 'This model processes financial assistance requests.'},
   {'role': 'user',
    'content': 'crear un departamento de ventas e inversion a publicidad'},
   {'role': 'assistant', 'content': '0,0,0,0,0,1,0,0,0,0'}]},
 {'messages': [{'role': 'system',
    'content': 'This model processes financial assistance requests.'},
   {'role': 'user',
    'content': 'establecerme en un local y agregar materia prima en stock'},
   {'role': 'assistant', 'content': '0,0,0,0,1,0,0,1,0,0'}]},
 {'messages': [{'role': 'system',
    'content': 'This model processes financial assistance requests.'},
   {'role': 'user', 'content': 'compra de equipo e incrementar inventario'},
   {'role': 'assistant', 'content': '0,0,1,0,1,0,0,0,0,0'}]},
 {'messages': [{'role': 'system',
    'content': 'This model processes financial assistance requests.'},
   {'role': 'user',
    'content': 'invertir en crecimiento de flotilla de unidades de carga para seguir cumpliendo con los 

In [16]:
# Split the prepared dataset into training and validation datasets
train_data, valid_data = train_test_split(prepared_dataset, test_size=0.2, random_state=42)

In [17]:
# Save the training set to a JSONL file
with open('train_data.jsonl', 'w') as train_file:
    for entry in train_data:
        json.dump(entry, train_file)
        train_file.write('\n')

# Save the validation set to a JSONL file
with open('valid_data.jsonl', 'w') as valid_file:
    for entry in valid_data:
        json.dump(entry, valid_file)
        valid_file.write('\n')

# 4. Fine-Tune GPT-3.5:

In [18]:
import os
from dotenv import load_dotenv
from openai import OpenAI


In [19]:
llm_model = "gpt-3.5-turbo"

In [20]:
client = OpenAI()

def upload_dataset_to_openai():
	validation_file = client.files.create(
		file=open("valid_data.jsonl", "rb"),
		purpose="fine-tune"
	)
	training_file = client.files.create(
		file=open("train_data.jsonl", "rb"),
		purpose="fine-tune",
	)
	return training_file, validation_file

def train_openai_model(training_file, validation_file):
	results = client.fine_tuning.jobs.create(
		training_file=training_file.id,
		validation_file=validation_file.id,
		model="gpt-3.5-turbo"
	)

# Already fine tuned.
#training_file, validation_file = upload_dataset_to_openai()
#train_openai_model(training_file, validation_file)

In [25]:
valid_data

[{'messages': [{'role': 'system',
    'content': 'This model processes financial assistance requests.'},
   {'role': 'user',
    'content': 'surtir mi negocio para que tenga de todo y poder ampliarla'},
   {'role': 'assistant', 'content': '0,0,0,0,1,0,0,0,0,0'}]},
 {'messages': [{'role': 'system',
    'content': 'This model processes financial assistance requests.'},
   {'role': 'user',
    'content': 'para capitalizarlo y llegar a mas lugares ya que solo vendo en la cd de tehuacan y necesito invertir mas'},
   {'role': 'assistant', 'content': '1,0,0,0,0,0,0,0,0,0'}]},
 {'messages': [{'role': 'system',
    'content': 'This model processes financial assistance requests.'},
   {'role': 'user', 'content': 'analisis medicos de fertilidad'},
   {'role': 'assistant', 'content': '0,0,0,0,0,0,1,0,0,0'}]},
 {'messages': [{'role': 'system',
    'content': 'This model processes financial assistance requests.'},
   {'role': 'user',
    'content': 'para poder tener un fondo con que pagar proveedore

In [59]:
def call_openai(system_prompt,prompt, model=llm_model):
	chat_completion = client.chat.completions.create(
		messages=[
			{
				"role": "system",
				"content": system_prompt
			},
			{
				"role": "user",
				"content": prompt,
			}
		],
		model=model,
	)
	return chat_completion.choices[0].message.content


def evaluate_dataset(model, validation_dataset):
    results = []
    real_data=[]
    for row in validation_dataset:
        target_columns = ['crec', 'cred', 'equ', 'inic', 'inv', 'mkt', 'no', 'renta', 'sueldo', 'temp']
        
        # Define the system prompt for the task
        system_prompt = "This model processes financial assistance requests and classifies them into categories."

        # Create a user prompt based on the 'motivos' column
        prompt = row['messages'][1]['content']

        try:
            # Call OpenAI API to get the classification
            result =call_openai(system_prompt, prompt, model)
            result_list = [int(label) for label in result.split(',')]

            result_predicted = [target_columns[i] for i in range(len(target_columns)) if result_list[i] == 1]

            real_result = row['messages'][2]['content']
            real_result_list = [int(label) for label in real_result.split(',')]          
            real_result_label = [target_columns[i] for i in range(len(target_columns)) if real_result_list[i] == 1]


            # Post-process the result if necessary (e.g., convert to binary labels)
            # Assuming result is a comma-separated string of labels
            
            result_labels = [int(label) for label in result.split(',')]
            results.append(result_labels)
            real_data.append(real_result)
            print("----------")
            print(f'User prompt: {prompt}\nPredicted category:{result_predicted}\nReal result category:{real_result_label}')
        except Exception as e:
             print(f"Error en el valor retornado por el modelo {result} saltando el prompt {prompt}")

    # Create a DataFrame for validation results and calculate accuracy compared to actual queries
    validation_df = pd.DataFrame()
    validation_df['predicted_labels'] = results

    # Compare predicted labels with the actual labels (assuming actual labels are in columns 1 to 10)
    validation_df['actual_labels'] = real_data

    # Calculate accuracy
    accuracy = (validation_df['predicted_labels'] == validation_df['actual_labels']).mean()

    # Save results to a CSV file
    validation_df.to_csv(f"{model}_validation_accuracy_results.csv", index=False)
    print(f"Accuracy: {accuracy}")

    return accuracy

evaluate_dataset(model="ft:gpt-3.5-turbo-0125:suprabound::AZ9ij1gt",validation_dataset=valid_data)

----------
User prompt: surtir mi negocio para que tenga de todo y poder ampliarla
Predicted category:['inv']
Real result category:['inv']
----------
User prompt: para capitalizarlo y llegar a mas lugares ya que solo vendo en la cd de tehuacan y necesito invertir mas
Predicted category:['crec']
Real result category:['crec']
----------
User prompt: analisis medicos de fertilidad
Predicted category:['no']
Real result category:['no']
----------
User prompt: para poder tener un fondo con que pagar proveedores y deudas cuando mis clientes no paguen a tiempo
Predicted category:['cred']
Real result category:['inv']
----------
User prompt: lo utilizaremos para capital de trabajo e innovacion en tecnologia
Predicted category:['crec']
Real result category:['crec']
----------
User prompt: hacer crecer mi marca expandirme y mejorarla
Predicted category:['crec']
Real result category:['crec']
----------
User prompt: me dedico a la venta de productos de limpieza y le vendo a la unam le doy credito y 

np.float64(0.0)

In [85]:
# Read the validation dataframe
validation_df = pd.read_csv("ft:gpt-3.5-turbo-0125:suprabound::AZ9ij1gt_validation_accuracy_results.csv")

In [86]:
# Transform the different formats to standarize them
# Function to convert string to list of integers
def convert_string_to_list(s):
    return list(map(int, s.split(',')))

# Apply the conversion to the 'actual_labels' column
validation_df['actual_labels'] = validation_df['actual_labels'].apply(convert_string_to_list)

# Function to convert string representation of list to actual list 
def convert_string_list_to_list(s): 
    return ast.literal_eval(s)

# Apply the conversion to the 'predicted_labels' column 
validation_df['predicted_labels'] = validation_df['predicted_labels'].apply(convert_string_list_to_list)

In [87]:
# Perform element-wise multiplication and create the new column 
validation_df['multiplied_labels'] = validation_df.apply(lambda row: [a * b for a, b in zip(row['predicted_labels'], row['actual_labels'])], axis=1)

# Create the fourth column indicating if there is a single 1 in the multiplied_labels vector 
validation_df['contains_single_1'] = validation_df['multiplied_labels'].apply(lambda x: 1 in x)

We have two ways to measure the model performance:

- 1. By comparing the exact respones of the model and the real data. 
- 2. By comparing the model output with the real data but considering some of the partial coincidences, when there is more than one label.

In [88]:
# Comparing directly the outputs, we see:
accuracy = (validation_df['predicted_labels'] == validation_df['actual_labels']).mean()
accuracy

np.float64(0.6239700374531835)

In [None]:
# Contemplating the model's partial predictions.
validation_df['contains_single_1'].mean()

np.float64(0.7617977528089888)

We see that the second measure is an accuracy of 76.18%, a great improvement from the 62.4% without this flexibilty. This means that though it its working fine, and the model is predicting partially good those cases with multiple labels, it need an adjustment for those cases since they are minority in the data set, 20% and this is why the trained model tend to do a single one classification rather tahn a multiple classification.

# 5. Second approach (using label names instead of vectors)





When considering how to format the output for a multilabel classification task using a model like GPT-3.5 Turbo, you have two main options: outputting a vector of binary labels (0s and 1s) or outputting the names of the labels directly. Each approach has its advantages and potential drawbacks.

### **Comparison of Output Formats**

| **Output Format** | **Advantages**                                                                 | **Disadvantages**                                                             |
|-------------------|--------------------------------------------------------------------------------|------------------------------------------------------------------------------|
| **Binary Vector** | - Simple and structured format.                                                | - Requires the model to learn exact positions for each label.                |
|                   | - Directly maps to the typical multilabel classification setup.                | - May be less interpretable without context (i.e., what each position means).|
| **Label Names**   | - More interpretable as it provides explicit label names.                      | - Requires additional logic to handle multiple labels (e.g., concatenation). |
|                   | - Easier to understand and debug outputs.                                      | - Potentially more complex prompt engineering to ensure correct label naming.|

### **Recommendation**

For a task involving GPT-3.5 Turbo, using label names directly might be more advantageous because:

1. **Interpretability:** Outputting label names makes it easier to understand and verify the model's predictions, especially when multiple labels can apply.
2. **Prompt Flexibility:** You can craft prompts that guide the model to list applicable labels, which might align better with natural language processing capabilities.
3. **Debugging and Validation:** It's easier to debug and validate outputs when they are human-readable and directly correspond to known categories.

So, based on this, we are going to try the second approach, using Label Names.

In [45]:
result = [x for x in ['hola', 'que', 'tal'] if x in [0,1,0]==1]
result

[]

In [47]:
words = ['hola', 'que', 'tal']
binary_vector = [0, 1,1]

# Use index to check corresponding value in binary_vector
result = [words[i] for i in range(len(words)) if binary_vector[i] == 1]

print(result)


['que', 'tal']


In [49]:
def prepare_data_with_labels(df):
    label_names = ['crec', 'cred', 'equ', 'inic', 'inv', 'mkt', 'no', 'renta', 'sueldo', 'temp']
    prepared_data = []
    for _, row in df.iterrows():
        active_labels = [label_names[i] for i, val in enumerate(row[1:11]) if val == 1]
        entry = {
            'messages': [
                {'role': 'system', 'content': 'This model processes financial loan requests.'},
                {'role': 'user', 'content': row['motivos']},
                {'role': 'assistant', 'content': ', '.join(active_labels)}
            ]
        }
        prepared_data.append(entry)
    return prepared_data

# Prepare the dataset
prepared_label_dataset = prepare_data_with_labels(df)

In [50]:
prepared_label_dataset

[{'messages': [{'role': 'system',
    'content': 'This model processes financial loan requests.'},
   {'role': 'user',
    'content': 'Crear un departamento de ventas e inversión a publicidad'},
   {'role': 'assistant', 'content': 'mkt'}]},
 {'messages': [{'role': 'system',
    'content': 'This model processes financial loan requests.'},
   {'role': 'user',
    'content': 'establecerme en un local y agregar materia prima en stock.'},
   {'role': 'assistant', 'content': 'inv, renta'}]},
 {'messages': [{'role': 'system',
    'content': 'This model processes financial loan requests.'},
   {'role': 'user', 'content': 'Compra de equipo e incrementar inventario'},
   {'role': 'assistant', 'content': 'equ, inv'}]},
 {'messages': [{'role': 'system',
    'content': 'This model processes financial loan requests.'},
   {'role': 'user',
    'content': 'Invertir en crecimiento de flotilla de unidades de carga para seguir cumpliendo con los requerimientos de nuestros clientes  a quien damos nuestros

In [None]:
# Split the prepared dataset into training and validation datasets
train_data, valid_data = train_test_split(prepared_dataset, test_size=0.2, random_state=42)

In [None]:
# Save the training set to a JSONL file
with open('train_data.jsonl', 'w') as train_file:
    for entry in train_data:
        json.dump(entry, train_file)
        train_file.write('\n')

# Save the validation set to a JSONL file
with open('valid_data.jsonl', 'w') as valid_file:
    for entry in valid_data:
        json.dump(entry, valid_file)
        valid_file.write('\n')

training_file, validation_file = upload_dataset_to_openai()
train_openai_model(training_file, validation_file)

In [None]:

def evaluate_dataset(model, validation_dataset):
    results = []
    label_names = ['crec', 'cred', 'equ', 'inic', 'inv', 'mkt', 'no', 'renta', 'sueldo', 'temp']

    for row in validation_dataset:
        # Define the system prompt for the task
        system_prompt = "This model processes financial loan requests and classifies them into categories."

        # Create a user prompt based on the 'motivos' column
        prompt = row['motivos']

        # Call OpenAI API to get the classification
        result = call_openai(system_prompt, prompt, model)
        result_predicted = [label_names[i] for i in range(len(label_names)) if result[i] == 1]

        

        # Split the resulting labels assuming they are returned as a comma-separated string
        predicted_label_names = [label.strip() for label in result.split(',') if label.strip() in label_names]

        # Convert predicted label names to binary vector
        predicted_vector = [1 if name in predicted_label_names else 0 for name in label_names]
        results.append(predicted_vector)

    # Create a DataFrame for validation results and calculate accuracy compared to actual queries
    validation_df = pd.DataFrame(validation_dataset)
    validation_df['predicted_labels'] = results

    # Generate actual binary vectors for comparison
    actual_vectors = validation_df.iloc[:, 1:11].values.tolist()
    validation_df['actual_labels'] = actual_vectors

    # Calculate accuracy: measure the number of exact matches
    accuracy = sum([pred == actual for pred, actual in zip(results, actual_vectors)]) / len(results)

    # Save results to a CSV file
    validation_df.to_csv(f"{model}_validation_accuracy_results.csv", index=False)
    print(f"Accuracy: {accuracy}")

    return accuracy

# Example usage:
# evaluate_dataset(model="ft:gpt-3.5-turbo-0125:suprabound::AOBfVT8c", validation_dataset=validation_file)