# Notebook to split datasets into proper files and find imbalances in the data

## Imports

In [1]:
import os

from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()


import pandas as pd
from sklearn.model_selection import train_test_split

from utils import parse_ratings

import ast

  from .autonotebook import tqdm as notebook_tqdm


## Prompt dataset for RL training and final evaluation

In [2]:
# Locations come from environment variables (the imports cell calls load_dotenv()).
RL_DATA_PATH = os.environ.get("RL_DATA_PATH")
PROMPT_DATASET = os.environ.get("PROMPT_DATASET_CSV")

# Read the semicolon-separated prompt dataset into a DataFrame.
df = pd.read_csv(PROMPT_DATASET, sep=';')

## Random split

In [3]:
# Show the column labels of the loaded prompt dataset.
print(df.columns)

Index(['prompt', 'precondition_texts', 'precondition_positions'], dtype='object')


In [4]:
# Split the prompt dataset into train/validation/test. Short and long answers
# are split separately so that the rare long answers appear in every split.

# A row is a "long answer" when its parsed `precondition_texts` literal has
# more than this many entries. The split and the sanity-check counts below
# share this single rule; the counts used to test `>= 10` (and dicts only),
# which disagreed with the `<= 10` / `> 10` split for rows with exactly 10 entries.
LONG_ANSWER_THRESHOLD = 10


def _is_long_answer(frame):
    """Flag the long-answer rows of `frame`.

    Args:
        frame: DataFrame with a `precondition_texts` column whose cells are
            Python literals stored as text.

    Returns:
        Boolean Series aligned with `frame`, True where the parsed literal
        has more than LONG_ANSWER_THRESHOLD entries.
    """
    # Parse each cell once instead of re-evaluating it for every check.
    n_entries = frame['precondition_texts'].apply(lambda x: len(ast.literal_eval(x)))
    return n_entries > LONG_ANSWER_THRESHOLD


# separate into df with short answers and long answers
is_long = _is_long_answer(df)
short_df = df[~is_long]
long_df = df[is_long]


# Short answers: 75% train, the remaining 25% is halved into validation/test.
train_df_short, temp_df_short = train_test_split(short_df, test_size=0.25, random_state=42)
# Long answers: 50% train, the remaining 50% is halved into validation/test.
train_df_long, temp_df_long = train_test_split(long_df, test_size=0.5, random_state=42)

val_df_short, test_df_short = train_test_split(temp_df_short, test_size=0.5, random_state=42)
val_df_long, test_df_long = train_test_split(temp_df_long, test_size=0.5, random_state=42)



# Recombine the short and long parts of every split (fresh 0..n-1 index).
train_df = pd.concat([train_df_short, train_df_long], ignore_index=True)
val_df = pd.concat([val_df_short, val_df_long], ignore_index=True)
test_df = pd.concat([test_df_short, test_df_long], ignore_index=True)

#TODO: change this back if using separate split

# train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
# val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)



# Check the sizes
print(f"Train: {len(train_df)}")
print(f"Validation: {len(val_df)}")
print(f"Test: {len(test_df)}")

# Number of long answers per split, using the same rule as the split itself.
count_train = _is_long_answer(train_df).sum()
count_val = _is_long_answer(val_df).sum()
count_test = _is_long_answer(test_df).sum()


print(f"The lomg amswer counts for train, val, test are: {count_train, count_val, count_test}")

# Save to CSV files without the index column
train_df.to_csv(RL_DATA_PATH + "/train_random.csv", index=False, sep=';')
val_df.to_csv(RL_DATA_PATH + "/validation_random.csv", index=False, sep=';')
test_df.to_csv(RL_DATA_PATH + "/test_random.csv", index=False, sep=';')

Train: 18
Validation: 4
Test: 4
The lomg amswer counts for train, val, test are: (np.int64(2), np.int64(1), np.int64(1))


## Dedicated separate split

In [5]:
# Use the "rijksbegroting" facts as validation + test, since it is made of 9 facts,
# hence 9 different prompts out of 25 --> about 30%
phrase1 = "--- Aantal Subfacts ---"
phrase2 = "Comptabiliteitswet 2016"

# A prompt is held out only if it contains both phrases (case-insensitive;
# missing prompts never match). Note: str.contains treats the phrase as a
# regular expression by default.
has_phrase1 = df['prompt'].str.contains(phrase1, case=False, na=False)
has_phrase2 = df['prompt'].str.contains(phrase2, case=False, na=False)
mask = has_phrase1 & has_phrase2

train_df_det = df.loc[~mask]
val_test_df_det = df.loc[mask]

# Halve the held-out prompts into validation and test.
val_df_det, test_df_det = train_test_split(
    val_test_df_det, test_size=0.5, random_state=42
)

# Report the size of every split.
for split_label, split_frame in (
    ("Train", train_df_det),
    ("Validation", val_df_det),
    ("Test", test_df_det),
):
    print(f"{split_label}: {len(split_frame)}")

# Write each split as a semicolon-separated CSV without the index column.
for file_suffix, split_frame in (
    ("/train_determined.csv", train_df_det),
    ("/validation_determined.csv", val_df_det),
    ("/test_determined.csv", test_df_det),
):
    split_frame.to_csv(RL_DATA_PATH + file_suffix, index=False, sep=';')


Train: 17
Validation: 4
Test: 5


## Feedback dataset for Reward model finetuning

In [6]:
# Directory that receives the reward-model split files.
REWARD_DATA_PATH = os.environ.get("REWARD_DATA_PATH")

# Input CSVs with feedback for the reward model; each path is read from an
# environment variable of the same name (None when the variable is unset).
FILE_1 = os.environ.get("FILE_1")
FILE_5 = os.environ.get("FILE_5")
FILE_7 = os.environ.get("FILE_7")
FILE_9 = os.environ.get("FILE_9")
FILE_10_1 = os.environ.get("FILE_10_1")
FILE_10_2 = os.environ.get("FILE_10_2")
FILE_SYNTH = os.environ.get("FILE_SYNTH")

In [7]:
# Read every semicolon-separated feedback file, in the same order as before.
df_1, df_5, df_7, df_9, df_10_1, df_10_2, df_synth = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (FILE_1, FILE_5, FILE_7, FILE_9, FILE_10_1, FILE_10_2, FILE_SYNTH)
)

# Stack all files except the synthetic one into a single frame with a fresh index.
human_parts = [df_1, df_5, df_7, df_9, df_10_1, df_10_2]
df_human = pd.concat(human_parts, ignore_index=True)

#### Restructure the synthetic DataFrame to fit the training loop

In [8]:
print("Synthetic feedback shape:", df_synth.shape)


# The synthetic feedback CSV was stored without a header row, so the default
# load consumed its first record as column labels. Recovering that record from
# the inferred labels is lossy (pandas renames duplicate labels with a ".N"
# suffix and turns empty cells into "Unnamed: N") and would add another bogus
# row every time this cell is re-run. Re-reading the file with header=None and
# explicit names keeps the first record intact and makes the cell idempotent.
SYNTH_COLUMNS = ['file',
                 'frame_ID',
                 'frame_type',
                 'frame_text',
                 'precondition_id',
                 'precondition_text',
                 'precondition_position',
                 'response_text',
                 'prompt_config_examples',
                 'prompt_config_chain_of_thought',
                 'feedback_extraction',
                 'feedback_detection',
                 'additional_feedback',
                 'synthetic_feedback',
                 ]

df_synth = pd.read_csv(FILE_SYNTH, sep=";", header=None, names=SYNTH_COLUMNS)

print(df_synth.columns)


# Normalise the two prompt-config flags to real booleans: compare the trimmed,
# lower-cased text against 'true'/'false'; any other value becomes NaN.
for flag_col in ('prompt_config_examples', 'prompt_config_chain_of_thought'):
    df_synth[flag_col] = (df_synth[flag_col]
                          .astype(str)
                          .str.strip()
                          .str.lower()
                          .map({'true': True, 'false': False})
    )



print("Synthetic feedback shape:", df_synth.shape)

Synthetic feedback shape: (563, 14)
Index(['file', 'frame_ID', 'frame_type', 'frame_text', 'precondition_id',
       'precondition_text', 'precondition_position', 'response_text',
       'prompt_config_examples', 'prompt_config_chain_of_thought',
       'feedback_extraction', 'feedback_detection', 'additional_feedback',
       'synthetic_feedback'],
      dtype='object')
Synthetic feedback shape: (564, 14)


## Random split (human data)

In [9]:
# Hold out 30% of the human feedback, then halve the hold-out into
# validation and test.
train_df_human, temp_df_human = train_test_split(
    df_human, test_size=0.3, random_state=42
)
val_df_human, test_df_human = train_test_split(
    temp_df_human, test_size=0.5, random_state=42
)

# Report the size of every split.
for split_label, split_frame in (
    ("Train", train_df_human),
    ("Validation", val_df_human),
    ("Test", test_df_human),
):
    print(f"{split_label}: {len(split_frame)}")

# Write each split as a semicolon-separated CSV without the index column.
for file_suffix, split_frame in (
    ("/train_human_random.csv", train_df_human),
    ("/validation_human_random.csv", val_df_human),
    ("/test_human_random.csv", test_df_human),
):
    split_frame.to_csv(REWARD_DATA_PATH + file_suffix, index=False, sep=';')

Train: 650
Validation: 139
Test: 140


## Defined split (human data)

In [10]:
# Use the Participatiewet file for test + eval: it makes up a bit over a quarter (0.27) of the human data.

df_human['file'].value_counts()

file
Interpretatie_Vw_over_besluiten_op_aanvragen_voor_een_verblijfsvergunning_regulier_bepaalde_tijd.json    432
Participatiewet_most_recent_public.json                                                                  255
rijksbegrotingscyclus.json                                                                               242
Name: count, dtype: int64

In [11]:
# Rows from the Participatiewet file are held out; every other file is training data.
is_held_out_human = df_human['file'] == 'Participatiewet_most_recent_public.json'

train_df_human_det = df_human[~is_held_out_human]
temp_df_human_det = df_human[is_held_out_human]

# Halve the held-out rows into validation and test.
val_df_human_det, test_df_human_det = train_test_split(
    temp_df_human_det, test_size=0.5, random_state=42
)

# Report the size of every split.
for split_label, split_frame in (
    ("Train", train_df_human_det),
    ("Validation", val_df_human_det),
    ("Test", test_df_human_det),
):
    print(f"{split_label}: {len(split_frame)}")

# Write each split as a semicolon-separated CSV without the index column.
for file_suffix, split_frame in (
    ("/train_human_determined.csv", train_df_human_det),
    ("/validation_human_determined.csv", val_df_human_det),
    ("/test_human_determined.csv", test_df_human_det),
):
    split_frame.to_csv(REWARD_DATA_PATH + file_suffix, index=False, sep=';')

Train: 674
Validation: 127
Test: 128


## Random split (synthetic data)

In [12]:
# Hold out 25% of the synthetic feedback, then halve the hold-out into
# validation and test.
train_df_synth, temp_df_synth = train_test_split(
    df_synth, test_size=0.25, random_state=42
)
val_df_synth, test_df_synth = train_test_split(
    temp_df_synth, test_size=0.5, random_state=42
)

# Report the size of every split.
for split_label, split_frame in (
    ("Train", train_df_synth),
    ("Validation", val_df_synth),
    ("Test", test_df_synth),
):
    print(f"{split_label}: {len(split_frame)}")

# Write each split as a semicolon-separated CSV without the index column.
for file_suffix, split_frame in (
    ("/train_synth_random.csv", train_df_synth),
    ("/validation_synth_random.csv", val_df_synth),
    ("/test_synth_random.csv", test_df_synth),
):
    split_frame.to_csv(REWARD_DATA_PATH + file_suffix, index=False, sep=';')

Train: 423
Validation: 70
Test: 71


## Determined split (synthetic data)

In [13]:
df_synth['file'].value_counts()

# Taking rijksbegrotingscyclus since it makes up roughly 30% of the data

file
Participatiewet_most_recent_public.json                                                                  208
Interpretatie_Vw_over_besluiten_op_aanvragen_voor_een_verblijfsvergunning_regulier_bepaalde_tijd.json    188
rijksbegrotingscyclus.json                                                                               168
Name: count, dtype: int64

In [14]:
# Rows from the rijksbegrotingscyclus file are held out; every other file is training data.
is_held_out_synth = df_synth['file'] == 'rijksbegrotingscyclus.json'

train_df_synth_det = df_synth[~is_held_out_synth]
temp_df_synth_det = df_synth[is_held_out_synth]

# Halve the held-out rows into validation and test.
val_df_synth_det, test_df_synth_det = train_test_split(
    temp_df_synth_det, test_size=0.5, random_state=42
)

# Report the size of every split.
for split_label, split_frame in (
    ("Train", train_df_synth_det),
    ("Validation", val_df_synth_det),
    ("Test", test_df_synth_det),
):
    print(f"{split_label}: {len(split_frame)}")

# Write each split as a semicolon-separated CSV without the index column.
for file_suffix, split_frame in (
    ("/train_synth_determined.csv", train_df_synth_det),
    ("/validation_synth_determined.csv", val_df_synth_det),
    ("/test_synth_determined.csv", test_df_synth_det),
):
    split_frame.to_csv(REWARD_DATA_PATH + file_suffix, index=False, sep=';')

Train: 396
Validation: 84
Test: 84


# Find data imbalances

In [15]:
# Rating columns whose value distribution is inspected below.
columns_of_interest = ['feedback_extraction', 'feedback_detection']


# Run every rating column of both frames through parse_ratings, overwriting
# the column in place (re-running this cell parses the parsed values again).
for feedback_frame in (df_human, df_synth):
    for col in columns_of_interest:
        feedback_frame[col] = feedback_frame[col].apply(parse_ratings)


def unique_value_counts(df, columns_of_interest):
    """Count how often each distinct value occurs in the given columns.

    Args:
        df: DataFrame containing every column named in `columns_of_interest`.
        columns_of_interest: iterable of column labels to summarise.

    Returns:
        Dict mapping each column label to the Series returned by
        `df[label].value_counts()`.
    """
    counts_by_column = {}
    for column in columns_of_interest:
        counts_by_column[column] = df[column].value_counts()
    return counts_by_column

# Per-column value counts for the human and the synthetic feedback.
df_human_counts, df_synth_counts = (
    unique_value_counts(feedback_frame, columns_of_interest)
    for feedback_frame in (df_human, df_synth)
)

# Print both summaries so class imbalances are visible side by side.
print(f"Human data counts: {df_human_counts}")
print(f"Synthetic data counts: {df_synth_counts}")

Human data counts: {'feedback_extraction': feedback_extraction
0    499
3    231
2    104
1     95
Name: count, dtype: int64, 'feedback_detection': feedback_detection
4    630
6    280
5     19
Name: count, dtype: int64}
Synthetic data counts: {'feedback_extraction': feedback_extraction
0    377
2     94
1     50
3     43
Name: count, dtype: int64, 'feedback_detection': feedback_detection
4    404
6    144
5     16
Name: count, dtype: int64}
