In [1]:
import os
import pandas as pd
import pyarrow.parquet as pq
from google import genai
from dotenv import load_dotenv

In [2]:
load_dotenv()
api_key = os.getenv("GENAI_API_KEY")

client = genai.Client(api_key=api_key)

In [3]:
def read_parquet_files_by_split_prefix(data_dir, split):
    split_dataframes = []
    required_prefix = f"{split}-"
    if not os.path.isdir(data_dir):
        print(f"Error: Data directory '{data_dir}' not found.")
        return split_dataframes

    print(f"Looking for files with prefix '{required_prefix}' in: {data_dir}")
    try:
        entries = os.listdir(data_dir)

        for entry in entries:
            filepath = os.path.join(data_dir, entry)
            if os.path.isfile(filepath) and entry.startswith(required_prefix) and entry.endswith(".parquet"):
                print(f"  Reading: {filepath}")
                try:
                    df = pd.read_parquet(filepath)
                    split_dataframes.append(df)
                except Exception as e:
                    print(f"  Error reading {filepath}: {e}")

    except Exception as e:
         print(f"Error listing files in {data_dir}: {e}")


    return split_dataframes

def filter_entries_by_language(dataframe, language):

    if 'language' not in dataframe.columns:
        print("Error: 'language' column not found in the DataFrame.")
        return pd.DataFrame() 
    filtered_df = dataframe[dataframe['language'] == language]
    return filtered_df
    

In [4]:
data_directory = '../data'

In [5]:
test_split_dataframes = read_parquet_files_by_split_prefix(data_directory, split='test')

Looking for files with prefix 'test-' in: ../data
  Reading: ../data/test-00001-of-00004.parquet
  Reading: ../data/test-00002-of-00004.parquet
  Reading: ../data/test-00003-of-00004.parquet
  Reading: ../data/test-00000-of-00004.parquet


In [6]:
test_split_dfs = pd.concat(test_split_dataframes, ignore_index=True)
test_split_dfs

Unnamed: 0,type,grade,subject,language,chemical_structure,table,figure,graph,sample_id,image,answer_key
0,text,12,Mathematics,Croatian,0,0,0,0,1b71ac67-5d48-411e-993c-feeed577f2fc,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
1,text,12,Mathematics,Croatian,0,0,0,0,d93f4f8a-40bf-4c46-a1cf-0a747211a637,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
2,text,12,Mathematics,Croatian,0,0,0,0,74823f84-d26f-4eb7-9812-1fb53dc9089f,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
3,text,12,Mathematics,Croatian,0,0,0,0,2759ecad-cbdf-4c4b-9249-cbb16405a920,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
4,text,12,Mathematics,Croatian,0,0,0,0,177c86a9-2ba9-4dfd-8a12-fe0e0c37f496,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
...,...,...,...,...,...,...,...,...,...,...,...
3560,text,12,Mathematics,Croatian,0,0,0,0,94ca0e6c-32ba-4e6e-81cc-8e361238b1f2,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
3561,image_text,12,Mathematics,Croatian,0,0,0,1,38d5f485-f8a2-4fee-9eeb-b4f6bc13dd60,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
3562,image_text,12,Mathematics,Croatian,0,0,1,0,b1bea9d3-ef40-4cb2-aee6-141d21fad590,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
3563,text,12,Mathematics,Croatian,0,0,0,0,a40fdf04-795e-40a2-a932-6ab8ac2d16ac,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,


In [8]:
# val_split_dfs.to_parquet('../data/val_split_dfs.parquet', index=False)

In [7]:
test_split_dfs[test_split_dfs['language'] == 'Bulgarian']

Unnamed: 0,type,grade,subject,language,chemical_structure,table,figure,graph,sample_id,image,answer_key
2895,image_text,12,History,Bulgarian,0,0,1,0,54e1aeca-4ef1-47e3-af20-4d8c9edee5ae,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
2896,image_text,12,History,Bulgarian,0,0,1,0,6869ad38-97e7-4576-951b-7b537a40f029,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
2897,text,12,History,Bulgarian,0,0,0,0,1c6bf0f9-0e69-4846-8806-bf71aec9e7f5,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
2898,text,12,History,Bulgarian,0,0,0,0,de585dbc-0da3-4625-baa6-ef280abbe3ab,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
2899,image_text,12,History,Bulgarian,0,0,1,0,e4002c2e-def5-4536-ab18-5990e9ca5355,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
...,...,...,...,...,...,...,...,...,...,...,...
3090,text,12,Mathematics,Bulgarian,0,0,0,0,1786d320-925a-4a35-a2a7-7ce614b7c26e,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
3091,text,12,Mathematics,Bulgarian,0,0,0,0,855364ac-21d4-4724-874d-d8c5d6d2b3a7,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
3092,text,12,Mathematics,Bulgarian,0,0,0,0,ea93ffcc-63cb-4fa2-b8b4-77b86562b253,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
3093,text,12,Mathematics,Bulgarian,0,0,0,0,ac085692-6f5c-47ef-a211-ccc9b0332d80,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,


In [9]:
bg_test = filter_entries_by_language(test_split_dfs, 'Bulgarian')

In [10]:
bg_test.to_parquet('../data/bg_test.parquet', index=False)

In [13]:
test_split_dfs = filter_entries_by_language(test_split_dfs, 'English')

In [14]:
test_split_dfs.to_parquet('test_split_eng.parquet', index=False)

In [None]:
if test_split_dataframes:
    print(f"\nSuccessfully read {len(test_split_dataframes)} Parquet files for the 'test' split.")
    # Concatenate the list of DataFrames for the 'test' split if needed
    test_df = pd.concat(test_split_dataframes, ignore_index=True)
    print(f"Concatenated 'test' data shape: {test_df.shape}")
    # print(test_df.head()) # Uncomment to see the first few rows
else:
    print("\nNo data read for the 'test' split.")

In [None]:
test_split_dataframes[0]

In [None]:
test_df = pd.concat(test_split_dataframes, ignore_index=True)

In [2]:
test_df = pd.read_parquet('./test_df_en.parquet')

In [3]:
test_df

Unnamed: 0,type,grade,subject,language,chemical_structure,table,figure,graph,sample_id,image,answer_key
0,text,12,Physics,English,0,0,1,0,996d484a-43b4-4766-96ff-0747cf8fd841,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
1,text,12,Physics,English,0,0,1,0,57778143-868c-44f1-9d80-a6039c52fa1f,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
2,text,12,Physics,English,0,0,0,0,8e20d59f-4d57-41fd-ac84-a77df0ca2f87,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
3,text,12,Physics,English,0,0,1,0,a036bd0a-9dc5-4aff-8e3d-033f960dd1c7,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
4,text,12,Physics,English,0,0,0,0,d3faa9de-efe6-416b-b92f-26ed5025f233,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
...,...,...,...,...,...,...,...,...,...,...,...
507,text,12,Physics,English,0,0,0,0,373add4d-6fe3-46a6-b4d6-77b16c2665ba,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
508,text,12,Physics,English,0,0,0,0,61684a84-aef7-44e2-b56c-e36a768a0685,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
509,text,12,Physics,English,0,0,0,1,3ec5f05c-d14d-49b6-84d9-860eb805a651,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,
510,text,12,Physics,English,0,0,0,1,29ed343a-a779-4e49-8ead-77bcdd54f1ce,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,


In [12]:
# test_df_en = filter_entries_by_language(test_df, 'English')

In [6]:
test_df['subject'].unique()

array(['Physics'], dtype=object)

In [None]:
val_df = pd.read_parquet('./data/val_split_dfs.parquet')

In [None]:

import io
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

def display_image_from_bytes(image_bytes):
    """
    Displays an image from bytes using matplotlib.

    Args:
        image_bytes (bytes): The bytes representing the image.
    """
    # Convert bytes to a PIL Image
    image = Image.open(io.BytesIO(image_bytes))

    # Display the image using matplotlib
    plt.imshow(np.array(image))
    plt.axis('off')  # Hide axis
    plt.show()

# Display the first image from the test DataFrame
if not test_df_en.empty:
    first_image_bytes = test_df_en.iloc[1]['image']['bytes']
    display_image_from_bytes(first_image_bytes)
else:

    print("No English entries found in the test DataFrame.")


In [None]:
# save test_df_en to parquet
output_file = 'test_df_en.parquet'
test_df_en.to_parquet(output_file, index=False)
print(f"Filtered DataFrame saved to {output_file}")
# save test_df_en to csv
output_file_csv = 'test_df_en.csv'
test_df_en.to_csv(output_file_csv, index=False)
print(f"Filtered DataFrame saved to {output_file_csv}")


In [None]:



# Read the 'train' split
train_split_dataframes = read_parquet_files_by_split_prefix(data_directory, split='train')

if train_split_dataframes:
    print(f"\nSuccessfully read {len(train_split_dataframes)} Parquet files for the 'train' split.")
    # Concatenate the list of DataFrames for the 'train' split if needed
    train_df = pd.concat(train_split_dataframes, ignore_index=True)
    print(f"Concatenated 'train' data shape: {train_df.shape}")
    # print(train_df.head()) # Uncomment to see the first few rows
else:
     print("\nNo data read for the 'train' split.")

# Read the 'validation' split
validation_split_dataframes = read_parquet_files_by_split_prefix(data_directory, split='validation')

if validation_split_dataframes:
    print(f"\nSuccessfully read {len(validation_split_dataframes)} Parquet files for the 'validation' split.")
    # Concatenate the list of DataFrames for the 'validation' split if needed
    validation_df = pd.concat(validation_split_dataframes, ignore_index=True)
    print(f"Concatenated 'validation' data shape: {validation_df.shape}")
    # print(validation_df.head()) # Uncomment to see the first few rows
else:
    print("\nNo data read for the 'validation' split.")

In [2]:
prompt_template = (
    "Provide an overview of the {aspect} of coal mining in {region} "
    "as of {year}. What are the key challenges and mitigation efforts?"
)

first_prompt = prompt_template.format(
    aspect="env impact",
    region='the Powder River Basin',
    year=2023
)


second_prompt = prompt_template.format(
    aspect="radioactive deposits",
    region='nuclear AI power plant',
    year=2042
)


print("--- Prompt Template ---")
print(prompt_template)
print(f"Year: {specific_year}")
print("\n--- First prompt ---")
print(first_prompt)

print("\n--- Second prompt ---")
print(second_prompt)


# This `final_prompt` string is what you would send in the API call payload.

--- Prompt Template ---
Provide an overview of the {aspect} of coal mining in {region} as of {year}. What are the key challenges and mitigation efforts?
Year: 2023

--- First prompt ---
Provide an overview of the env impact of coal mining in the Powder River Basin as of 2023. What are the key challenges and mitigation efforts?

--- Second prompt ---
Provide an overview of the radioactive deposits of coal mining in nuclear AI power plant as of 2042. What are the key challenges and mitigation efforts?
