In [15]:
import pandas as pd
import gdown
import os
import json

In [7]:
# Google Drive share links for the three dataset splits.
TRAIN_JSON_URL = (
    "https://drive.google.com/file/d/1flMO-Y42FLe1qEzOqgjg591Cvz5hEdGh/view?usp=drive_link"
)
TEST_JSON_URL = (
    "https://drive.google.com/file/d/10_oiLZzgQNesdXOe2WQ61PnT9qan8aAE/view?usp=drive_link"
)
VALIDATE_JSON_URL = (
    "https://drive.google.com/file/d/1X9JU3_-eqSBaT_XxE2LfxOL7RpvoK01w/view?usp=drive_link"
)

# Each split is described by a two-item list: [google_drive_url, local_filename].
TRAIN_JSON_LIST = [TRAIN_JSON_URL, "train.json"]
TEST_JSON_LIST = [TEST_JSON_URL, "test.json"]
VALIDATE_JSON_LIST = [VALIDATE_JSON_URL, "validate.json"]

In [None]:
def download_json_googledrive(data_list = None):
    """Download a file from Google Drive unless it is already on disk.

    Parameters
    ----------
    data_list : list
        Two items: ``[google_drive_url, output_filename]``.

    Raises
    ------
    ValueError
        If ``data_list`` is None or does not contain exactly two items.
    """
    # Validate the argument shape before touching the filesystem.
    if data_list is None:
        raise ValueError("data_list cannot be None. Expected a list like: [google_drive_url, output_filename].")
    if len(data_list) != 2:
        raise ValueError("data_list must contain exactly two items: [google_drive_url, output_filename].")

    target_path = data_list[1]

    # A file already present at the target path is reused; nothing is fetched.
    if os.path.exists(target_path):
        print(f"File '{target_path}' exists. Will not be downloaded from google drive")
        return

    print(f"File '{target_path}' is not downloaded")
    print("Downloading from google drive....")
    # fuzzy=True is passed so gdown accepts the share-style ".../view?usp=..." links.
    gdown.download(data_list[0], output=target_path, fuzzy=True)


# Fetch each split (train, test, validate) if it is not already on disk.
for json_source in (TRAIN_JSON_LIST, TEST_JSON_LIST, VALIDATE_JSON_LIST):
    download_json_googledrive(json_source)


File 'train.json' exists. Will not be downloaded from google drive
File 'test.json' exists. Will not be downloaded from google drive
File 'validate.json' exists. Will not be downloaded from google drive


In [36]:
def json_to_pandas(json_file, save_to_csv=False):
    """Load a JSON or JSON Lines file into a pandas DataFrame.

    The file is first parsed as a single JSON document. If that raises
    ``json.JSONDecodeError``, it is re-read line by line, treating every
    non-blank line as its own JSON value.

    Parameters
    ----------
    json_file : str
        Path of the file to read (opened as UTF-8).
    save_to_csv : bool, default False
        When True, write the frame next to the input as ``<stem>.csv``
        unless a file with that name already exists.

    Returns
    -------
    pandas.DataFrame
        The parsed records.
    """
    try:
        with open(json_file, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
    except json.JSONDecodeError:
        # Not a single JSON document: fall back to one JSON value per line.
        with open(json_file, "r", encoding="utf-8") as handle:
            stripped_lines = (raw_line.strip() for raw_line in handle)
            payload = [json.loads(text) for text in stripped_lines if text]
    else:
        # A lone top-level object becomes a one-row frame.
        if isinstance(payload, dict):
            payload = [payload]

    df = pd.DataFrame(payload)

    if not save_to_csv:
        return df

    stem = os.path.splitext(json_file)[0]
    csv_path = f"{stem}.csv"
    if os.path.exists(csv_path):
        # An existing CSV is left untouched, never overwritten.
        print(f"csv file for {json_file} already exists at {csv_path}")
    else:
        print(f"Creating csv file for {json_file}...")
        df.to_csv(csv_path, index=False)
        print(f"Created a csv to '{csv_path}' for '{json_file}'")

    return df

def merge_data(df_list, save_to_csv = False, csv_path = "full_data.csv"):
    """Concatenate several DataFrames and drop exact duplicate rows.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames to stack. Side effect: in every frame that has a 'triple'
        column, list values in that column are replaced in place by tuples.
    save_to_csv : bool, default False
        When True, write the merged frame to ``csv_path`` unless a file
        already exists there.
    csv_path : str, default "full_data.csv"
        Destination of the merged CSV.

    Returns
    -------
    pandas.DataFrame
        The concatenated frame with duplicate rows removed. The index is
        not reset after de-duplication.
    """
    # Convert unhashable list columns to tuples so duplicates can be removed
    for frame in df_list:
        if 'triple' in frame.columns:
            frame['triple'] = frame['triple'].apply(lambda x: tuple(x) if isinstance(x, list) else x)

    df_all = pd.concat(df_list, ignore_index=True)
    df_all = df_all.drop_duplicates()

    if save_to_csv:
        if not os.path.exists(csv_path):
            # Write the merged, de-duplicated frame. The previous code wrote
            # the loop variable, i.e. only the last input frame.
            df_all.to_csv(csv_path, index=False)
            print("Created a merged csv file")
        else:
            # An existing file is never overwritten; delete it to regenerate.
            print(f"csv file already exists at {csv_path}")

    return df_all


# Load each split in order (train, test, validate), exporting a CSV per split.
df_train, df_test, df_validate = (
    json_to_pandas(json_source[1], save_to_csv=True)
    for json_source in (TRAIN_JSON_LIST, TEST_JSON_LIST, VALIDATE_JSON_LIST)
)

# Stack the three splits into one de-duplicated frame.
df_list = [df_train, df_test, df_validate]
df = merge_data(df_list=df_list, save_to_csv=True)



csv file for train.json already exists at train.csv
csv file for test.json already exists at test.csv
csv file for validate.json already exists at validate.csv
csv file already exists at full_data.csv
