In [6]:
import os
import pandas as pd

def load_csv_files(folder_path, skip_folder=None):
    """
    Walk ``folder_path`` recursively and read every file whose name ends in
    ``data.csv`` into a Pandas DataFrame.

    Args:
        folder_path (str): Root directory to search.
        skip_folder (str, optional): Directory name to prune from the walk.
            Any directory with exactly this name is skipped, at whatever
            depth it is encountered.

    Returns:
        dict: Maps a generated name to its DataFrame. The name is the file's
              path relative to ``folder_path`` with the extension removed and
              path separators replaced by underscores (e.g. ``tourism_data``).
              Files that cannot be read are reported and left out.
    """
    loaded = {}

    for current_dir, subdirs, filenames in os.walk(folder_path):
        # Pruning the list in place stops os.walk() from entering that folder.
        if skip_folder and skip_folder in subdirs:
            subdirs.remove(skip_folder)

        matching = [fname for fname in filenames if fname.endswith("data.csv")]
        for fname in matching:
            csv_path = os.path.join(current_dir, fname)

            # Dictionary key: relative path, no extension, separators -> "_"
            relative = os.path.relpath(csv_path, folder_path)
            stem, _ext = os.path.splitext(relative)
            key = stem.replace(os.sep, "_")

            try:
                frame = pd.read_csv(csv_path)
            except Exception as e:
                print(f"Could not load: {csv_path} -> DataFrame: {key}. Error: {e}")
                continue

            loaded[key] = frame
            print(f"Loaded: {csv_path} -> DataFrame: {key}")

    return loaded

# Specify the folder containing your CSV files
folder_path = "sources"

# Load all CSV files into DataFrames
dataframes_dict = load_csv_files(folder_path)

# Example: Access a specific DataFrame
for name, df in dataframes_dict.items():
    print(f"DataFrame Name: {name}")
    print(df.head(5))  # Display the first few rows


Loaded: sources/tourism/data.csv -> DataFrame: tourism_data
Loaded: sources/education/data.csv -> DataFrame: education_data


  df = pd.read_csv(file_path)


Loaded: sources/healthcare/data.csv -> DataFrame: healthcare_data


  df = pd.read_csv(file_path)


Loaded: sources/ecommerce/data.csv -> DataFrame: ecommerce_data
Loaded: sources/finance/data.csv -> DataFrame: finance_data
DataFrame Name: tourism_data
   Trip ID       Destination Start date   End date  Duration (days)  \
0        1        London, UK   5/1/2023   5/8/2023              7.0   
1        2  Phuket, Thailand  6/15/2023  6/20/2023              5.0   
2        3   Bali, Indonesia   7/1/2023   7/8/2023              7.0   
3        4     New York, USA  8/15/2023  8/29/2023             14.0   
4        5      Tokyo, Japan  9/10/2023  9/17/2023              7.0   

   Traveler name  Traveler age Traveler gender Traveler nationality  \
0     John Smith          35.0            Male             American   
1       Jane Doe          28.0          Female             Canadian   
2      David Lee          45.0            Male               Korean   
3  Sarah Johnson          29.0          Female              British   
4     Kim Nguyen          26.0          Female           Vietname

In [7]:
for name, df in dataframes_dict.items():
    print(f"\nDataFrame Name: {name}")
    print("Columns:")
    for column in df.columns:
        print(f"  - {column}")


DataFrame Name: tourism_data
Columns:
  - Trip ID
  - Destination
  - Start date
  - End date
  - Duration (days)
  - Traveler name
  - Traveler age
  - Traveler gender
  - Traveler nationality
  - Accommodation type
  - Accommodation cost
  - Transportation type
  - Transportation cost

DataFrame Name: education_data
Columns:
  - age
  - workclass
  - fnlwgt
  - education
  - education-num
  - marital-status
  - occupation
  - relationship
  - race
  - sex
  - capital-gain
  - capital-loss
  - hours-per-week
  - native-country
  - salary

DataFrame Name: healthcare_data
Columns:
  - Unnamed: 0
  - start_date
  - end_date
  - drug_type
  - drug_name
  - drug_name_poe
  - drug_name_generic
  - formulary_code
  - gsn
  - ndc
  - product_strength
  - dose_value
  - dose_unit
  - form_value_dispensed
  - form_unit_dispensed
  - route
  - row_id_2
  - hospital_admission_id_2
  - admit_time
  - discharge_time
  - admission_type
  - admission_location
  - discharge_location
  - insurance
  -

In [8]:
for name, df in dataframes_dict.items():
    print(f"\n - {name}")


 - tourism_data

 - education_data

 - healthcare_data

 - ecommerce_data

 - finance_data


In [15]:
tourism_data = dataframes_dict['tourism_data']
tourism_data.head(5)

Unnamed: 0,Trip ID,Destination,Start date,End date,Duration (days),Traveler name,Traveler age,Traveler gender,Traveler nationality,Accommodation type,Accommodation cost,Transportation type,Transportation cost
0,1,"London, UK",5/1/2023,5/8/2023,7.0,John Smith,35.0,Male,American,Hotel,1200,Flight,600
1,2,"Phuket, Thailand",6/15/2023,6/20/2023,5.0,Jane Doe,28.0,Female,Canadian,Resort,800,Flight,500
2,3,"Bali, Indonesia",7/1/2023,7/8/2023,7.0,David Lee,45.0,Male,Korean,Villa,1000,Flight,700
3,4,"New York, USA",8/15/2023,8/29/2023,14.0,Sarah Johnson,29.0,Female,British,Hotel,2000,Flight,1000
4,5,"Tokyo, Japan",9/10/2023,9/17/2023,7.0,Kim Nguyen,26.0,Female,Vietnamese,Airbnb,700,Train,200


In [17]:
# shape[0] is the number of rows, so label it as such.
row_count = tourism_data.shape[0]
print("row_count",row_count)
# Rows that have at least one missing value in any column.
nan_rows = tourism_data[tourism_data.isna().any(axis=1)]
nan_rows

column_count 139


Unnamed: 0,Trip ID,Destination,Start date,End date,Duration (days),Traveler name,Traveler age,Traveler gender,Traveler nationality,Accommodation type,Accommodation cost,Transportation type,Transportation cost
71,72,,,,,,,,,,,,
82,83,"Rome, Italy",4/15/2025,4/22/2025,7.0,James Kim,41.0,Male,American,Hotel,100.0,,
127,128,,,,,,,,,,,,


In [None]:
# Remove every row with a missing value, then show the frame that was just
# cleaned (the tourism data, not the education data).
tourism_data.dropna(inplace=True)
tourism_data

In [9]:
education_data = dataframes_dict['education_data']
education_data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [11]:
education_data.drop(["fnlwgt","education-num","workclass","relationship","sex","capital-gain","capital-loss"],axis=1,inplace=True)

In [12]:
# shape[0] is the number of rows, so label it as such.
row_count = education_data.shape[0]
print("row_count",row_count)
# Rows that have at least one missing value in any column.
nan_rows = education_data[education_data.isna().any(axis=1)]
nan_rows

column_count 32561


Unnamed: 0,age,education,marital-status,occupation,race,hours-per-week,native-country,salary


In [13]:
education_data.dropna(inplace=True)
education_data

Unnamed: 0,age,education,marital-status,occupation,race,hours-per-week,native-country,salary
0,39,Bachelors,Never-married,Adm-clerical,White,40,United-States,<=50K
1,50,Bachelors,Married-civ-spouse,Exec-managerial,White,13,United-States,<=50K
2,38,HS-grad,Divorced,Handlers-cleaners,White,40,United-States,<=50K
3,53,11th,Married-civ-spouse,Handlers-cleaners,Black,40,United-States,<=50K
4,28,Bachelors,Married-civ-spouse,Prof-specialty,Black,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...
32556,27,Assoc-acdm,Married-civ-spouse,Tech-support,White,38,United-States,<=50K
32557,40,HS-grad,Married-civ-spouse,Machine-op-inspct,White,40,United-States,>50K
32558,58,HS-grad,Widowed,Adm-clerical,White,40,United-States,<=50K
32559,22,HS-grad,Never-married,Adm-clerical,White,20,United-States,<=50K


In [14]:
education_data.insert(0, 'index', range(1, len(education_data) + 1))
education_data

Unnamed: 0,index,age,education,marital-status,occupation,race,hours-per-week,native-country,salary
0,1,39,Bachelors,Never-married,Adm-clerical,White,40,United-States,<=50K
1,2,50,Bachelors,Married-civ-spouse,Exec-managerial,White,13,United-States,<=50K
2,3,38,HS-grad,Divorced,Handlers-cleaners,White,40,United-States,<=50K
3,4,53,11th,Married-civ-spouse,Handlers-cleaners,Black,40,United-States,<=50K
4,5,28,Bachelors,Married-civ-spouse,Prof-specialty,Black,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...
32556,32557,27,Assoc-acdm,Married-civ-spouse,Tech-support,White,38,United-States,<=50K
32557,32558,40,HS-grad,Married-civ-spouse,Machine-op-inspct,White,40,United-States,>50K
32558,32559,58,HS-grad,Widowed,Adm-clerical,White,40,United-States,<=50K
32559,32560,22,HS-grad,Never-married,Adm-clerical,White,20,United-States,<=50K


In [46]:
def row_to_sentence(row,primary_key=""):
    """
    Render one DataFrame row as a flat "column is value" sentence.

    Args:
        row (pd.Series): A single row; its index holds the column labels.
        primary_key (str, optional): Column label to leave out of the sentence.

    Returns:
        str: Lower-cased fragments of the form "<column> is <value>", joined
             with ", " and terminated by ".". Values whose string form is
             "nan" (case-insensitive) are skipped. A row with nothing to
             report yields ".".
    """
    fragments = []

    # Dynamically iterate over all columns
    for col in row.index:
        if col == primary_key:
            continue

        value = str(row[col]).strip()  # Ensure value is a string and remove leading/trailing spaces
        if value.lower() == "nan":  # Skip NaN values
            continue

        value = value.strip("'").lower()  # Remove surrounding single quotes if present
        # str(col) keeps non-string column labels (e.g. integers) from crashing.
        fragments.append(f"{str(col).lower()} is {value}")

    # Joining (rather than appending ", " and stripping it afterwards) keeps
    # commas that are part of the last value intact.
    return ", ".join(fragments) + "."

# Apply the function to each row and create a new column for sentences
tourism_data['sentence'] = tourism_data.apply(lambda row: row_to_sentence(row, primary_key="Trip ID"), axis=1)

# # Display the resulting DataFrame with the sentences
print(tourism_data[['sentence']].head())


In [None]:
tourism_data

In [None]:
tourism_data.to_csv("sources/tourism/data_sentence.csv",index=False)

*Run from here*:

In [18]:
import torch
torch.cuda.empty_cache()
import warnings
warnings.simplefilter("ignore")

In [19]:
import pandas as pd
import os
from transformers import LlamaTokenizer, LlamaForCausalLM, AutoConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import random

In [20]:
class Model:
    """
    Thin wrapper around a Llama instruct checkpoint used to rewrite templated
    "column is value" sentences into conversational text.
    """

    # Lower bound for the randomly drawn sampling temperature. generate()
    # requires a strictly positive temperature, and random.random() can
    # return 0.0.
    MIN_TEMPERATURE = 0.1

    def __init__(self, model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"):
        """
        Load the model and tokenizer.

        Args:
            model_name (str, optional): Hugging Face model id or local path.
        """
        # device_map='auto' already places the weights; the model is not moved
        # again with .to(), which conflicts with a dispatched model.
        self.model = LlamaForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map='auto',
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Use EOS as the padding token when the tokenizer defines none.
        # (Assigning in the other direction would wipe out the EOS token.)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def predict(self,prompt, user_prompt=""):
        """
        Generate a completion for a system prompt plus a user prompt.

        Args:
            prompt (str): System message.
            user_prompt (str, optional): User message.

        Returns:
            str: Decoded text of the newly generated tokens only, with special
                 tokens removed.
        """
        model,tokenizer = self.model, self.tokenizer
        # Sampling temperature is drawn at random per call, so outputs vary
        # between calls; it is clamped so it is never zero.
        temp = max(random.random(), self.MIN_TEMPERATURE)
        messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": user_prompt}
        ]
        # add_generation_prompt=True appends the assistant header so the model
        # starts directly with its answer.
        chat_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # The rendered template is tokenized without letting the tokenizer add
        # its own special tokens a second time.
        inputs = tokenizer(chat_text, return_tensors="pt", add_special_tokens=False).to(model.device)
        prompt_length = inputs["input_ids"].shape[1]
        generate_ids = model.generate(**inputs, max_new_tokens=4096, do_sample=True, temperature=temp,pad_token_id=tokenizer.eos_token_id)
        # Keep every generated token; skip_special_tokens drops the end-of-turn
        # marker when present, so nothing is lost if generation stopped at the
        # token limit instead.
        new_tokens = generate_ids[0][prompt_length:]
        infer_res = tokenizer.decode(new_tokens, skip_special_tokens=True)
        return infer_res
        
    def enhance_sentence_with_llama(self,sentence):
        """
        Ask the model to rephrase a templated sentence as a short story.

        Args:
            sentence (str): Sentence made of "column is value" fragments.

        Returns:
            str: The model's rephrased text.
        """
        # Construct the prompt
        system_prompt = "You are a creative AI that rephrases given sentences into engaging, conversational stories while incorporating all provided datapoints. Ensure that no information is omitted or added, and skip any datapoints labeled as 'nan'. Do not rephrase the object of a sentence. For example, if the sentence is 'start date is 9/22/2023', do not change the date to a different format. Respond only with the rephrased sentence without any additional commentary."
        user_prompt = f"""
    Rephrase the following sentence into a conversational story, ensuring all datapoints are included while skipping 'nan' values. Do not introduce any extra or false details.
    
    Original sentence: {sentence}
    
    Creative sentence:"""
        creative_sentence = self.predict(system_prompt,user_prompt)
        # Defensive: strip an assistant header if one still appears in the text.
        creative_sentence = creative_sentence.replace("<|start_header_id|>assistant<|end_header_id|>\n\n", "")
        return creative_sentence

In [21]:
model = Model()
model.enhance_sentence_with_llama("Row ID 348: ROW_ID_x is '1136896.0', SUBJECT_ID is '23', HADM_ID_x is '124321.0', ICUSTAY_ID is '234044.0', STARTDATE is '2157-10-21 00:00:00', ENDDATE is '2157-10-25 00:00:00', DRUG_TYPE is 'MAIN', DRUG is 'Sodium Chloride 0.9%  Flush', DRUG_NAME_POE is 'Sodium Chloride 0.9%  Flush', DRUG_NAME_GENERIC is 'Sodium Chloride 0.9%  Flush', FORMULARY_DRUG_CD is 'NACLFLUSH', GSN is 'nan', NDC is '0.0', PROD_STRENGTH is 'Syringe', DOSE_VAL_RX is '3', DOSE_UNIT_RX is 'mL', FORM_VAL_DISP is '0.6', FORM_UNIT_DISP is 'SYR', ROUTE is 'IV', ROW_ID_y is '23.0', HADM_ID_y is '124321.0', ADMITTIME is '2157-10-18 19:34:00', DISCHTIME is '2157-10-25 14:00:00', DEATHTIME is 'nan', ADMISSION_TYPE is 'EMERGENCY', ADMISSION_LOCATION is 'TRANSFER FROM HOSP/EXTRAM', DISCHARGE_LOCATION is 'HOME HEALTH CARE', INSURANCE is 'Medicare', LANGUAGE is 'ENGL', RELIGION is 'CATHOLIC', MARITAL_STATUS is 'MARRIED', ETHNICITY is 'WHITE', EDREGTIME is 'nan', EDOUTTIME is 'nan', DIAGNOSIS is 'BRAIN MASS', HOSPITAL_EXPIRE_FLAG is '0.0', HAS_CHARTEVENTS_DATA is '1.0'.")

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.01s/it]
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"It was October 21st, 2157, and in room 348, a patient with the subject ID '23' lay in bed, connected to an IV drip of Sodium Chloride 0.9% Flush, a critical medication that is also known as Sodium Chloride 0.9% Flush. This medication, classified as a main drug, came in a syringe form with a strength of Syringe. The doctor prescribed a dose of 3 milliliters to be administered via the IV route. The medication's formula value was 0.6 Syringe, but its formula unit was unspecified. The patient had been admitted to the hospital as an emergency on October 18th, 2157, at 7:34 PM and was discharged on October 25th, 2157, at 2:00 PM. The patient's discharge location was their home for home health care."

In [22]:
# Define the checkpoint function
def process_with_checkpoint(model,df, checkpoint_file, start_index=0, batch_size=10):
    """
    Fill the 'creative_sentence' column by calling the model row by row,
    saving progress to a CSV checkpoint after every batch.

    Args:
        model: Object exposing ``enhance_sentence_with_llama(sentence)``.
        df (pd.DataFrame): Frame with a 'sentence' column. Ignored when
            ``checkpoint_file`` already exists; the checkpoint is loaded
            instead.
        checkpoint_file (str): CSV path used to load and save progress.
        start_index (int, optional): Row position at which to start.
        batch_size (int, optional): Rows processed between two saves.

    Returns:
        pd.DataFrame: The processed frame. If an exception occurs, progress so
        far is saved and the partially filled frame is returned.
    """
    # Load existing checkpoint if it exists
    if os.path.exists(checkpoint_file):
        df = pd.read_csv(checkpoint_file)
        print("Loaded existing checkpoint.")
    else:
        # Work on a copy so the caller's frame is not modified.
        df = df.copy()

    # The output column is needed on a first run as well as on a resume.
    if 'creative_sentence' not in df.columns:
        df['creative_sentence'] = None
    # An all-empty column read back from CSV is float64; text needs object.
    df['creative_sentence'] = df['creative_sentence'].astype(object)

    def _save_checkpoint():
        # Write to a sibling file first, then swap it in, so an interrupted
        # write cannot leave a truncated checkpoint behind.
        tmp_path = f"{checkpoint_file}.tmp"
        df.to_csv(tmp_path, index=False)
        os.replace(tmp_path, checkpoint_file)

    try:
        # Process the dataframe in batches
        for i in range(start_index, len(df), batch_size):
            batch = df.iloc[i:i + batch_size]

            for idx, row in batch.iterrows():
                if pd.isna(row['creative_sentence']):  # Only process rows not yet completed
                    df.at[idx, 'creative_sentence'] = model.enhance_sentence_with_llama(row['sentence'])

            # Save progress after processing each batch
            _save_checkpoint()
            print(f"Checkpoint saved at row {min(i + batch_size, len(df))}.")
    except Exception as e:
        print(f"Error occurred: {e}")
        # Save the checkpoint if an error occurs
        _save_checkpoint()
        print("Checkpoint saved after error.")

    return df

In [None]:
# Read the CSV file back into a DataFrame
checkpoint_path = "sources/tourism/data_sentence.csv"
tourism_df = pd.read_csv(checkpoint_path)    
# Process the dataframe with checkpointing
tourism_df = process_with_checkpoint(model,tourism_df, checkpoint_file=checkpoint_path, batch_size=20)
# Save the final result
tourism_df.to_csv(checkpoint_path, index=False)

In [None]:
tourism_df = pd.read_csv(checkpoint_path)
none_count = tourism_df['creative_sentence'].isnull().sum()
total = len(tourism_df)
creative_sentence_count = total - none_count
print(none_count)
print(creative_sentence_count)

In [None]:
tourism_df.head(20)

In [None]:
shortened_df = tourism_df#.head(100)
shortened_df['creative_sentence'].isnull().sum()

In [None]:
# Adjust display settings for full sentence visibility
pd.set_option('display.max_colwidth', None)

# Display the full sentences in the 'creative_sentence' column
shortened_df['creative_sentence'].head(5)


In [23]:
import json
import pandas as pd
import numpy as np

def dataframe_to_json(shortened_df, dataset_path, primary_key):
    """
    Export one JSON record per row that has a generated 'creative_sentence'.

    Each record has:
      - "text": the row's creative sentence,
      - "entities": lower-cased names of the columns that hold a value,
      - "key_value": {column: (primary key value, cell value)}, all as
        lower-cased strings.

    Rows whose 'creative_sentence' is missing are skipped; writing them would
    put a bare NaN (not valid JSON) into the "text" field.

    Args:
        shortened_df (pd.DataFrame): Frame with a 'creative_sentence' column.
        dataset_path (str): Output path without the ".json" extension.
        primary_key (str): Column identifying a row; it is left out of
            entities/key_value and paired with every value instead.
    """
    json_data = []
    skipped = 0
    # The templated input sentence is not part of the exported record.
    shortened_df = shortened_df.drop(columns=['sentence'], errors='ignore')
    value_columns = [
        col for col in shortened_df.columns
        if col not in ["creative_sentence", primary_key]
    ]

    for _, row in shortened_df.iterrows():
        text = row["creative_sentence"]
        if pd.isna(text):
            skipped += 1
            continue

        # Extract the primary key value if it's valid
        primary_id = None
        if primary_key in row and pd.notna(row[primary_key]):
            candidate = str(row[primary_key]).lower()
            if candidate != "nan":
                primary_id = candidate

        # Build key-value pairs, skipping empty cells
        key_value = {}
        for col in value_columns:
            cell = row[col]
            if pd.notna(cell) and str(cell).lower() != "nan":
                # str(col) tolerates non-string column labels.
                key_value[str(col).lower()] = (primary_id, str(cell).lower())

        json_data.append({
            "text": text,
            "entities": list(key_value.keys()),
            "key_value": key_value
        })

    with open(f"{dataset_path}.json", 'w') as json_file:
        json.dump(json_data, json_file, indent=4)

    if skipped:
        print(f"Skipped {skipped} rows without a creative sentence.")
    print("JSON file saved successfully!")


In [8]:
import json

def combine_jsons(dataset_path, batch_size=5):
    """
    Merge consecutive records of ``<dataset_path>.json`` into larger records
    and write them to ``<dataset_path>_combined.json``.

    For every group of ``batch_size`` records:
      - "text" is the records' texts joined with newlines (records whose text
        is null/NaN contribute no line),
      - "entities" is the de-duplicated union in first-seen order,
      - "key_value" maps each key to the list of distinct values seen,
      - "difficulty"/"domain" are derived from the first known domain name
        found in ``dataset_path`` (both None when none matches).

    Args:
        dataset_path (str): Path prefix without the ".json" extension.
        batch_size (int, optional): Number of records merged into one.
    """
    # Checked in this order; the first name contained in the path wins.
    domain_difficulty = (
        ("education", "medium"),
        ("tourism", "easy"),
        ("ecommerce", "medium"),
        ("healthcare", "hard"),
        ("finance", "medium"),
    )
    difficulty = None
    domain = None
    for candidate, level in domain_difficulty:
        if candidate in dataset_path:
            difficulty, domain = level, candidate
            break

    # Load the JSON file
    with open(f"{dataset_path}.json", "r") as json_file:
        data = json.load(json_file)

    combined_data = []

    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]

        # Merge text fields
        texts = []
        for entry in batch:
            text = entry["text"]
            # null or NaN (NaN != NaN) means no text was generated; joining
            # str() of it would inject a literal "None"/"nan" line.
            if text is None or text != text:
                continue
            texts.append(str(text))
        combined_text = "\n".join(texts)

        # Merge unique entities; dict.fromkeys keeps first-seen order so the
        # output is the same on every run (a set would not guarantee that).
        combined_entities = list(dict.fromkeys(
            entity for entry in batch for entity in entry["entities"]
        ))

        # Merge key-value pairs, keeping all distinct values in a list
        combined_key_value = {}
        for entry in batch:
            for key, value in entry["key_value"].items():
                values = combined_key_value.setdefault(key, [])
                if value not in values:  # Avoid duplicate values
                    values.append(value)

        combined_data.append({
            "text": combined_text,
            "entities": combined_entities,
            "key_value": combined_key_value,
            "difficulty": difficulty,
            "domain": domain
        })

    # Save the new JSON file
    with open(f"{dataset_path}_combined.json", "w") as json_file:
        json.dump(combined_data, json_file, indent=4)

    print("Combined JSON file saved successfully!")


In [9]:
import json

# Load the combined JSON file
def clean_combined_jsons(dataset_path):
    """
    Read ``<dataset_path>_combined.json``, drop every entity and key/value
    pair that cannot be found in the record's text, and write the result to
    ``<dataset_path>_combined_cleaned.json``.

    Matching is a case-insensitive substring test against the record text.
    """
    source_path = f"{dataset_path}_combined.json"
    target_path = f"{dataset_path}_combined_cleaned.json"

    with open(source_path, "r") as src:
        records = json.load(src)

    def _prune(record):
        """Return a copy of ``record`` holding only items present in its text."""
        haystack = record["text"].lower()

        # Entity (column) names that literally occur in the text.
        kept_entities = [name for name in record["entities"] if name.lower() in haystack]

        kept_pairs = {}
        for field, candidates in record["key_value"].items():
            if not isinstance(candidates, list):
                continue
            # Each candidate is expected to be a two-item [id, value] list;
            # it survives only when the value occurs in the text.
            survivors = [
                [pair[0], pair[1]]
                for pair in candidates
                if isinstance(pair, list)
                and len(pair) == 2
                and str(pair[1]).lower() in haystack
            ]
            if survivors:
                kept_pairs[field] = survivors

        return {
            "text": record["text"],
            "entities": kept_entities,
            "key_value": kept_pairs,
            "difficulty": record["difficulty"],
            "domain": record["domain"],
        }

    cleaned_records = list(map(_prune, records))

    with open(target_path, "w") as dst:
        json.dump(cleaned_records, dst, indent=4)

    print("Cleaned JSON file saved successfully!")


In [None]:
# Remove the tourism frames from the working set. pop() with a default handles
# each key independently, so a missing first key no longer prevents the second
# one from being removed, and no unrelated error is silently swallowed.
for stale_key in ('tourism_data_sentence', 'tourism_data'):
    dataframes_dict.pop(stale_key, None)

In [29]:
# Per dataset: total number of rows and number of rows with any missing value.
for name, df in dataframes_dict.items():
    row_count = df.shape[0]  # shape[0] counts rows, not columns
    print("row_count in",name,row_count)
    nan_rows = df[df.isna().any(axis=1)]
    print("nan_rows in",name,len(nan_rows))

column_count in tourism_data 139
nan_rows in tourism_data 3
column_count in education_data 32561
nan_rows in education_data 0
column_count in healthcare_data 1249121
nan_rows in healthcare_data 1134378
column_count in ecommerce_data 447326
nan_rows in ecommerce_data 0
column_count in finance_data 7000
nan_rows in finance_data 0


In [30]:
for name, df in dataframes_dict.items():
    df.dropna(axis=1, thresh=len(df) * 0.5, inplace=True)  # Drops columns with >50% NaNs

In [31]:
# Per dataset: total number of rows and number of rows with any missing value.
for name, df in dataframes_dict.items():
    row_count = df.shape[0]  # shape[0] counts rows, not columns
    print("row_count in",name,row_count)
    nan_rows = df[df.isna().any(axis=1)]
    print("nan_rows in",name,len(nan_rows))

column_count in tourism_data 139
nan_rows in tourism_data 3
column_count in education_data 32561
nan_rows in education_data 0
column_count in healthcare_data 1249121
nan_rows in healthcare_data 532380
column_count in ecommerce_data 447326
nan_rows in ecommerce_data 0
column_count in finance_data 7000
nan_rows in finance_data 0


In [47]:
education_data['sentence'] = education_data.apply(lambda row: row_to_sentence(row, primary_key="index"), axis=1)
file_path = "sources/education/data_sentence.csv"
# Save DataFrame to CSV
education_data.to_csv(file_path, index=False)

print(f"Saved: {file_path}")

Saved: sources/education/data_sentence.csv


In [None]:
# Build the templated sentence for every dataset.
for name, df in dataframes_dict.items():
    # Leave out a 'sentence' column from an earlier run; otherwise the old
    # sentence would be embedded in the new one ("sentence is ...").
    source_columns = df.drop(columns=['sentence'], errors='ignore')
    # df is the same object stored in dataframes_dict, so adding the column
    # here updates the dictionary entry as well.
    df['sentence'] = source_columns.apply(row_to_sentence, axis=1)

In [49]:
# Per dataset: total number of rows and number of rows with any missing value.
for name, df in dataframes_dict.items():
    row_count = df.shape[0]  # shape[0] counts rows, not columns
    print("row_count in",name,row_count)
    nan_rows = df[df.isna().any(axis=1)]
    print("nan_rows in",name,len(nan_rows))

column_count in tourism_data 139
nan_rows in tourism_data 3
column_count in education_data 32561
nan_rows in education_data 0
column_count in healthcare_data 1249121
nan_rows in healthcare_data 1134378
column_count in ecommerce_data 1048575
nan_rows in ecommerce_data 1048575
column_count in finance_data 7000
nan_rows in finance_data 0


In [None]:
# Define the base directory (modify as needed)
base_dir = "sources"

# Loop through dictionary and save each DataFrame as a CSV file
for name, df in dataframes_dict.items():
    # Construct the folder path dynamically based on the dataset name
    file_path = f"{base_dir}/{name.split('_')[0]}/data_sentence.csv"

    # Save DataFrame to CSV
    df.to_csv(file_path, index=False)
    
    print(f"Saved: {file_path}")


In [None]:
# Define the base directory
base_dir = "sources"

# Datasets to run through the model in this pass (no sorting is performed;
# the list is processed in the order written).
names = ['education'] #'ecommerce','education'

# Generate creative sentences for each listed dataset
for name in names:
    # The sentence CSV doubles as the checkpoint file: it is read, filled in
    # and written back to the same path.
    checkpoint_path = f"{base_dir}/{name}/data_sentence.csv"

    print(f"Processing: {checkpoint_path}")

    # Read the CSV file into a DataFrame
    df = pd.read_csv(checkpoint_path)

    # Process the DataFrame with checkpointing. Because checkpoint_path exists,
    # process_with_checkpoint reloads the file itself rather than using df.
    df = process_with_checkpoint(model, df, checkpoint_file=checkpoint_path, batch_size=20)

    # Save the final result back to the same CSV file
    df.to_csv(checkpoint_path, index=False)

    print(f"Saved: {checkpoint_path}")


In [10]:
import pandas as pd
base_dir = "sources"

# Datasets to post-process (no sorting is performed; the list is processed in
# the order written).
names = ['education','ecommerce','healthcare','finance','tourism'] #

# Combine and clean the per-dataset JSON files
for name in names:
    # Path prefix shared by the dataset's JSON files (no extension)
    checkpoint_path = f"{base_dir}/{name}/data_sentence"
    # Primary-key column per dataset. Only needed by dataframe_to_json, which
    # is commented out below, so primary_id is currently unused.
    if name == 'finance':
        primary_id = 'Customer_ID'
    elif name == 'tourism':
        primary_id = 'Trip ID'
    elif name == 'healthcare':
        primary_id = 'Index'
    elif name == 'ecommerce':
        primary_id = 'index'
    elif name == 'education':
        primary_id = 'index'

    print(f"Processing: {checkpoint_path}")

    # Disabled: regenerating <checkpoint_path>.json from the sentence CSV.
    # df = pd.read_csv(checkpoint_path)
    
    # dataframe_to_json(df,checkpoint_path,primary_id)
    # Reads <checkpoint_path>.json and writes <checkpoint_path>_combined.json,
    # then writes <checkpoint_path>_combined_cleaned.json.
    combine_jsons(checkpoint_path)
    clean_combined_jsons(checkpoint_path)

Processing: sources/education/data_sentence
Combined JSON file saved successfully!
Cleaned JSON file saved successfully!
Processing: sources/ecommerce/data_sentence
Combined JSON file saved successfully!
Cleaned JSON file saved successfully!
Processing: sources/healthcare/data_sentence
Combined JSON file saved successfully!
Cleaned JSON file saved successfully!
Processing: sources/finance/data_sentence
Combined JSON file saved successfully!
Cleaned JSON file saved successfully!
Processing: sources/tourism/data_sentence
Combined JSON file saved successfully!
Cleaned JSON file saved successfully!


In [52]:
!ls

generate_datasets.ipynb		      generate_mimic_dataset.ipynb
generate_mimic_cleaned_dataset.ipynb  sources


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
import os
import json
import random

def find_json_files(root_folder, target_filename):
    """Return the paths of all files under root_folder (searched recursively) named exactly target_filename."""
    return [
        os.path.join(current_dir, entry)
        for current_dir, _, entries in os.walk(root_folder)
        for entry in entries
        if entry == target_filename
    ]

def clean_data(data):
    """
    Drop entries whose 'entities' list and 'key_value' dict are both empty,
    and strip keys with empty values from the remaining 'key_value' dicts.

    The input entries are not modified; an entry whose 'key_value' is
    filtered is returned as a new dict.

    Args:
        data (list[dict]): Entries as loaded from a cleaned JSON file.

    Returns:
        list[dict]: The retained entries, in their original order.
    """
    cleaned = []
    for entry in data:
        if entry.get("entities") == [] and entry.get("key_value") == {}:
            continue  # Skip unwanted entry

        if "key_value" in entry:
            # Remove empty key_value entries (on a copy, keeping key order)
            non_empty = {k: v for k, v in entry["key_value"].items() if v}
            entry = {**entry, "key_value": non_empty}

        cleaned.append(entry)
    return cleaned

def merge_and_shuffle_json_files(json_files, root_folder):
    """
    Load each JSON file, clean its entries with clean_data, pool all entries
    and return them in random order.

    Files that cannot be decoded, or whose top-level value is not a list, are
    reported and contribute nothing. root_folder is only used to print paths
    relative to it.
    """
    merged = []

    for path in json_files:
        label = os.path.relpath(path, root_folder)

        with open(path, 'r', encoding='utf-8') as handle:
            try:
                payload = json.load(handle)
            except json.JSONDecodeError:
                print(f"Error: Could not decode {label}")
                continue

        if not isinstance(payload, list):
            print(f"Warning: {label} does not contain a list.")
            continue

        entries = clean_data(payload)
        print(f"File: {label} contains {len(entries)} valid entries.")
        merged.extend(entries)

    random.shuffle(merged)
    return merged

# Settings
root_folder = "sources"
target_filename = "data_sentence_combined_cleaned.json"
output_file = "merged_dataset.json"

# Run
json_files = find_json_files(root_folder, target_filename)

if not json_files:
    # Nothing to merge: stop here instead of overwriting an existing merged
    # dataset with an empty list.
    print("No matching JSON files found.")
else:
    print(f"Found {len(json_files)} files. Cleaning and merging...")

    combined_data = merge_and_shuffle_json_files(json_files, root_folder)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, indent=4, ensure_ascii=False)

    print(f"Successfully merged {len(combined_data)} entries into {output_file}")


Found 5 files. Cleaning and merging...
File: tourism/data_sentence_combined_cleaned.json contains 28 valid entries.
File: education/data_sentence_combined_cleaned.json contains 24 valid entries.
File: healthcare/data_sentence_combined_cleaned.json contains 52 valid entries.
File: ecommerce/data_sentence_combined_cleaned.json contains 33 valid entries.
File: finance/data_sentence_combined_cleaned.json contains 36 valid entries.
Successfully merged 173 entries into merged_dataset.json


In [12]:
import os
import json

def remove_empty_key_value_entries(root_dir):
    """
    Remove entries with an empty 'key_value' dict from every ``.json`` file
    under root_dir (searched recursively) whose top-level value is a list.

    A file is rewritten only when at least one entry was removed, so files
    with nothing to clean keep their original content and formatting.
    Files that cannot be read or parsed are reported and skipped.

    Args:
        root_dir (str): Directory to search.
    """
    for root, _, files in os.walk(root_dir):
        for file in files:
            if not file.endswith(".json"):
                continue

            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                if not isinstance(data, list):
                    continue

                cleaned_data = [
                    entry for entry in data
                    if not (isinstance(entry, dict) and entry.get("key_value") == {})
                ]

                removed = len(data) - len(cleaned_data)
                if removed == 0:
                    continue  # Nothing changed; leave the file untouched

                print(f"Cleaned {removed} entries from: {file_path}")

                # Save back the cleaned data
                with open(file_path, 'w', encoding='utf-8') as f:
                    json.dump(cleaned_data, f, indent=4, ensure_ascii=False)
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

# Change to your root directory
root_directory = "sources"
remove_empty_key_value_entries(root_directory)


Cleaned 6489 entries from: sources/education/data_sentence_combined_cleaned.json
Cleaned 249773 entries from: sources/healthcare/data_sentence_combined_cleaned.json
Cleaned 89433 entries from: sources/ecommerce/data_sentence_combined_cleaned.json
Cleaned 1364 entries from: sources/finance/data_sentence_combined_cleaned.json
