In [None]:
import os
import pandas as pd

def load_csv_files(folder_path, skip_folder=None):
    """
    Recursively load all CSV files from the specified folder and its subfolders into Pandas DataFrames,
    while skipping a specified folder.

    Args:
        folder_path (str): Path to the folder containing CSV files.
        skip_folder (str, optional): Name of the folder to skip (relative to folder_path).

    Returns:
        dict: A dictionary where keys are CSV file paths (relative to folder_path)
              and values are the corresponding Pandas DataFrames.
    """
    dataframes = {}

    # Walk through all directories and files
    for root, dirs, files in os.walk(folder_path):
        # Skip the specified folder
        if skip_folder and skip_folder in dirs:
            dirs.remove(skip_folder)  # This prevents os.walk() from descending into the folder

        for file_name in files:
            if file_name.endswith("data_sentence.csv"):
                file_path = os.path.join(root, file_name)

                # Create a unique name for the DataFrame (relative path without extension)
                rel_path = os.path.relpath(file_path, folder_path)
                dataframe_name = os.path.splitext(rel_path)[0].replace(os.sep, "_")

                try:
                    # Read the CSV file into a Pandas DataFrame
                    df = pd.read_csv(file_path)

                    # Store DataFrame in the dictionary
                    dataframes[dataframe_name] = df

                    print(f"Loaded: {file_path} -> DataFrame: {dataframe_name}")
                except Exception as e:
                    print(f"Could not load: {file_path} -> DataFrame: {dataframe_name}. Error: {e}")

    return dataframes

# Specify the folder containing your CSV files
folder_path = "sources"

# Load all CSV files into DataFrames
dataframes_dict = load_csv_files(folder_path)

# Example: Access a specific DataFrame
for name, df in dataframes_dict.items():
    print(f"DataFrame Name: {name}")
    print(df.head(5))  # Display the first few rows


Loaded: sources/tourism/data_sentence.csv -> DataFrame: tourism_data_sentence
Loaded: sources/education/data_sentence.csv -> DataFrame: education_data_sentence


  df = pd.read_csv(file_path)


Loaded: sources/healthcare/data_sentence.csv -> DataFrame: healthcare_data_sentence


  df = pd.read_csv(file_path)


Loaded: sources/ecommerce/data_sentence.csv -> DataFrame: ecommerce_data_sentence
Loaded: sources/finance/data_sentence.csv -> DataFrame: finance_data_sentence
DataFrame Name: tourism_data_sentence
   Trip ID       Destination Start date   End date  Duration           name  \
0        1        London, UK   5/1/2023   5/8/2023       7.0     John Smith   
1        2  Phuket, Thailand  6/15/2023  6/20/2023       5.0       Jane Doe   
2        3   Bali, Indonesia   7/1/2023   7/8/2023       7.0      David Lee   
3        4     New York, USA  8/15/2023  8/29/2023      14.0  Sarah Johnson   
4        5      Tokyo, Japan  9/10/2023  9/17/2023       7.0     Kim Nguyen   

    age  gender nationality Accommodation type Accommodation cost  \
0  35.0    Male    American              Hotel               1200   
1  28.0  Female    Canadian             Resort                800   
2  45.0    Male      Korean              Villa               1000   
3  29.0  Female     British              Hotel     

In [None]:
for name, df in dataframes_dict.items():
    print(f"\nDataFrame Name: {name}")
    print("Columns:")
    for column in df.columns:
        print(f"  - {column}")


DataFrame Name: tourism_data_sentence
Columns:
  - Trip ID
  - Destination
  - Start date
  - End date
  - Duration
  - name
  - age
  - gender
  - nationality
  - Accommodation type
  - Accommodation cost
  - Transportation type
  - Transportation cost
  - sentence
  - creative_sentence

DataFrame Name: education_data_sentence
Columns:
  - index
  - age
  - workclass
  - education
  - marital-status
  - occupation
  - relationship
  - race
  - sex
  - capital-gain
  - capital-loss
  - hours-per-week
  - native-country
  - salary
  - sentence
  - creative_sentence

DataFrame Name: healthcare_data_sentence
Columns:
  - Index
  - start_date
  - end_date
  - drug_type
  - drug_name
  - drug_name_poe
  - drug_name_generic
  - formulary_code
  - gsn
  - ndc
  - product_strength
  - dose_value
  - dose_unit
  - form_value_dispensed
  - form_unit_dispensed
  - route
  - admit_time
  - discharge_time
  - admission_type
  - admission_location
  - discharge_location
  - insurance
  - language
 

In [None]:
for name, df in dataframes_dict.items():
    print(f"\n - {name}")


 - tourism_data_sentence

 - education_data_sentence

 - healthcare_data_sentence

 - ecommerce_data_sentence

 - finance_data_sentence


In [None]:
dataframes_dict['tourism_data_sentence'].drop(["sentence","creative_sentence"],axis=1,inplace=True)
dataframes_dict['education_data_sentence'].drop(["sentence","creative_sentence","workclass","relationship","sex","capital-gain","capital-loss"],axis=1,inplace=True)
dataframes_dict['healthcare_data_sentence'].drop(["sentence","creative_sentence","drug_name_poe","form_value_dispensed","form_unit_dispensed","ed_registration_time","ed_exit_time","has_chartevents_data","expired_in_hospital"],axis=1,inplace=True)
dataframes_dict['ecommerce_data_sentence'].drop(["sentence","creative_sentence","commission_code","order_discount","order_value_display","fiscal_year","business_status"],axis=1,inplace=True)
dataframes_dict['finance_data_sentence'].drop(["sentence","Avg_Transaction_Value","Max_Transaction_Value","Min_Transaction_Value","Last_Transaction_Days_Ago","Referral_Count","App_Usage_Frequency","Support_Tickets_Raised","Issue_Resolution_Time"],axis=1,inplace=True)


In [None]:
print(len(dataframes_dict['tourism_data_sentence']))
dataframes_dict['tourism_data_sentence'].head(5)

136


Unnamed: 0,Trip ID,Destination,Start date,End date,Duration,name,age,gender,nationality,Accommodation type,Accommodation cost,Transportation type,Transportation cost
0,1,"London, UK",5/1/2023,5/8/2023,7.0,John Smith,35.0,Male,American,Hotel,1200,Flight,600
1,2,"Phuket, Thailand",6/15/2023,6/20/2023,5.0,Jane Doe,28.0,Female,Canadian,Resort,800,Flight,500
2,3,"Bali, Indonesia",7/1/2023,7/8/2023,7.0,David Lee,45.0,Male,Korean,Villa,1000,Flight,700
3,4,"New York, USA",8/15/2023,8/29/2023,14.0,Sarah Johnson,29.0,Female,British,Hotel,2000,Flight,1000
4,5,"Tokyo, Japan",9/10/2023,9/17/2023,7.0,Kim Nguyen,26.0,Female,Vietnamese,Airbnb,700,Train,200


In [None]:
print(len(dataframes_dict['education_data_sentence']))
dataframes_dict['education_data_sentence'].head(5)

32561


Unnamed: 0,index,age,education,marital-status,occupation,race,hours-per-week,native-country,salary
0,1,39,Bachelors,Never-married,Adm-clerical,White,40,United-States,<=50K
1,2,50,Bachelors,Married-civ-spouse,Exec-managerial,White,13,United-States,<=50K
2,3,38,HS-grad,Divorced,Handlers-cleaners,White,40,United-States,<=50K
3,4,53,11th,Married-civ-spouse,Handlers-cleaners,Black,40,United-States,<=50K
4,5,28,Bachelors,Married-civ-spouse,Prof-specialty,Black,40,Cuba,<=50K


In [None]:
print(len(dataframes_dict['healthcare_data_sentence']))
dataframes_dict['healthcare_data_sentence'].head(5)

1249121


Unnamed: 0,Index,start_date,end_date,drug_type,drug_name,drug_name_generic,formulary_code,gsn,ndc,product_strength,...,discharge_time,admission_type,admission_location,discharge_location,insurance,language,religion,marital_status,ethnicity,diagnosis
0,0,2138-07-18 00:00:00,2138-07-20 00:00:00,MAIN,NEO*IV*Gentamicin,,GENT10I,9298.0,63323020000.0,10mg/mL-2mL,...,2138-07-21 15:48:00,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,NOT SPECIFIED,,ASIAN,NEWBORN
1,1,2138-07-18 00:00:00,2138-07-20 00:00:00,BASE,Syringe (Neonatal) *D5W*,,NEOSYRD5W,,0.0,1 Syringe,...,2138-07-21 15:48:00,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,NOT SPECIFIED,,ASIAN,NEWBORN
2,2,2138-07-18 00:00:00,2138-07-21 00:00:00,MAIN,Ampicillin Sodium,,AMP500I,8937.0,63323040000.0,500mg Vial,...,2138-07-21 15:48:00,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,NOT SPECIFIED,,ASIAN,NEWBORN
3,3,2138-07-18 00:00:00,2138-07-21 00:00:00,BASE,Send 500mg Vial,,AMPVL,,0.0,Send 500mg Vial,...,2138-07-21 15:48:00,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,NOT SPECIFIED,,ASIAN,NEWBORN
4,4,,,,,,,,,,...,2103-02-04 12:15:00,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,BUDDHIST,,ASIAN,NEWBORN


In [None]:
print(len(dataframes_dict['ecommerce_data_sentence']))
dataframes_dict['ecommerce_data_sentence'].head(5)

447326


Unnamed: 0,index,order_status,order_date,product_sku,product_price,quantity_ordered,order_total,product_category,payment_type,processing_date,order_year,order_month,customer_since,month_year
0,1,complete,7/1/2016,kreations_YI 06-L,1950.0,1.0,1950.0,Women's Fashion,cod,7/1/2016,2016.0,7.0,2016-7,7-2016
1,2,canceled,7/1/2016,kcc_Buy 2 Frey Air Freshener & Get 1 Kasual Bo...,240.0,1.0,240.0,Beauty & Grooming,cod,7/1/2016,2016.0,7.0,2016-7,7-2016
2,3,canceled,7/1/2016,Ego_UP0017-999-MR0,2450.0,1.0,2450.0,Women's Fashion,cod,7/1/2016,2016.0,7.0,2016-7,7-2016
3,4,complete,7/1/2016,kcc_krone deal,360.0,1.0,60.0,Beauty & Grooming,cod,7/1/2016,2016.0,7.0,2016-7,7-2016
4,5,order_refunded,7/1/2016,BK7010400AG,555.0,2.0,1110.0,Soghaat,cod,7/1/2016,2016.0,7.0,2016-7,7-2016


In [None]:
print(len(dataframes_dict['finance_data_sentence']))
dataframes_dict['finance_data_sentence'].head(5)

7000


Unnamed: 0,Customer_ID,Age,Location,Income_Level,Total_Transactions,Total_Spent,Active_Days,Loyalty_Points_Earned,Cashback_Received,Preferred_Payment_Method,Customer_Satisfaction_Score
0,cust_0000,54,Urban,Low,192,3213386.0,140,2114,2224.01214,Debit Card,1
1,cust_0001,67,Suburban,High,979,14231460.0,229,2960,4026.823518,UPI,8
2,cust_0002,44,Urban,High,329,2323192.0,73,3170,1441.011395,Debit Card,4
3,cust_0003,30,Rural,High,71,1166308.0,299,4756,4365.85558,Wallet Balance,1
4,cust_0004,58,Urban,Middle,878,9482481.0,236,1992,4161.523827,UPI,5


In [None]:
df_clean = dataframes_dict['healthcare_data_sentence'].dropna()

# Step 2: Select the first 120 rows from the cleaned DataFrame
df_selected = df_clean.head(120)
df_selected.drop(["Index"],axis=1,inplace=True)
df_selected['index'] = range(1, len(df_selected) + 1)
dataframes_dict['healthcare_data_sentence'] = df_selected
# Move 'index' column to the beginning
dataframes_dict['healthcare_data_sentence'] = dataframes_dict['healthcare_data_sentence'][['index'] + [col for col in dataframes_dict['healthcare_data_sentence'].columns if col != 'index']]

dataframes_dict['healthcare_data_sentence']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected.drop(["Index"],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['index'] = range(1, len(df_selected) + 1)


Unnamed: 0,index,start_date,end_date,drug_type,drug_name,drug_name_generic,formulary_code,gsn,ndc,product_strength,...,discharge_time,admission_type,admission_location,discharge_location,insurance,language,religion,marital_status,ethnicity,diagnosis
340,1,2157-10-21 00:00:00,2157-10-23 00:00:00,MAIN,Dexamethasone,Dexamethasone,DEXA4,006789,5.481752e+07,4mg Tab,...,2157-10-25 14:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,BRAIN MASS
350,2,2157-10-21 00:00:00,2157-10-25 00:00:00,MAIN,HydrALAzine,HydrALAzine,HYDZ20I,000283,5.170901e+08,20mg/mL Vial,...,2157-10-25 14:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,BRAIN MASS
352,3,2157-10-22 00:00:00,2157-10-22 00:00:00,MAIN,Heparin,Heparin Sodium,HEPA5I,006549,6.332303e+10,5000 Units / mL- 1mL Vial,...,2157-10-25 14:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,BRAIN MASS
360,4,2157-10-23 00:00:00,2157-10-25 00:00:00,MAIN,Dexamethasone,Dexamethasone,DEXA2,006788,5.481762e+07,2mg Tab,...,2157-10-25 14:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,BRAIN MASS
362,5,2157-10-25 00:00:00,2157-10-24 00:00:00,MAIN,Dexamethasone,Dexamethasone,DEXA2,006788,5.481762e+07,2mg Tab,...,2157-10-25 14:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,BRAIN MASS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7016,116,2153-09-26 00:00:00,2153-09-28 00:00:00,MAIN,Morphine Sulfate,Morphine Sulfate (Syringe),MORP2I,004070,4.091762e+08,2mg Syringe,...,2153-09-28 18:48:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Private,ENGL,CATHOLIC,SINGLE,UNKNOWN/NOT SPECIFIED,HYPERTROPHIC CARDIOMYOPATHY\ETHANOL SEPTAL ABL...
7017,117,2153-09-26 00:00:00,2153-09-28 00:00:00,MAIN,Metoprolol XL,Metoprolol XL,TOPR100,016600,1.861092e+08,100mg XL Tab,...,2153-09-28 18:48:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Private,ENGL,CATHOLIC,SINGLE,UNKNOWN/NOT SPECIFIED,HYPERTROPHIC CARDIOMYOPATHY\ETHANOL SEPTAL ABL...
7018,118,2153-09-26 00:00:00,2153-09-28 00:00:00,MAIN,Oxycodone-Acetaminophen,Oxycodone-Acetaminophen,PERC,004222,4.060513e+08,5mg/325mg Tablet,...,2153-09-28 18:48:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Private,ENGL,CATHOLIC,SINGLE,UNKNOWN/NOT SPECIFIED,HYPERTROPHIC CARDIOMYOPATHY\ETHANOL SEPTAL ABL...
7019,119,2153-09-26 00:00:00,2153-09-28 00:00:00,MAIN,Verapamil HCl,Verapamil HCl,VERA40,000565,5.910404e+08,40mg Tab,...,2153-09-28 18:48:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Private,ENGL,CATHOLIC,SINGLE,UNKNOWN/NOT SPECIFIED,HYPERTROPHIC CARDIOMYOPATHY\ETHANOL SEPTAL ABL...


In [None]:
df_clean = dataframes_dict['tourism_data_sentence'].dropna()

# Step 2: Select the first 120 rows from the cleaned DataFrame
df_selected = df_clean.head(120)
df_selected.drop(["Trip ID"],axis=1,inplace=True)
df_selected['index'] = range(1, len(df_selected) + 1)
dataframes_dict['tourism_data_sentence'] = df_selected
# Move 'index' column to the beginning
dataframes_dict['tourism_data_sentence'] = dataframes_dict['tourism_data_sentence'][['index'] + [col for col in dataframes_dict['tourism_data_sentence'].columns if col != 'index']]

dataframes_dict['tourism_data_sentence']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected.drop(["Trip ID"],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['index'] = range(1, len(df_selected) + 1)


Unnamed: 0,index,Destination,Start date,End date,Duration,name,age,gender,nationality,Accommodation type,Accommodation cost,Transportation type,Transportation cost
0,1,"London, UK",5/1/2023,5/8/2023,7.0,John Smith,35.0,Male,American,Hotel,1200,Flight,600
1,2,"Phuket, Thailand",6/15/2023,6/20/2023,5.0,Jane Doe,28.0,Female,Canadian,Resort,800,Flight,500
2,3,"Bali, Indonesia",7/1/2023,7/8/2023,7.0,David Lee,45.0,Male,Korean,Villa,1000,Flight,700
3,4,"New York, USA",8/15/2023,8/29/2023,14.0,Sarah Johnson,29.0,Female,British,Hotel,2000,Flight,1000
4,5,"Tokyo, Japan",9/10/2023,9/17/2023,7.0,Kim Nguyen,26.0,Female,Vietnamese,Airbnb,700,Train,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,116,"Paris, France",3/15/2022,3/22/2022,7.0,Bob Johnson,47.0,Male,Canadian,Hotel,1200,Train,500
116,117,"Sydney, Aus",5/1/2022,5/12/2022,11.0,Cindy Chen,26.0,Female,Chinese,Airbnb,800,Plane,1000
117,118,"Rome, Italy",6/10/2022,6/17/2022,7.0,David Lee,38.0,Male,Korean,Hotel,900,Train,400
118,119,"Bali, Indonesia",7/20/2022,7/30/2022,10.0,Emily Kim,29.0,Female,Korean,Hostel,500,Plane,800


In [None]:
df_clean = dataframes_dict['education_data_sentence'].dropna()

# Step 2: Select the first 120 rows from the cleaned DataFrame
df_selected = df_clean.head(120)
df_selected.drop(["index"],axis=1,inplace=True)
df_selected['index'] = range(1, len(df_selected) + 1)
dataframes_dict['education_data_sentence'] = df_selected
# Move 'index' column to the beginning
dataframes_dict['education_data_sentence'] = dataframes_dict['education_data_sentence'][['index'] + [col for col in dataframes_dict['education_data_sentence'].columns if col != 'index']]

dataframes_dict['education_data_sentence']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected.drop(["index"],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['index'] = range(1, len(df_selected) + 1)


Unnamed: 0,index,age,education,marital-status,occupation,race,hours-per-week,native-country,salary
0,1,39,Bachelors,Never-married,Adm-clerical,White,40,United-States,<=50K
1,2,50,Bachelors,Married-civ-spouse,Exec-managerial,White,13,United-States,<=50K
2,3,38,HS-grad,Divorced,Handlers-cleaners,White,40,United-States,<=50K
3,4,53,11th,Married-civ-spouse,Handlers-cleaners,Black,40,United-States,<=50K
4,5,28,Bachelors,Married-civ-spouse,Prof-specialty,Black,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...
115,116,53,9th,Married-civ-spouse,Handlers-cleaners,White,50,United-States,<=50K
116,117,56,Some-college,Married-civ-spouse,Sales,White,50,United-States,<=50K
117,118,49,Assoc-voc,Married-civ-spouse,Craft-repair,Black,40,United-States,>50K
118,119,55,Some-college,Married-civ-spouse,Sales,White,56,United-States,<=50K


In [None]:
dataframes_dict['education_data_sentence'].rename(columns={'native-country': 'country'}, inplace=True)


In [None]:
df_clean = dataframes_dict['ecommerce_data_sentence'].dropna()

# Step 2: Select the first 120 rows from the cleaned DataFrame
df_selected = df_clean.head(120)
df_selected.drop(["index"],axis=1,inplace=True)
df_selected['index'] = range(1, len(df_selected) + 1)
dataframes_dict['ecommerce_data_sentence'] = df_selected
# Move 'index' column to the beginning
dataframes_dict['ecommerce_data_sentence'] = dataframes_dict['ecommerce_data_sentence'][['index'] + [col for col in dataframes_dict['ecommerce_data_sentence'].columns if col != 'index']]

dataframes_dict['ecommerce_data_sentence']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected.drop(["index"],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['index'] = range(1, len(df_selected) + 1)


Unnamed: 0,index,order_status,order_date,product_sku,product_price,quantity_ordered,order_total,product_category,payment_type,processing_date,order_year,order_month,customer_since,month_year
0,1,complete,7/1/2016,kreations_YI 06-L,1950.0,1.0,1950.0,Women's Fashion,cod,7/1/2016,2016.0,7.0,2016-7,7-2016
1,2,canceled,7/1/2016,kcc_Buy 2 Frey Air Freshener & Get 1 Kasual Bo...,240.0,1.0,240.0,Beauty & Grooming,cod,7/1/2016,2016.0,7.0,2016-7,7-2016
2,3,canceled,7/1/2016,Ego_UP0017-999-MR0,2450.0,1.0,2450.0,Women's Fashion,cod,7/1/2016,2016.0,7.0,2016-7,7-2016
3,4,complete,7/1/2016,kcc_krone deal,360.0,1.0,60.0,Beauty & Grooming,cod,7/1/2016,2016.0,7.0,2016-7,7-2016
4,5,order_refunded,7/1/2016,BK7010400AG,555.0,2.0,1110.0,Soghaat,cod,7/1/2016,2016.0,7.0,2016-7,7-2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,116,canceled,7/1/2016,Al Muhafiz Sohan Halwa Almond,350.0,1.0,350.0,Soghaat,cod,7/1/2016,2016.0,7.0,2016-7,7-2016
116,117,canceled,7/1/2016,sst_Lyquin-Regular fit-Large,1950.0,1.0,2745.0,Men's Fashion,ublcreditcard,7/1/2016,2016.0,7.0,2016-7,7-2016
117,118,canceled,7/1/2016,Fcafe_11777-L,795.0,1.0,2745.0,Men's Fashion,ublcreditcard,7/1/2016,2016.0,7.0,2016-7,7-2016
118,119,complete,7/1/2016,LC_359547105042,4750.0,1.0,12150.0,Beauty & Grooming,cod,7/1/2016,2016.0,7.0,2016-7,7-2016


In [None]:
df_clean = dataframes_dict['finance_data_sentence'].dropna()

# Step 2: Select the first 120 rows from the cleaned DataFrame
df_selected = df_clean.head(120)
df_selected.drop(["Customer_ID"],axis=1,inplace=True)
df_selected['index'] = range(1, len(df_selected) + 1)
dataframes_dict['finance_data_sentence'] = df_selected
# Move 'index' column to the beginning
dataframes_dict['finance_data_sentence'] = dataframes_dict['finance_data_sentence'][['index'] + [col for col in dataframes_dict['finance_data_sentence'].columns if col != 'index']]

dataframes_dict['finance_data_sentence']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected.drop(["Customer_ID"],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['index'] = range(1, len(df_selected) + 1)


Unnamed: 0,index,Age,Location,Income_Level,Total_Transactions,Total_Spent,Active_Days,Loyalty_Points_Earned,Cashback_Received,Preferred_Payment_Method,Customer_Satisfaction_Score
0,1,54,Urban,Low,192,3.213386e+06,140,2114,2224.012140,Debit Card,1
1,2,67,Suburban,High,979,1.423146e+07,229,2960,4026.823518,UPI,8
2,3,44,Urban,High,329,2.323192e+06,73,3170,1441.011395,Debit Card,4
3,4,30,Rural,High,71,1.166308e+06,299,4756,4365.855580,Wallet Balance,1
4,5,58,Urban,Middle,878,9.482481e+06,236,1992,4161.523827,UPI,5
...,...,...,...,...,...,...,...,...,...,...,...
115,116,39,Urban,Middle,599,3.962774e+06,95,2147,4093.283791,Debit Card,10
116,117,26,Rural,Middle,897,8.518076e+06,199,1302,1818.347786,Debit Card,9
117,118,66,Suburban,Middle,19,2.653046e+04,146,3075,2999.723174,Wallet Balance,8
118,119,32,Urban,Low,46,4.889507e+04,155,4805,242.415140,Credit Card,3


In [None]:
def row_to_sentence(row,primary_key=""):
    # Start the sentence with an introductory phrase
    sentence = ""
    
    # Dynamically iterate over all columns
    for col in row.index:
        if col!=primary_key:
            value = str(row[col]).strip()  # Ensure value is a string and remove leading/trailing spaces
            if value.lower() != "nan":  # Skip NaN values
                col = col.lower()
                value = value.strip("'")  # Remove surrounding single quotes if present
                value = value.lower()
                sentence += f"{col} is {value}, "
    
    # Remove the trailing comma and space, then end with a period
    sentence = sentence.rstrip(", ") + "."
    return sentence


for key in dataframes_dict:
    df = dataframes_dict[key]
    df['sentence'] = df.apply(lambda row: row_to_sentence(row, primary_key="index"), axis=1)
    dataframes_dict[key] = df


dataframes_dict['tourism_data_sentence'].to_csv("sources/tourism/data2_sentence.csv",index=False)
dataframes_dict['healthcare_data_sentence'].to_csv("sources/healthcare/data2_sentence.csv",index=False)
dataframes_dict['education_data_sentence'].to_csv("sources/education/data2_sentence.csv",index=False)
dataframes_dict['ecommerce_data_sentence'].to_csv("sources/ecommerce/data2_sentence.csv",index=False)
dataframes_dict['finance_data_sentence'].to_csv("sources/finance/data2_sentence.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentence'] = df.apply(lambda row: row_to_sentence(row, primary_key="index"), axis=1)


*Run from here*:

In [1]:
import pandas as pd
dataframes_dict = {}
dataframes_dict['tourism_data_sentence'] = pd.read_csv("sources/tourism/data2_sentence.csv")
dataframes_dict['healthcare_data_sentence'] = pd.read_csv("sources/healthcare/data2_sentence.csv")
dataframes_dict['education_data_sentence'] = pd.read_csv("sources/education/data2_sentence.csv")
dataframes_dict['ecommerce_data_sentence'] = pd.read_csv("sources/ecommerce/data2_sentence.csv")
dataframes_dict['finance_data_sentence'] = pd.read_csv("sources/finance/data2_sentence.csv")

In [2]:
import torch
torch.cuda.empty_cache()
import warnings
warnings.simplefilter("ignore")

In [3]:
import pandas as pd
import os
from transformers import LlamaTokenizer, LlamaForCausalLM, AutoConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import random

2025-05-07 00:22:24.041069: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-07 00:22:24.063754: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746577344.088988 2334953 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746577344.096459 2334953 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746577344.117000 2334953 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [4]:
class Model:
    def __init__(self):
        device = torch.device("cuda:0")
        model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
        self.model = LlamaForCausalLM.from_pretrained(
            model_name,# config = config, 
            torch_dtype=torch.float16,
            device_map='auto',
        ).to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.eos_token = self.tokenizer.pad_token  # Set PAD token to EOS
        
    def predict(self,prompt, user_prompt=""):
        model,tokenizer = self.model, self.tokenizer
        temp = random.random()
        messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": user_prompt}
        ]
        prompt = tokenizer.apply_chat_template(messages, tokenize=False)
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        # print(inputs)
        generate_ids = model.generate(**inputs, max_new_tokens=4096, do_sample=True, temperature=temp,pad_token_id=tokenizer.eos_token_id) # Disable sampling for deterministic output
        generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
        infer_res = tokenizer.decode(generate_ids)
        return infer_res
        
    def enhance_sentence_with_llama(self,sentence):
        model = self.model
        # Construct the prompt
        system_prompt = "You are a creative AI that rephrases given sentences into engaging, conversational stories while incorporating all provided datapoints. Ensure that no information is omitted or added, and skip any datapoints labeled as 'nan'. Do not rephrase the object of a sentence. For example, if the sentence is 'start date is 9/22/2023', do not change the date to a different format. Respond only with the rephrased sentence without any additional commentary."
        user_prompt = f"""
    Rephrase the following sentence into a conversational story, ensuring all datapoints are included while skipping 'nan' values. Do not introduce any extra or false details.
    
    Original sentence: {sentence}
    
    Creative sentence:"""
        creative_sentence = self.predict(system_prompt,user_prompt)
        creative_sentence = creative_sentence.replace("<|start_header_id|>assistant<|end_header_id|>\n\n", "")
        
        # Extract only the generated part
        # creative_sentence = response.split("Creative sentence:")[-1].strip()
        return creative_sentence

In [5]:
model = None
# model.enhance_sentence_with_llama("Row ID 348: ROW_ID_x is '1136896.0', SUBJECT_ID is '23', HADM_ID_x is '124321.0', ICUSTAY_ID is '234044.0', STARTDATE is '2157-10-21 00:00:00', ENDDATE is '2157-10-25 00:00:00', DRUG_TYPE is 'MAIN', DRUG is 'Sodium Chloride 0.9%  Flush', DRUG_NAME_POE is 'Sodium Chloride 0.9%  Flush', DRUG_NAME_GENERIC is 'Sodium Chloride 0.9%  Flush', FORMULARY_DRUG_CD is 'NACLFLUSH', GSN is 'nan', NDC is '0.0', PROD_STRENGTH is 'Syringe', DOSE_VAL_RX is '3', DOSE_UNIT_RX is 'mL', FORM_VAL_DISP is '0.6', FORM_UNIT_DISP is 'SYR', ROUTE is 'IV', ROW_ID_y is '23.0', HADM_ID_y is '124321.0', ADMITTIME is '2157-10-18 19:34:00', DISCHTIME is '2157-10-25 14:00:00', DEATHTIME is 'nan', ADMISSION_TYPE is 'EMERGENCY', ADMISSION_LOCATION is 'TRANSFER FROM HOSP/EXTRAM', DISCHARGE_LOCATION is 'HOME HEALTH CARE', INSURANCE is 'Medicare', LANGUAGE is 'ENGL', RELIGION is 'CATHOLIC', MARITAL_STATUS is 'MARRIED', ETHNICITY is 'WHITE', EDREGTIME is 'nan', EDOUTTIME is 'nan', DIAGNOSIS is 'BRAIN MASS', HOSPITAL_EXPIRE_FLAG is '0.0', HAS_CHARTEVENTS_DATA is '1.0'.")

In [6]:
# Define the checkpoint function
def process_with_checkpoint(model,df, checkpoint_file, start_index=0, batch_size=10):
    # Load existing checkpoint if it exists
    if os.path.exists(checkpoint_file):
        df = pd.read_csv(checkpoint_file)
        print("Loaded existing checkpoint.")
        if 'creative_sentence' not in df.columns:
            df['creative_sentence'] = None
        
        # df = df.head(100)
    
    try:
        # Process the dataframe in batches
        for i in range(start_index, len(df), batch_size):
            # Process a batch of rows
            batch = df.iloc[i:i + batch_size]
            
            for idx, row in batch.iterrows():
                if pd.isna(row['creative_sentence']):  # Only process rows not yet completed
                    df.at[idx, 'creative_sentence'] = model.enhance_sentence_with_llama(row['sentence'])
            
            # Save progress after processing each batch
            df.to_csv(checkpoint_file, index=False)
            print(f"Checkpoint saved at row {i + batch_size}.")
    except Exception as e:
        print(f"Error occurred: {e}")
        # Save the checkpoint if an error occurs
        df.to_csv(checkpoint_file, index=False)
        print("Checkpoint saved after error.")
    
    return df

In [7]:
import json
import pandas as pd
import numpy as np

def dataframe_to_json(shortened_df, dataset_path, primary_key):
    json_data = []
    shortened_df = shortened_df.drop(columns=['sentence'])
    
    for _, row in shortened_df.iterrows():
        # Extract the primary key value if it's valid
        primary_id = str(row[primary_key]).lower() if primary_key in row and pd.notna(row[primary_key]) and str(row[primary_key]).lower() != "nan" else None
        
        # Build key-value pairs, skipping 'creative_sentence' and the primary key
        key_value = {
            col.lower(): (primary_id, str(row[col]).lower()) 
            for col in shortened_df.columns 
            if col not in ["creative_sentence", primary_key] and pd.notna(row[col]) and str(row[col]).lower() != "nan"
        }

        entities = list(key_value.keys())

        json_data.append({
            "text": row["creative_sentence"],
            "entities": entities,
            "key_value": key_value
        })

    with open(f"{dataset_path}.json", 'w') as json_file:
        json.dump(json_data, json_file, indent=4)
    
    print("JSON file saved successfully!")


In [8]:
import json

def combine_jsons(dataset_path):
    # Load the JSON file
    with open(f"{dataset_path}.json", "r") as json_file:
        data = json.load(json_file)
    
    # Combine every 5 entries
    combined_data = []
    batch_size = 5  # Number of entries to combine
    
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]  # Take a batch of 5
    
        # Merge text fields
        combined_text = "\n".join(str(entry["text"]) for entry in batch)
    
        # Merge unique entities
        combined_entities = list(set(entity for entry in batch for entity in entry["entities"]))
    
        # Merge key-value pairs, keeping all values in a list
        combined_key_value = {}
    
        for entry in batch:
            for key, value in entry["key_value"].items():
                if key in combined_key_value:
                    if value not in combined_key_value[key]:  # Avoid duplicate values
                        combined_key_value[key].append(value)
                else:
                    combined_key_value[key] = [value]
        difficulty = None
        domain = None
        if "education" in dataset_path:
            difficulty = "medium"
            domain = "education"
        elif "tourism" in dataset_path:
            difficulty = "easy"
            domain = "tourism"
        elif "ecommerce" in dataset_path:
            difficulty = "medium"
            domain = "ecommerce"
        elif "healthcare" in dataset_path:
            difficulty = "hard"
            domain = "healthcare"
        elif "finance" in dataset_path:
            difficulty = "medium"
            domain = "finance"
        # Append combined entry
        combined_data.append({
            "text": combined_text,
            "entities": combined_entities,
            "key_value": combined_key_value,
            "difficulty": difficulty,
            "domain": domain
        })
    
    # Save the new JSON file
    with open(f"{dataset_path}_combined.json", "w") as json_file:
        json.dump(combined_data, json_file, indent=4)
    
    print("Combined JSON file saved successfully!")


In [9]:
import json

# Load the combined JSON file
def clean_combined_jsons(dataset_path):
    with open(f"{dataset_path}_combined.json", "r") as json_file:
        combined_data = json.load(json_file)
    
    # Function to clean entities and key-value pairs
    def clean_entry(entry):
        text = entry["text"].lower()  # Convert text to lowercase for case-insensitive matching
    
        # Keep only entities that exist in the text
        filtered_entities = [entity for entity in entry["entities"] if entity.lower() in text]
    
        # Initialize filtered key-value store
        filtered_key_value = {}
    
        for key, value_list in entry["key_value"].items():
            if isinstance(value_list, list):
                valid_pairs = []
                
                for pair in value_list:
                    if isinstance(pair, list) and len(pair) == 2:  # Ensure it's a (trip_id, value) structure
                        trip_id, actual_value = pair
                        
                        # Keep only pairs where the value appears in the text
                        if str(actual_value).lower() in text:
                            valid_pairs.append([trip_id, actual_value])
                
                # Only add key if it has valid values
                if valid_pairs:
                    filtered_key_value[key] = valid_pairs

        # Return cleaned entry
        return {
            "text": entry["text"],
            "entities": filtered_entities,
            "key_value": filtered_key_value,
            "difficulty": entry["difficulty"],
            "domain": entry["domain"]
            
        }
    # Apply the cleaning function to each entry
    cleaned_data = [clean_entry(entry) for entry in combined_data]
    
    # Save the cleaned JSON file
    with open(f"{dataset_path}_combined_cleaned.json", "w") as json_file:
        json.dump(cleaned_data, json_file, indent=4)
    
    print("Cleaned JSON file saved successfully!")


In [10]:
for name, df in dataframes_dict.items():
    row_count = df.shape[0]
    print("column_count in",name,row_count)
    nan_rows = df[df.isna().any(axis=1)]
    print("nan_rows in",name,len(nan_rows))

column_count in tourism_data_sentence 120
nan_rows in tourism_data_sentence 0
column_count in healthcare_data_sentence 120
nan_rows in healthcare_data_sentence 0
column_count in education_data_sentence 120
nan_rows in education_data_sentence 0
column_count in ecommerce_data_sentence 120
nan_rows in ecommerce_data_sentence 0
column_count in finance_data_sentence 120
nan_rows in finance_data_sentence 0


In [11]:
for name, df in dataframes_dict.items():
    df.dropna(axis=1, thresh=len(df) * 0.5, inplace=True)  # Drops columns with >50% NaNs

In [12]:
for name, df in dataframes_dict.items():
    row_count = df.shape[0]
    print("column_count in",name,row_count)
    nan_rows = df[df.isna().any(axis=1)]
    print("nan_rows in",name,len(nan_rows))

column_count in tourism_data_sentence 120
nan_rows in tourism_data_sentence 0
column_count in healthcare_data_sentence 120
nan_rows in healthcare_data_sentence 0
column_count in education_data_sentence 120
nan_rows in education_data_sentence 0
column_count in ecommerce_data_sentence 120
nan_rows in ecommerce_data_sentence 0
column_count in finance_data_sentence 120
nan_rows in finance_data_sentence 0


In [13]:
# Define the base directory
base_dir = "sources"

# Sort the dictionary by DataFrame length in ascending order
names = ['tourism','healthcare','education','ecommerce','finance'] #'ecommerce','education'

# Loop through each DataFrame in sorted order
for name in names:
    # Construct the checkpoint file path dynamically
    checkpoint_path = f"{base_dir}/{name}/data2_sentence.csv"

    print(f"Processing: {checkpoint_path}")

    # Read the CSV file into a DataFrame
    df = pd.read_csv(checkpoint_path)

    # Process the DataFrame with checkpointing
    df = process_with_checkpoint(model, df, checkpoint_file=checkpoint_path, batch_size=20)

    # Save the final result back to the same CSV file
    df.to_csv(checkpoint_path, index=False)

    print(f"Saved: {checkpoint_path}")


Processing: sources/tourism/data2_sentence.csv
Loaded existing checkpoint.
Checkpoint saved at row 20.
Checkpoint saved at row 40.
Checkpoint saved at row 60.
Checkpoint saved at row 80.
Checkpoint saved at row 100.
Checkpoint saved at row 120.
Saved: sources/tourism/data2_sentence.csv
Processing: sources/healthcare/data2_sentence.csv
Loaded existing checkpoint.
Checkpoint saved at row 20.
Checkpoint saved at row 40.
Checkpoint saved at row 60.
Checkpoint saved at row 80.
Checkpoint saved at row 100.
Checkpoint saved at row 120.
Saved: sources/healthcare/data2_sentence.csv
Processing: sources/education/data2_sentence.csv
Loaded existing checkpoint.
Checkpoint saved at row 20.
Checkpoint saved at row 40.
Checkpoint saved at row 60.
Checkpoint saved at row 80.
Checkpoint saved at row 100.
Checkpoint saved at row 120.
Saved: sources/education/data2_sentence.csv
Processing: sources/ecommerce/data2_sentence.csv
Loaded existing checkpoint.
Checkpoint saved at row 20.
Checkpoint saved at row 

In [14]:
import pandas as pd
base_dir = "sources"

names = ['tourism','healthcare','education','ecommerce','finance'] #

# Loop through each DataFrame in sorted order
for name in names:
    # Construct the checkpoint file path dynamically
    checkpoint_path = f"{base_dir}/{name}/data2_sentence"
    primary_id = "index"

    print(f"Processing: {checkpoint_path}")

    # Read the CSV file into a DataFrame
    df = pd.read_csv(checkpoint_path + ".csv")
    
    dataframe_to_json(df,checkpoint_path,primary_id)
    combine_jsons(checkpoint_path)
    # clean_combined_jsons(checkpoint_path)

Processing: sources/tourism/data2_sentence
JSON file saved successfully!
Combined JSON file saved successfully!
Processing: sources/healthcare/data2_sentence
JSON file saved successfully!
Combined JSON file saved successfully!
Processing: sources/education/data2_sentence
JSON file saved successfully!
Combined JSON file saved successfully!
Processing: sources/ecommerce/data2_sentence
JSON file saved successfully!
Combined JSON file saved successfully!
Processing: sources/finance/data2_sentence
JSON file saved successfully!
Combined JSON file saved successfully!


In [20]:
import os
import json
import random

def find_json_files(root_folder, target_filename):
    """Recursively find all files named target_filename in root_folder."""
    json_files = []
    
    for root, _, files in os.walk(root_folder):
        for file in files:
            if file == target_filename:
                file_path = os.path.join(root, file)
                json_files.append(file_path)

    return json_files

def clean_data(data):
    """Remove entries with empty 'text', 'entities', and 'key_value', and clean empty keys in 'key_value'."""
    cleaned = []
    for entry in data:
        if (
            entry.get("entities") == [] and
            entry.get("key_value") == {}
        ):
            continue  # Skip unwanted entry

        if "key_value" in entry:
            # Remove empty key_value entries
            entry["key_value"] = {k: v for k, v in entry["key_value"].items() if v}

        cleaned.append(entry)
    return cleaned

def merge_and_shuffle_json_files(json_files, root_folder):
    """Merge and shuffle all lists of dictionaries from found JSON files after cleaning."""
    combined_data = []
    
    for file in json_files:
        relative_path = os.path.relpath(file, root_folder)
        with open(file, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
                if isinstance(data, list):
                    data = clean_data(data)
                    print(f"File: {relative_path} contains {len(data)} valid entries.")
                    combined_data.extend(data)
                else:
                    print(f"Warning: {relative_path} does not contain a list.")
            except json.JSONDecodeError:
                print(f"Error: Could not decode {relative_path}")

    random.shuffle(combined_data)
    return combined_data

# Settings
root_folder = "sources"
target_filename = "data2_sentence_combined.json"

# Run
json_files = find_json_files(root_folder, target_filename)

if not json_files:
    print("No matching JSON files found.")

print(f"Found {len(json_files)} files. Cleaning and merging...")

combined_data = merge_and_shuffle_json_files(json_files, root_folder)

output_file = "merged_dataset2.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(combined_data, f, indent=4, ensure_ascii=False)

print(f"Successfully merged {len(combined_data)} entries into {output_file}")


Found 5 files. Cleaning and merging...
File: tourism/data2_sentence_combined.json contains 24 valid entries.
File: education/data2_sentence_combined.json contains 24 valid entries.
File: healthcare/data2_sentence_combined.json contains 24 valid entries.
File: ecommerce/data2_sentence_combined.json contains 24 valid entries.
File: finance/data2_sentence_combined.json contains 24 valid entries.
Successfully merged 120 entries into merged_dataset2.json


In [21]:
# import os
# import json

# def remove_empty_key_value_entries(root_dir):
#     """Remove entries with empty 'key_value' dict from all JSON files in subdirectories."""
#     for root, _, files in os.walk(root_dir):
#         for file in files:
#             if file.endswith(".json"):
#                 file_path = os.path.join(root, file)
#                 try:
#                     with open(file_path, 'r', encoding='utf-8') as f:
#                         data = json.load(f)

#                     if isinstance(data, list):
#                         cleaned_data = [
#                             entry for entry in data
#                             if not (isinstance(entry, dict) and entry.get("key_value") == {})
#                         ]

#                         if len(cleaned_data) != len(data):
#                             print(f"Cleaned {len(data) - len(cleaned_data)} entries from: {file_path}")

#                         # Save back the cleaned data
#                         with open(file_path, 'w', encoding='utf-8') as f:
#                             json.dump(cleaned_data, f, indent=4, ensure_ascii=False)
#                 except Exception as e:
#                     print(f"Error processing {file_path}: {e}")

# # Change to your root directory
# root_directory = "sources"
# remove_empty_key_value_entries(root_directory)


In [22]:
import json
from collections import Counter
from collections import defaultdict
from itertools import cycle

# Define the path to your JSON file
file_path = '/home/mushtari/nl2db/nl2db-main/data-generation/merged_dataset2.json'

# Open and load the JSON file
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

data = [entry for entry in data if entry.get("domain") != "ecommerce"]
finance_entries = [entry for entry in data if entry.get("domain") == "finance"]
non_finance_entries = [entry for entry in data if entry.get("domain") != "finance"]

# Keep only the first 24 finance entries
filtered_finance_entries = finance_entries[:24]

# Combine the filtered finance entries with the rest
data = non_finance_entries + filtered_finance_entries

domain_buckets = defaultdict(list)
for entry in data:
    domain = entry.get("domain")
    domain_buckets[domain].append(entry)

# Define the domains to interleave (and their order)
target_domains = ["finance", "tourism", "healthcare", "education"]

# Create a round-robin interleaving of entries
interleaved = []
domain_iters = {d: iter(domain_buckets[d]) for d in target_domains}

# Continue pulling one entry at a time from each domain in order
while any(domain_buckets[d] for d in target_domains):
    for d in target_domains:
        if domain_buckets[d]:  # Only append if there are entries left
            interleaved.append(domain_buckets[d].pop(0))

data = interleaved

# Count how many entries per domain
domain_counts = Counter(entry.get("domain") for entry in data)

# Print the counts
for domain, count in domain_counts.items():
    print(f"{domain}: {count}")

finance: 24
tourism: 24
healthcare: 24
education: 24


In [23]:
for entry in data:
    gtv = entry.get("key_value", {})
    for key in gtv:
        for i, pair in enumerate(gtv[key]):
            pair[0] = str(i + 1)  # Replace first value with "1", "2", ..., "5"


In [24]:
with open(file_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2)

In [None]:
import json

# Load the JSON files
with open('/home/mushtari/nl2db/nl2db-main/data-generation/merged_dataset.json', 'r') as f1, open('/home/mushtari/nl2db/nl2db-main/data-generation/merged_dataset2.json', 'r') as f2:
    data1 = json.load(f1)
    data2 = json.load(f2)

# Filter healthcare entries from file1
healthcare_entries_file1 = [entry for entry in data1 if entry.get('domain') == 'healthcare']

# Replace healthcare entries in file2 at their original positions
replaced_data2 = []
healthcare_idx = 0

for entry in data2:
    if entry.get('domain') == 'healthcare':
        if healthcare_idx < len(healthcare_entries_file1):
            replaced_data2.append(healthcare_entries_file1[healthcare_idx])
            healthcare_idx += 1
        else:
            # Optionally skip or keep original if not enough entries in file1
            replaced_data2.append(entry)
    else:
        replaced_data2.append(entry)

# Save the modified data2
with open('/home/mushtari/nl2db/nl2db-main/data-generation/merged_dataset3.json', 'w') as f:
    json.dump(replaced_data2, f, indent=2)
