In [6]:
# file loading
import pandas as pd
import os
import re
from IPython.display import display

print(f"Raw data path: {os.path.abspath(raw_data_path)}")
# Fix: Use correct relative path to data folder (go up two levels)
raw_data_path = os.path.join('..', '..', 'data', '01_raw')
processed_data_path = os.path.join('..', '..', 'data', '02_processed')

# ensuring processed directory exists
os.makedirs(processed_data_path,exist_ok=True)

# These lines were already correct, but verify they are still there
file_a=os.path.join(raw_data_path,'ayu-sat-table-a.csv')
file_b=os.path.join(raw_data_path,'ayu-sat-table-b.csv')
file_c=os.path.join(raw_data_path,'ayu-sat-table-c.csv')

column_maps={
    'Code':'sat_code',
    'Word':'ayurvedic_term',
    'Short Defination':'short_definition',
    'Long Defination':'long_definition'
}

def load_and_tag(file_path,tag):
    """loads a file, renames columns, and adds a source tag."""
    print(f"Loading: {file_path}")
    df=pd.read_csv(file_path,encoding='latin1')
    df=df.rename(columns={k:v for k,v in column_maps.items() if k in df.columns})

    # ensuring all the required columns exists
    for col in column_maps.values():
        if col not in df.columns:
            df[col]=''
    
    df['source_dataset']=tag
    return df[list(column_maps.values())+['source_dataset']]

# loading datasets
df_a=load_and_tag(file_a,'SAT_A')
df_b=load_and_tag(file_b,'SAT_B')
df_c=load_and_tag(file_c,'SAT_C')

print(f"\nLoaded Rows: A({len(df_a)}), B({len(df_b)}), C({len(df_c)})")

Raw data path: c:\Users\hp\Desktop\nlp_project\nlp_project\data\01_raw
Loading: ..\..\data\01_raw\ayu-sat-table-a.csv
Loading: ..\..\data\01_raw\ayu-sat-table-b.csv
Loading: ..\..\data\01_raw\ayu-sat-table-c.csv

Loaded Rows: A(319), B(514), C(176)


In [7]:
# --- 1. MERGE (CONSOLIDATION) ---
# Stack the three dataframes vertically
master_df = pd.concat([df_a, df_b, df_c], ignore_index=True)
print(f"Total rows after stacking: {len(master_df)}")

# --- 2. STANDARDIZATION ---
master_df = master_df.fillna('') # Replace NaNs with empty strings

# Apply cleaning: lowercase, strip whitespace, and basic deduplication
master_df['sat_code'] = master_df['sat_code'].astype(str).str.strip()
master_df['ayurvedic_term_clean'] = master_df['ayurvedic_term'].astype(str).str.lower().str.strip()
master_df['short_definition_clean'] = master_df['short_definition'].astype(str).str.lower().str.strip()

# Deduplication based on the core unique concept
master_df.drop_duplicates(
    # Use code and the cleaned term/definition as the unique identifier
    subset=['sat_code', 'ayurvedic_term_clean', 'short_definition_clean'], 
    inplace=True
)
print(f"Total unique concepts after deduplication: {len(master_df)}")

Total rows after stacking: 1009
Total unique concepts after deduplication: 1009


In [9]:
# --- 3. CREATE SEARCHABLE TERMS LIST ---

def create_query_list(row):
    """Gathers all potential search inputs into a list for a single concept."""
    # Use a set to automatically handle duplicates and ensure uniqueness
    terms = {row['ayurvedic_term_clean']} 
    
    if row['short_definition_clean']:
        terms.add(row['short_definition_clean'])
        
    return list(terms)

master_df['searchable_terms'] = master_df.apply(create_query_list, axis=1)

# --- 4. EXPLODE THE DATA ---
# This is the key step: it creates a new row for every searchable term, 
# keeping the SAT code mapped to each potential query input.
final_mapping_df = master_df.explode('searchable_terms').rename(
    columns={'searchable_terms': 'query_input'}
)

# Final cleanup and selection of columns
final_mapping_df = final_mapping_df[['sat_code', 'query_input', 'source_dataset']]
final_mapping_df = final_mapping_df[final_mapping_df['query_input'] != '']
# Remove duplicates that might have resulted from identical term/definition
final_mapping_df.drop_duplicates(subset=['sat_code', 'query_input'], inplace=True)

# --- 5. SAVE FOR MANUAL ENRICHMENT ---
OUTPUT_FILE = os.path.join(processed_data_path, 'master_concept_map_to_enrich.csv')
final_mapping_df.to_csv(OUTPUT_FILE, index=False)

print(f"\nFinal searchable query terms created: {len(final_mapping_df)}")
print(f"File saved to: {OUTPUT_FILE}")
print("\n--- Next Step: Manually enrich this file ---")
display(final_mapping_df.head())


Final searchable query terms created: 2017
File saved to: ..\..\data\02_processed\master_concept_map_to_enrich.csv

--- Next Step: Manually enrich this file ---


Unnamed: 0,sat_code,query_input,source_dataset
0,SAT-A,mulabuta-sabdah,SAT_A
0,SAT-A,fundamental terms,SAT_A
1,SAT-A.1,science of life,SAT_A
1,SAT-A.1,ayurvedah,SAT_A
2,SAT-A.2,ayuh,SAT_A
