In [1]:
# %%
from datetime import datetime

import pandas as pd
from dateutil import parser

In [5]:
# %%
# Cleaned per-site CSV inputs, keyed by the label that later becomes the
# 'source' column. Order here is the order the sources appear in the merge.
files = dict([
    ("nrweuropa", "/Users/kiranmulawad/AI-Funding/2_preprocessing/data/funding_nrweuropa_cleaned.csv"),
    ("foerderdatenbank", "/Users/kiranmulawad/AI-Funding/2_preprocessing/data/funding_foerderdatenbank_cleaned.csv"),
    ("isb", "/Users/kiranmulawad/AI-Funding/2_preprocessing/data/funding-isb-cleaned.csv"),
])



In [None]:
# %%
# Column layout of the merged table; 'source' comes first.
final_columns = [
    "source",
    "name",
    "description",
    "domain",
    "eligibility",
    "location",
    "amount",
    "procedure",
    "contact",
    "deadline",
    "url",
]



In [11]:
# Accumulator for the per-source DataFrames (one entry per input file)
dataframes = list()

In [12]:
# Process each file and attach source.
# The list is reset here so that re-running this cell rebuilds it from
# scratch instead of appending a second copy of every source.
dataframes = []
for site, path in files.items():
    df = pd.read_csv(path, dtype=str)  # Read with all columns as string
    df["source"] = site  # Tag every row with the site it came from

    # Ensure all required columns exist; missing ones get a placeholder text
    for col in final_columns:
        if col not in df.columns:
            df[col] = f"{col} information not found"

    # Keep only the required columns, in a consistent order (source first)
    ordered_cols = ["source"] + [col for col in final_columns if col != "source"]
    df = df[ordered_cols]

    dataframes.append(df)


In [13]:
# %%
# Stack the per-source frames into a single table; the original row labels
# are discarded and replaced by a fresh 0..n-1 index.
final_df = pd.concat(objs=dataframes, ignore_index=True)



In [14]:
# %%
# Format deadline field to a standard readable date (e.g. February 05, 2025)
def standardize_deadline(date_str):
    """Return a deadline as ``"<Month> <DD>, <YYYY>"`` text.

    Args:
        date_str: Raw deadline value; normally a string, or a missing value
            (None / NaN) as recognised by ``pd.isna``. Other scalars are
            converted with ``str()`` before parsing.

    Returns:
        ``"No deadline provided"`` for missing, empty or whitespace-only
        input; the formatted date (day zero-padded by ``%d``) when the value
        parses as a date; otherwise the input unchanged.
    """
    if pd.isna(date_str):
        return "No deadline provided"
    text = str(date_str).strip()
    if not text:
        return "No deadline provided"

    try:
        try:
            # ISO 8601 values are year-first and unambiguous, so read them
            # strictly before the day-first parser gets a chance to treat
            # the month as the day.
            date_obj = datetime.fromisoformat(text)
        except ValueError:
            # Everything else is interpreted day-first (e.g. 05.02.2025).
            date_obj = parser.parse(text, dayfirst=True)
        return date_obj.strftime("%B %d, %Y")
    except (ValueError, OverflowError, TypeError):
        # Only "this is not a usable date" errors are swallowed; anything
        # else (e.g. a NameError from a missing import) should surface.
        return date_str  # Leave unchanged if it can't be parsed

# Normalise every deadline value in place, one element at a time
raw_deadlines = final_df["deadline"]
final_df["deadline"] = raw_deadlines.apply(standardize_deadline)



In [15]:
# %%
# Write the merged table to CSV (row index omitted) and report where it went.
# The path is relative, so the file lands under the current working directory.
output_path = "data/merged_funding_data.csv"
final_df.to_csv(path_or_buf=output_path, index=False)

print(f"✅ Merged CSV saved to: {output_path}")


✅ Merged CSV saved to: data/merged_funding_data.csv
