# AI-ML-Pipeline-for-arranging-csv-xlsx

### Author: Mohit Janbandhu

## Purpose: Map a data file (data2) onto the column schema of a format file (data1) and export the aligned output as CSV or Excel

In [4]:
import io
import difflib
import numpy as np
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, FileLink

In [9]:

# Helpers
# -------------------------

def align_data(df_schema, df_data, cutoff=0.6):
    """Rearrange ``df_data`` so that its columns follow ``df_schema``'s layout.

    For every column of ``df_schema`` the closest-named column of ``df_data``
    is looked up with ``difflib.get_close_matches``. A matched column is copied
    into the result under the schema's name; an unmatched one is filled with
    NaN.

    Parameters
    ----------
    df_schema : pandas.DataFrame
        Frame whose column labels (and their order) define the output layout.
        Only the labels are used, never the rows.
    df_data : pandas.DataFrame
        Frame that supplies the values.
    cutoff : float, optional
        Minimum similarity ratio in [0, 1] forwarded to
        ``difflib.get_close_matches``. Defaults to 0.6.

    Returns
    -------
    aligned : pandas.DataFrame
        One column per schema column, in schema order, indexed like
        ``df_data`` (so the row count is kept even when nothing matches).
    mapping : dict
        ``{schema_column: matched_data_column}``, with ``None`` for schema
        columns that found no match.
    """
    # difflib can only compare strings, so match on str(label) and translate
    # the winner back to the real label. setdefault keeps the first label when
    # two labels share the same text (e.g. 1 and "1").
    labels_by_text = {}
    for label in df_data.columns:
        labels_by_text.setdefault(str(label), label)
    candidates = list(labels_by_text)

    columns = {}
    mapping = {}
    for col in df_schema.columns:
        match = difflib.get_close_matches(str(col), candidates, n=1, cutoff=cutoff)
        if match:
            source = labels_by_text[match[0]]
            columns[col] = df_data[source]
            mapping[col] = source
        else:
            columns[col] = np.nan
            mapping[col] = None

    # Passing the index explicitly broadcasts the NaN placeholders to the full
    # length of df_data instead of collapsing to an empty frame.
    aligned = pd.DataFrame(columns, index=df_data.index)
    return aligned, mapping


def preview_and_download(aligned, mapping, fmt):
    """Print the column mapping, preview the aligned data and save it to disk.

    Parameters
    ----------
    aligned : pandas.DataFrame
        Frame to preview and export.
    mapping : dict
        ``{schema_column: matched_data_column_or_None}``; printed one pair
        per line.
    fmt : str
        ``"CSV"`` writes ``outputdata.csv``; any other value writes
        ``outputdata.xlsx``.

    The file is written relative to the current working directory and a
    ``FileLink`` to it is displayed.
    """
    # Show mapping table
    print("\n===== Column Mapping (data1 -> data2) =====")
    for schema_col, source_col in mapping.items():
        # Compare against None explicitly: a falsy label such as 0 is still a
        # real match. str() keeps the width spec valid for non-string labels.
        source_label = "None" if source_col is None else source_col
        print(f"{str(schema_col):25s} <-- {source_label}")

    # Show preview of aligned data
    print("\n===== Preview (first 5 rows) =====")
    display(aligned.head())

    # Save output: pick the path and the writer in a single branch
    if fmt == "CSV":
        out_path = "outputdata.csv"
        aligned.to_csv(out_path, index=False)
    else:
        out_path = "outputdata.xlsx"
        aligned.to_excel(out_path, index=False)

    # Provide download link
    print(f"\nFile saved as {out_path}. Click below to download:")
    display(FileLink(out_path))





In [10]:
# Jupyter UI

# Single-file pickers: data1 provides the target schema, data2 the content.
fu1 = widgets.FileUpload(
    description="Upload data1",
    accept=".csv,.xlsx,.xls",
    multiple=False,
)
fu2 = widgets.FileUpload(
    description="Upload data2",
    accept=".csv,.xlsx,.xls",
    multiple=False,
)

# Export format selector; CSV is preselected.
fmt = widgets.Dropdown(
    description="Output",
    options=["CSV", "Excel"],
    value="CSV",
)

# Trigger button and the output area that receives everything the run prints.
run_btn = widgets.Button(
    description="Run Pipeline",
    button_style="primary",
)
out_area = widgets.Output()

def on_run_clicked(_):
    """Button callback: load both uploads, align data2 to data1 and export.

    The argument (the clicked button) is ignored. All output is routed into
    ``out_area``, which is cleared at the start of every run. If either upload
    widget is empty a hint is printed and nothing else happens.
    """

    def first_upload(uploader):
        """Return ``(filename, content)`` of the first file in ``uploader``.

        ipywidgets 7 exposes ``FileUpload.value`` as ``{filename: {...}}``,
        ipywidgets 8 as a tuple of dicts carrying ``name`` and ``content``;
        both shapes are accepted.
        """
        uploaded = uploader.value
        if isinstance(uploaded, dict):
            filename, payload = next(iter(uploaded.items()))
            return filename, payload["content"]
        payload = uploaded[0]
        return payload["name"], payload["content"]

    def read_any(file_bytes, filename):
        """Parse raw upload bytes as CSV when the name ends in .csv, else as Excel."""
        buffer = io.BytesIO(file_bytes)
        if filename.lower().endswith(".csv"):
            return pd.read_csv(buffer)
        return pd.read_excel(buffer)

    with out_area:
        out_area.clear_output()
        if len(fu1.value) == 0 or len(fu2.value) == 0:
            print("Please upload both files.")
            return

        # Read uploaded files (format is detected from the file name)
        key1, content1 = first_upload(fu1)
        key2, content2 = first_upload(fu2)
        df1 = read_any(content1, key1)
        df2 = read_any(content2, key2)

        print("Files loaded")
        print(f"data1 (schema) shape: {df1.shape}")
        print(f"data2 (content) shape: {df2.shape}")

        # Align data
        aligned, mapping = align_data(df1, df2)
        preview_and_download(aligned, mapping, fmt.value)

# Register the callback so that clicking the button runs the pipeline.
run_btn.on_click(on_run_clicked)
# Render the controls in order, followed by the area the run prints into.
display(fu1, fu2, fmt, run_btn, out_area)

FileUpload(value={}, accept='.csv,.xlsx,.xls', description='Upload data1')

FileUpload(value={}, accept='.csv,.xlsx,.xls', description='Upload data2')

Dropdown(description='Output', options=('CSV', 'Excel'), value='CSV')

Button(button_style='primary', description='Run Pipeline', style=ButtonStyle())

Output()

In [7]:
# Tell the reader which of the two input paths (widgets or file paths) to use.
print("If you see upload widgets below, use them. Otherwise, set file paths in the fallback section.")

# Placeholders, all initialised to None in this cell.
upload_data1 = upload_data2 = out_format = None

# NOTE(review): IPYW_AVAILABLE, _read_any, infer_schema, build_column_mapping,
# align_to_schema, print_report and _write_output are not defined in the
# visible part of this notebook - confirm a cell defines them before this one
# runs, otherwise this cell raises NameError on a fresh kernel.
# NOTE(review): this cell rebinds fu1, fu2, fmt, run_btn, out_area and
# on_run_clicked, names that the visible notebook also defines in another
# cell; whichever cell ran last wins.
if IPYW_AVAILABLE:
    # Single-file upload pickers, output-format dropdown, run button and the
    # output area that the callback prints into.
    fu1 = widgets.FileUpload(accept=".csv,.xlsx,.xls", multiple=False, description="Upload data1")
    fu2 = widgets.FileUpload(accept=".csv,.xlsx,.xls", multiple=False, description="Upload data2")
    fmt = widgets.Dropdown(options=["CSV", "Excel"], value="CSV", description="Output")
    run_btn = widgets.Button(description="Run Pipeline", button_style="primary")
    out_area = widgets.Output()

    def on_run_clicked(_):
        """Button callback: read both uploads, align data2 to data1, write the result.

        The argument is ignored. Everything printed goes to ``out_area``,
        which is cleared first.
        """
        with out_area:
            out_area.clear_output()
            # Nothing to do until both pickers hold a file.
            if len(fu1.value) == 0 or len(fu2.value) == 0:
                print("Please upload both files.")
                return
            # Take the first entry of each upload; .keys() treats
            # FileUpload.value as a mapping keyed by file name.
            key1 = list(fu1.value.keys())[0]
            key2 = list(fu2.value.keys())[0]
            f1 = fu1.value[key1]
            f2 = fu2.value[key2]
            # presumably _read_any parses the raw bytes based on the file name - TODO confirm
            df1 = _read_any(f1["content"], key1)
            df2 = _read_any(f2["content"], key2)

            print("Files loaded")
            print(f"data1 shape: {df1.shape}")
            print(f"data2 shape: {df2.shape}")

            # Schema comes from data1, column mapping from both column lists
            # (cutoff 0.75), and data2 is then aligned to that schema.
            # NOTE(review): the semantics of these helpers are not visible here.
            schema = infer_schema(df1)
            mapping = build_column_mapping(list(df1.columns), list(df2.columns), cutoff=0.75)
            aligned, rep = align_to_schema(df2, schema, mapping, fill_missing_with=np.nan)

            print_report(schema, mapping, rep)

            # Relative output file name, chosen from the dropdown value.
            if fmt.value == "CSV":
                out_path = "outputdata.csv"
            else:
                out_path = "outputdata.xlsx"

            # The value returned by _write_output is reported as the written location.
            abs_path = _write_output(aligned, out_path)
            print(f"\nOutput written to: {abs_path}")

    # Register the callback on the button.
    run_btn.on_click(on_run_clicked)

    # Render the controls followed by the output area.
    display(fu1, fu2, fmt, run_btn, out_area)


If you see upload widgets below, use them. Otherwise, set file paths in the fallback section.


FileUpload(value={}, accept='.csv,.xlsx,.xls', description='Upload data1')

FileUpload(value={}, accept='.csv,.xlsx,.xls', description='Upload data2')

Dropdown(description='Output', options=('CSV', 'Excel'), value='CSV')

Button(button_style='primary', description='Run Pipeline', style=ButtonStyle())

Output()