In [2]:
import pandas as pd

In [None]:
    # load the files
    sf = pd.read_excel(sf_file)
    robo = pd.read_excel(robo_file)

    # Convert dates
    robo['Payment Date'] = pd.to_datetime(robo['Payment Date'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    sf['Finish Date'] = pd.to_datetime(sf['Finish Date'], format='%m/%d/%Y, %I:%M %p')

    # convert start_date and end_date
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    # filter based on date range
    sf = sf[(sf['Finish Date'] >= start_date) & (sf['Finish Date'] <= end_date)]
    robo = robo[(robo['Payment Date'] >= start_date) & (robo['Payment Date'] <= end_date)]

    # Standardize Transaction Names in both DataFrames
    def clean_transaction_name(x):
        """Return a canonical string form of a transaction name, or None.

        Numeric-looking values are collapsed to their integer text so that
        ``103157``, ``103157.0`` and ``'103157.0'`` all compare equal;
        anything else is returned as its stripped string. Missing values
        (``None`` / NaN / NaT) yield ``None``.
        """
        if pd.isna(x):
            return None
        # Integer inputs (Python or numpy ints) are converted exactly; going
        # through float() would corrupt identifiers above 2**53.
        if hasattr(x, '__index__'):
            return str(int(x))
        # Same exact path for strings made only of digits.
        if isinstance(x, str):
            try:
                return str(int(x))
            except ValueError:
                pass
        try:
            # Float-like values: non-integral values are truncated toward zero.
            return str(int(float(x)))
        except (TypeError, ValueError, OverflowError):
            # Not numeric (or inf/nan text): keep the text, minus padding.
            return str(x).strip()

    robo['Transaction Name Clean'] = robo['Transaction Name'].apply(clean_transaction_name)
    sf['Transaction Name Clean'] = sf['Transaction: Transaction Name'].apply(clean_transaction_name)

    # print shapes
    print('Shape of sf:', sf.shape)
    print('Shape of robo:', robo.shape)

    # merge DataFrames
    merged = pd.merge(
        sf,
        robo,
        how='outer',
        left_on='Transaction Name Clean',
        right_on='Transaction Name Clean',
        indicator=True,
        suffixes=('_SF', '_Robo')
    )

    # Rename columns
    merged = merged.rename(columns={
        'Roboticket ID': 'Roboticket ID SF',
        'Finish Date': 'Finish Date SF',
        'Payment Date': 'Payment Date Robo',
        'Transaction Name': 'Transaction Name Robo'
    })

    # Select relevant columns
    columns_to_output = [
        'Roboticket ID SF',
        'Transaction: Transaction Name',
        'Transaction Name Robo',
        'Transaction Name Clean',
        'Finish Date SF',
        'Payment Date Robo',
        '_merge'
    ]

    merged = merged[columns_to_output]

    # Label merge results
    merged['_merge'] = merged['_merge'].replace({
        'left_only': 'exists_in_sf',
        'right_only': 'exists_in_robo'
    })

    # Split subsets
    only_in_sf = merged[merged['_merge'] == 'exists_in_sf']
    only_in_robo = merged[merged['_merge'] == 'exists_in_robo']
    both = merged[merged['_merge'] == 'both']

    # Write to Excel
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        both.to_excel(writer, sheet_name='Both', index=False)
        only_in_sf.to_excel(writer, sheet_name='Exists_in_SF', index=False)
        only_in_robo.to_excel(writer, sheet_name='Exists_in_Robo', index=False)

    print(f"Results saved to {output_file}")

    return {
        'sf': sf,
        'robo': robo,
        'merged': merged,
        'only_in_robo': only_in_robo,
        'only_in_sf': only_in_sf,
        'both': both
    }

IndentationError: unexpected indent (379639732.py, line 4)

In [None]:
# Report the size of the Robo-only subset, export it to its own workbook,
# and leave the frame as the cell's displayed value.
# NOTE(review): `only_in_robo` is not assigned in any cell shown here; it
# must already exist in the kernel -- confirm where it comes from.
print(f"Shape of only_in_robo: {only_in_robo.shape}")
only_in_robo.to_excel(excel_writer='Only_in_Robo.xlsx', index=False)
only_in_robo

In [15]:
# Report the size of the Robo-only subset, export it to its own workbook,
# and leave the frame as the cell's displayed value.
# NOTE(review): `only_in_robo` is not assigned in any cell shown here; it
# must already exist in the kernel -- confirm where it comes from.
print(f"Shape of only_in_robo: {only_in_robo.shape}")
only_in_robo.to_excel(excel_writer='Only_in_Robo.xlsx', index=False)
only_in_robo

Shape of only_in_robo: (455237, 6)


Unnamed: 0,Roboticket ID SF,Transaction: Transaction Name,Finish Date SF,Payment Date,Transaction Name,_merge
0,,,NaT,2015-11-16,103157,exists_in_robo
1,,,NaT,2015-11-16,103157,exists_in_robo
2,,,NaT,2015-11-16,103157,exists_in_robo
3,,,NaT,2015-11-16,103157,exists_in_robo
4,,,NaT,2015-11-16,103157,exists_in_robo
...,...,...,...,...,...,...
519262,,,NaT,2024-04-24,,exists_in_robo
519263,,,NaT,2024-04-24,,exists_in_robo
519264,,,NaT,2024-04-24,,exists_in_robo
519265,,,NaT,2024-04-24,,exists_in_robo


In [None]:
# Report the size of the SF-only subset and display it.
# NOTE(review): `only_in_sf` is not assigned in any cell shown here; it must
# already exist in the kernel -- confirm where it comes from.
print(f"Shape of only_in_sf: {only_in_sf.shape}")
only_in_sf