In [3]:
import pickle
import pandas as pd

# Load and filter function

def load_and_filter(path, start_date):
    """
    Load a pickle file and keep only the entries dated on or after start_date.

    Supported payloads:
      * pandas.DataFrame -- if it has a 'date' column, that column is parsed
        with ``pd.to_datetime`` and used as the index (the column itself is
        kept); otherwise the existing index is compared directly.
      * dict -- each key is parsed with ``pd.to_datetime``; keys that cannot
        be parsed are skipped, and the remaining entries are kept under their
        original (unparsed) keys.

    Parameters:
        path: location of the pickle file to read.
        start_date: lower bound (inclusive) compared against the dates,
            e.g. a ``pd.Timestamp``.

    Returns:
        The filtered DataFrame or dict. The unpickled object is not mutated.

    Raises:
        ValueError: if the unpickled object is neither a DataFrame nor a dict.

    NOTE: unpickling executes arbitrary code embedded in the file, so this
    must only be pointed at trusted files.
    """
    # Use a context manager so the handle is closed even if unpickling fails.
    with open(path, 'rb') as handle:
        obj = pickle.load(handle)

    # DataFrame case
    if isinstance(obj, pd.DataFrame):
        df = obj
        if 'date' in df.columns:
            # set_index returns a new frame, leaving the loaded object as-is.
            df = df.set_index(pd.to_datetime(df['date']))
        return df.loc[df.index >= start_date]

    # Dictionary case
    if isinstance(obj, dict):
        result = {}
        for key, value in obj.items():
            try:
                ts = pd.to_datetime(key)
            except Exception:
                # Deliberate best effort: keys that are not dates are ignored.
                continue
            if ts >= start_date:
                result[key] = value
        return result

    raise ValueError(f"Unsupported object type: {type(obj)}")

# Comparison functions

def compare_dataframes(df1, df2):
    """
    Print whether two DataFrames are equal according to
    ``pd.testing.assert_frame_equal``.

    On a mismatch, the assertion message raised by pandas is printed after a
    header line. Nothing is returned.
    """
    try:
        pd.testing.assert_frame_equal(df1, df2)
    except AssertionError as mismatch:
        print("Differences in DataFrames from 2017 onward:")
        print(mismatch)
    else:
        # Reached only when the assertion above did not fire.
        print("DataFrames are identical from 2017 onward.")


def _values_differ(left, right):
    """Return True when two dictionary values should be reported as a mismatch."""
    pandas_types = (pd.DataFrame, pd.Series, pd.Index)
    if isinstance(left, pandas_types):
        # `!=` on pandas objects is element-wise (and raises on differing
        # labels), so use .equals for a single yes/no answer.
        return not (isinstance(right, pandas_types) and left.equals(right))
    if isinstance(right, pandas_types):
        return True
    try:
        return bool(left != right)
    except (ValueError, TypeError):
        # Element-wise comparison results (e.g. numpy arrays) have no single
        # truth value; fall back to a whole-array equality check.
        import numpy as np
        return not np.array_equal(left, right)


def compare_dicts(d1, d2):
    """
    Print a report of how two dictionaries differ.

    Reports keys present in only one of the dictionaries and, for shared keys,
    the entries whose values differ. Values may be plain Python objects,
    pandas objects or numpy arrays. Nothing is returned.

    Parameters:
        d1: dictionary loaded from the first file.
        d2: dictionary loaded from the second file.
    """
    keys1, keys2 = set(d1), set(d2)
    only1 = keys1 - keys2
    only2 = keys2 - keys1
    common = keys1 & keys2
    diffs = {k: (d1[k], d2[k]) for k in common if _values_differ(d1[k], d2[k])}

    if not (only1 or only2 or diffs):
        print("Dictionaries are identical from 2017 onward.")
        return

    print("Differences in dictionaries from 2017 onward:")
    if only1:
        print(f"Keys only in first file: {only1}")
    if only2:
        print(f"Keys only in second file: {only2}")
    if diffs:
        print("Mismatched entries:")
        for k, (v1, v2) in diffs.items():
            print(f"  {k}: {v1} != {v2}")

# Main execution
if __name__ == '__main__':
    # Only entries dated on or after this timestamp are compared.
    start_date = pd.Timestamp('2017-01-01')
    file1, file2 = 'ro_views.pkl', 'bayes_factor_views_v2_2014.pkl'

    ro_data = load_and_filter(file1, start_date)
    bayes_data = load_and_filter(file2, start_date)

    # Pick the comparison routine matching the payload type; both files must
    # hold the same kind of object.
    if all(isinstance(loaded, pd.DataFrame) for loaded in (ro_data, bayes_data)):
        compare_dataframes(ro_data, bayes_data)
    elif all(isinstance(loaded, dict) for loaded in (ro_data, bayes_data)):
        compare_dicts(ro_data, bayes_data)
    else:
        raise ValueError(f"Mismatched types: {type(ro_data)}, {type(bayes_data)}")


Dictionaries are identical from 2017 onward.
