<a href="https://colab.research.google.com/github/LGBFBenchmarking/ChangeLogMaker/blob/main/Automate_differences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Test code for creating LGBF change log for Monthly refreshes. The below code find the difference between 2 dataframes/datasets.

#Load files

##new_df is new data including new year's data
##diff_df has old data (no changes or new data for the new year)

In [1]:
import pandas as pd
import numpy as np

def get_data(url): # this function gets data stored as csv file from a given url
  df = pd.read_csv(url)
  return df


In [2]:
url_old = "https://raw.githubusercontent.com/LGBFBenchmarking/ChangeLogMaker/refs/heads/main/LGBF_data_April_2025.csv"
url_new = "https://raw.githubusercontent.com/LGBFBenchmarking/ChangeLogMaker/refs/heads/main/LGBF_Data_May2025.csv"
old_df = get_data(url_old)
new_df = get_data(url_new)

In [3]:
old_df.dtypes

Unnamed: 0,0
Code,object
LocalAuthority,object
Year,object
Indicator,float64
Numerator,float64
Denominator,float64


In [None]:
new_df.dtypes

Unnamed: 0,0
Code,object
LocalAuthority,object
Year,object
Indicator,float64
Numerator,float64
Denominator,float64


In [None]:
new_df['Indicator'] = new_df['Indicator'].map("{:.2f}".format)
new_df['Numerator'] = new_df['Numerator'].map('{:.2f}'.format)
new_df['Denominator'] = new_df['Denominator'].map('{:.2f}'.format)

In [None]:
old_df['Indicator'] = old_df['Indicator'].map('{:.2f}'.format)
old_df['Numerator'] = old_df['Numerator'].map('{:.2f}'.format)
old_df['Denominator'] = old_df['Denominator'].map('{:.2f}'.format)

In [4]:
old_df = old_df.round(4)
new_df = new_df.round(4)

In [None]:
old_df.head()

Unnamed: 0,Code,LocalAuthority,Year,Indicator,Numerator,Denominator
0,C&L01,Aberdeen City,2010-11,0.33,637.0,1922292.0
1,C&L01,Aberdeen City,2011-12,0.76,1554.0,2045051.0
2,C&L01,Aberdeen City,2012-13,3.64,7883.0,2163756.0
3,C&L01,Aberdeen City,2013-14,3.48,7726.0,2222588.0
4,C&L01,Aberdeen City,2014-15,3.02,7517.0,2487138.0


In [None]:
new_df.head()

Unnamed: 0,Code,LocalAuthority,Year,Indicator,Numerator,Denominator
0,C&L01,Aberdeen City,2010-11,0.33,637.0,1922292.0
1,C&L01,Aberdeen City,2011-12,0.76,1554.0,2045051.0
2,C&L01,Aberdeen City,2012-13,3.64,7883.0,2163756.0
3,C&L01,Aberdeen City,2013-14,3.48,7726.0,2222588.0
4,C&L01,Aberdeen City,2014-15,3.02,7517.0,2487138.0


In [5]:
df1 = old_df
df2 = new_df

In [6]:
newdf = df2[~df2.astype(str).apply(tuple, 1).isin(df1.astype(str).apply(tuple, 1))]

In [7]:
newdf

Unnamed: 0,Code,LocalAuthority,Year,Indicator,Numerator,Denominator
6117,CHN08a,Aberdeen City,2023-24,3998.6065,14347.0000,69.0
6131,CHN08a,Aberdeenshire,2023-24,2591.1681,10914.0000,81.0
6145,CHN08a,Angus,2023-24,6224.7596,10358.0000,32.0
6159,CHN08a,Argyll & Bute,2023-24,2536.6587,4221.0000,32.0
6173,CHN08a,Clackmannanshire,2023-24,4575.4438,3093.0000,13.0
...,...,...,...,...,...,...
38437,CHN23,Scotland,2023-24,0.1750,,
38479,CLIM03,Scotland,2023-24,28.6673,157386.5526,5490100.0
38486,CLIM04,Scotland,2023-24,48.1484,264339.3936,5490100.0
38493,CLIM05,Scotland,2023-24,56.1712,308385.6782,5490100.0


In [None]:
from google.colab import files
newdf. to_excel('newdf.xlsx')
files.download('newdf.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
def dataframe_difference(df1, df2, which=None):
    """Find rows which are different between two DataFrames."""
    comparison_df = df1.merge(
        df2,
        indicator=True,
        how='outer'
    )
    if which is None:
        diff_df = comparison_df[comparison_df['_merge'] != 'both']  # Identify data changes/updates
    else:
        diff_df = comparison_df[comparison_df['_merge'] == which]
    return diff_df

In [9]:
def getColChangedName(row):
    colsChanged = []
    for c in df1.columns.values:
        if row[c+"_diff"] == True:
            colsChanged.append(c)
    return ", ".join(colsChanged)

In [10]:
df1 = old_df.set_index(['Code', 'LocalAuthority', 'Year'])
df2 = new_df.set_index(['Code','LocalAuthority', 'Year'])

diffs = df1.merge(df2, left_index=True, right_index=True, suffixes=('_old', '_new'))

diff_cols = [c+"_diff" for c in df1.columns.values]

for c in df1.columns.values:
    diffs[c+"_diff"] = diffs[c+"_old"] != diffs[c+"_new"]

diffs["Record_Changed"] = diffs[diff_cols].sum(axis=1)/len(diff_cols) > 0


diffs = diffs[diffs["Record_Changed"] == 1]
diffs["Cols_Changed"] = diffs.apply(lambda row: getColChangedName(row), axis=1)

diffs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Indicator_old,Numerator_old,Denominator_old,Indicator_new,Numerator_new,Denominator_new,Indicator_diff,Numerator_diff,Denominator_diff,Record_Changed,Cols_Changed
Code,LocalAuthority,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
C&L03,Clackmannanshire,2014-15,,,,,,,True,True,True,True,"Indicator, Numerator, Denominator"
C&L03,Clackmannanshire,2015-16,,,,,,,True,True,True,True,"Indicator, Numerator, Denominator"
C&L03,Clackmannanshire,2016-17,,,,,,,True,True,True,True,"Indicator, Numerator, Denominator"
C&L03,Clackmannanshire,2017-18,,123.0,,,123.0,,True,False,True,True,"Indicator, Denominator"
C&L03,Clackmannanshire,2018-19,,86.0,,,86.0,,True,False,True,True,"Indicator, Denominator"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
SW07,Scotland,2019-20,0.818,,,0.818,,,False,True,True,True,"Numerator, Denominator"
SW07,Scotland,2020-21,0.825,,,0.825,,,False,True,True,True,"Numerator, Denominator"
SW07,Scotland,2021-22,0.758,,,0.758,,,False,True,True,True,"Numerator, Denominator"
SW07,Scotland,2022-23,0.750,,,0.750,,,False,True,True,True,"Numerator, Denominator"


In [None]:
from google.colab import files
diffs. to_excel('diffs.xlsx')
files.download('diffs.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#Use diff and new_Data to get the Change log