In [1]:
import pandas as pd

template = '2022_04_27-World_Bank_Group'<br>
template = '2022_09_16-World_Bank_Group'<br>
template = '2022_12_22-World_Bank_Group'

In [2]:
# Show up to 200 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 200)

In [3]:
# Every snapshot is split into three tables that share one name prefix.
_TABLE_SUFFIXES = ('total', 'male', 'female')

# Table names of the earlier snapshot.
template = '2022_09_16-World_Bank_Group'
ls_previous = [f'{template}-{suffix}' for suffix in _TABLE_SUFFIXES]
print(ls_previous)

# Table names of the later snapshot (`template` keeps this value afterwards).
template = '2022_12_22-World_Bank_Group'
ls_next = [f'{template}-{suffix}' for suffix in _TABLE_SUFFIXES]
print(ls_next)

['2022_09_16-World_Bank_Group-total', '2022_09_16-World_Bank_Group-male', '2022_09_16-World_Bank_Group-female']
['2022_12_22-World_Bank_Group-total', '2022_12_22-World_Bank_Group-male', '2022_12_22-World_Bank_Group-female']


In [4]:
def pretify_df(df, round_precision=3):
    """Return a trimmed, renamed and rounded copy of a raw table.

    Steps, in order:
      1. drop the rows whose index label is one of the grouping labels below
         (every label must be present in the index, otherwise `drop` raises
         KeyError);
      2. replace long territory names in the index with short ones;
      3. keep only the columns from the 4th through the 3rd-from-last
         (presumably this strips leading metadata columns and trailing
         non-data columns -- confirm against the CSV layout);
      4. convert the remaining column labels with `int`, e.g. '1960' -> 1960;
      5. round the values to `round_precision` decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Table indexed by territory name.
    round_precision : int, default 3
        Number of decimals passed to `DataFrame.round`.

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` itself is not modified.
    """
    # Index labels of rows that are removed before any comparison.
    grouping_rows = [
        'Euro area',
        'IBRD only',
        'IDA & IBRD total',
        'IDA total',
        'IDA blend',
        'IDA only',
        'OECD members',
        'Europe & Central Asia (excluding high income)',
        'East Asia & Pacific (excluding high income)',
        'Latin America & Caribbean (excluding high income)',
        'Middle East & North Africa (excluding high income)',
        'Sub-Saharan Africa (excluding high income)',
        'East Asia & Pacific (IDA & IBRD countries)',
        'Europe & Central Asia (IDA & IBRD countries)',
        'Middle East & North Africa (IDA & IBRD countries)',
        'Sub-Saharan Africa (IDA & IBRD countries)',
        'Latin America & the Caribbean (IDA & IBRD countries)',
        'South Asia (IDA & IBRD)',
        'Heavily indebted poor countries (HIPC)',
        'Fragile and conflict affected situations',
        'Least developed countries: UN classification',
        'Early-demographic dividend',
        'Late-demographic dividend',
        'Pre-demographic dividend',
        'Post-demographic dividend',
        'Lower middle income',
        'Low & middle income',
        'Upper middle income',
    ]

    # Long index label -> short label used in the rest of the notebook.
    short_names = {
        'United States': 'USA',
        'Russian Federation': 'Russia',
        'Korea, Rep.': 'South Korea',
        'Egypt, Arab Rep.': 'Egypt',
        'Congo, Dem. Rep.': 'Congo, DR',
        'Congo, Rep.': 'Congo, Rep.',
        'Yemen, Rep.': 'Yemen',
        'Micronesia, Fed. Sts.': 'Micronesia',
        'Gambia, The': 'Gambia',
        'Iran, Islamic Rep.': 'Iran',
        "Korea, Dem. People's Rep.": 'North Korea',
        'Syrian Arab Republic': 'Syria',
        'Venezuela, RB': 'Venezuela',
        'Kyrgyz Republic': 'Kyrgyzstan',
        'Lao PDR': 'Laos',
        'Czech Republic': 'Czechia',
        'Slovak Republic': 'Slovakia',
        'Bahamas, The': 'Bahamas',
        'Brunei Darussalam': 'Brunei',
        'Turkiye': 'Turkey',
    }

    without_groups = df.drop(index=grouping_rows)
    renamed = without_groups.rename(index=short_names)
    data_only = renamed.iloc[:, 3:-2]
    # Column labels go from strings to integers, like '1960' -> 1960.
    int_labelled = data_only.rename(columns=int)
    return int_labelled.round(round_precision)

In [5]:
def _print_abbreviated(items, edge=3, limit=7):
    """Print `items` as a numbered list, eliding the middle of long lists.

    Lists longer than `limit` show only the first and the last `edge` entries
    with a '...' line between them. The numbering continues across the gap
    (1 .. 2*edge); it is not the position of the entry in the full list.
    """
    is_long = len(items) > limit
    shown = items[:edge] + items[-edge:] if is_long else items
    for i, item in enumerate(shown, start=1):
        print(f"{i:2}. {item}")
        if is_long and i == edge:
            print('...')


def find_changes(table_previous, table_next, round_precision=3):
    """Compare two snapshots of the same table and report what changed.

    Both tables are read from `data/<name>.csv` (the first 4 lines are
    skipped, the first column becomes the index), cleaned with `pretify_df`
    and compared cell by cell after rounding to `round_precision` decimals.
    A summary is printed along the way.

    Parameters
    ----------
    table_previous, table_next : str
        File names (without directory and `.csv` extension) of the earlier
        and the later snapshot.
    round_precision : int, default 3
        Number of decimals kept before comparing.

    Returns
    -------
    tuple
        `(df_diff, ls_empty, ls_diff, ls_kept)`:
        the result of `DataFrame.compare`, the territories without any data
        in both snapshots, the territories with at least one differing cell,
        and the remaining territories.

    Raises
    ------
    AssertionError
        If the two snapshots do not list the same territories in the same
        order.
    """
    # load info from csv-files
    # A forward slash works as a path separator on Windows and POSIX alike,
    # and avoids the invalid '\{' escape sequence inside the f-string.
    df_previous = pretify_df(
        pd.read_csv(f'data/{table_previous}.csv', skiprows=4, index_col=0),
        round_precision=round_precision,
    )
    df_next = pretify_df(
        pd.read_csv(f'data/{table_next}.csv', skiprows=4, index_col=0),
        round_precision=round_precision,
    )

    # ensure that both csv-files contain the same list of regions
    ls_regions_previous = df_previous.index.to_list()
    ls_regions_next = df_next.index.to_list()
    assert ls_regions_previous == ls_regions_next, "Found differences in the lists of regions"

    print('Number of territories in compared databases:', len(ls_regions_previous), '\n')

    # find regions without any data, and the intersection of these regions
    # (order follows the later snapshot)
    ls_empty_previous = df_previous[df_previous.isnull().all(axis=1)].index.to_list()
    ls_empty_next = df_next[df_next.isnull().all(axis=1)].index.to_list()
    empty_previous_lookup = set(ls_empty_previous)
    ls_empty = [record for record in ls_empty_next if record in empty_previous_lookup]

    print('Number of territories without any data:', len(ls_empty))
    print(*ls_empty, sep=', ', end='\n\n')

    # remove the regions that have no data in both dataFrames
    df_previous = df_previous.drop(index=ls_empty)
    df_next = df_next.drop(index=ls_empty)

    # refresh the list of territories for further processing
    ls_regions_previous = df_previous.index.to_list()
    ls_regions_next = df_next.index.to_list()
    assert ls_regions_previous == ls_regions_next, "Something wrong after intermediate transformations"

    # compare dataFrames; only rows/columns with at least one difference remain
    df_diff = df_previous.compare(df_next)

    # print list of countries where changes were made
    ls_diff = df_diff.index.to_list()
    print('Number of territories, where changes were made:', len(ls_diff))
    _print_abbreviated(ls_diff)
    print()

    # print list of countries without changes
    changed_lookup = set(ls_diff)
    ls_kept = [region for region in ls_regions_previous if region not in changed_lookup]
    print('Number of territories without noticeable changes:', len(ls_kept))
    _print_abbreviated(ls_kept)

    return df_diff, ls_empty, ls_diff, ls_kept

# Run the comparison for the total / male / female tables (positions 0..2 of
# the name lists), printing a separator line between consecutive reports.
round_precision = 3
comparisons = []
for position in range(3):
    if position:
        print('\n———')
    comparisons.append(
        find_changes(ls_previous[position], ls_next[position], round_precision=round_precision)
    )

(
    (df_diff_total, ls_empty_total, ls_diff_total, ls_kept_total),
    (df_diff_male, ls_empty_male, ls_diff_male, ls_kept_male),
    (df_diff_female, ls_empty_female, ls_diff_female, ls_kept_female),
) = comparisons

Number of territories in compared databases: 238 

Number of territories without any data: 7
Andorra, American Samoa, Not classified, Monaco, Northern Mariana Islands, Nauru, San Marino

Number of territories, where changes were made: 192
 1. Aruba
 2. Africa Eastern and Southern
 3. Afghanistan
...
 4. South Africa
 5. Zambia
 6. Zimbabwe

Number of territories without noticeable changes: 39
 1. Australia
 2. Austria
 3. Belgium
...
 4. Sint Maarten (Dutch part)
 5. Seychelles
 6. USA

———
Number of territories in compared databases: 238 

Number of territories without any data: 7
Andorra, American Samoa, Not classified, Monaco, Northern Mariana Islands, Nauru, San Marino

Number of territories, where changes were made: 193
 1. Aruba
 2. Africa Eastern and Southern
 3. Afghanistan
...
 4. South Africa
 5. Zambia
 6. Zimbabwe

Number of territories without noticeable changes: 38
 1. Australia
 2. Austria
 3. Belgium
...
 4. Sint Maarten (Dutch part)
 5. Seychelles
 6. USA

———
Number o

In [6]:
# The three tables must agree on which territories have no data at all.
assert (
    ls_empty_total == ls_empty_male and ls_empty_male == ls_empty_female
), "Lists of territories without any data are not identical"

In [7]:
# A territory counts as changed when it changed in at least one of the three
# tables, so the per-table sets are combined with a union (`|`).
# Note: `a and b and c` on sets would NOT combine them -- it evaluates to a
# single operand (the last one when all are non-empty).
set_diff_total = set(ls_diff_total)
set_diff_male = set(ls_diff_male)
set_diff_female = set(ls_diff_female)
ls_diff = sorted(set_diff_total | set_diff_male | set_diff_female)

print("TOTAL: Number of territories, where changes were made:", len(ls_diff))

TOTAL: Number of territories, where changes were made: 193


In [8]:
# A territory counts as unchanged only when it is unchanged in all three
# tables, so the per-table sets are combined with an intersection (`&`).
# Note: `a and b and c` on sets would NOT combine them -- it evaluates to a
# single operand (the last one when all are non-empty).
set_kept_total = set(ls_kept_total)
set_kept_male = set(ls_kept_male)
set_kept_female = set(ls_kept_female)
ls_kept = sorted(set_kept_total & set_kept_male & set_kept_female)

print("TOTAL: Number of territories, without noticeable changes:", len(ls_kept))

TOTAL: Number of territories, without noticeable changes: 38


<br />
<br />
<br />
<hr>

In [9]:
# explore concrete case
print([territory for territory in ls_diff_male   if territory not in ls_diff_total])
print([territory for territory in ls_diff_female if territory not in ls_diff_total])
print([territory for territory in ls_diff if territory not in ls_diff_total])

['North America']
['North America']
['North America']


In [10]:
# Stack the male and female difference rows of one territory for a
# side-by-side look; cells without a difference are shown as empty strings.
territory = 'North America'
df = pd.concat(
    [frame.loc[[territory]] for frame in (df_diff_male, df_diff_female)]
).fillna('')
df.index = ['male', 'female']
df

Unnamed: 0_level_0,1960,1960,1961,1961,1962,1962,1963,1963,1964,1964,1965,1965,1966,1966,1967,1967,1968,1968,1969,1969,1970,1970,1971,1971,1972,1972,1973,1973,1974,1974,1975,1975,1976,1976,1977,1977,1978,1978,1979,1979,1980,1980,1981,1981,1982,1982,1983,1983,1984,1984,1985,1985,1986,1986,1987,1987,1988,1988,1989,1989,1990,1990,1991,1991,1992,1992,1993,1993,1994,1994,1995,1995,1996,1996,1997,1997,1998,1998,1999,1999,2000,2000,2001,2001,2002,2002,2003,2003,2004,2004,2005,2005,2006,2006,2007,2007,2008,2008,2009,2009,2010,2010,2011,2011,2012,2012,2013,2013,2014,2014,2015,2015,2016,2016,2017,2017,2018,2018,2019,2019,2020,2020
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other
male,66.752,66.753,,,,,,,,,,,66.895,66.896,67.182,67.183,66.294,66.296,67.025,67.027,67.312,67.314,67.614,67.615,67.603,67.605,67.802,67.803,68.351,68.352,68.917,68.918,69.218,69.219,,,69.73,69.731,70.13,70.131,,,70.558,70.559,,,71.168,71.169,71.278,71.279,71.298,71.299,71.408,71.409,71.609,71.61,,,71.921,71.922,,,,,,,,,,,,,,,,,,,,,74.346,74.347,74.556,74.557,,,74.786,74.787,,,75.286,75.287,,,,,75.887,75.888,,,76.488,76.489,76.609,76.61,,,,,76.831,76.832,76.661,76.662,,,,,,,,,,
female,,,73.676,73.675,,,73.499,73.498,,,,,74.036,74.035,74.425,74.424,74.255,74.254,74.549,74.548,74.844,74.843,75.153,75.152,75.237,75.236,75.445,75.444,75.996,75.995,,,76.876,76.875,77.274,77.273,77.398,77.397,77.876,77.875,77.528,77.527,,,78.226,78.225,78.245,78.244,,,,,78.365,78.364,,,,,,,,,79.098,79.097,,,,,,,,,,,,,,,,,,,79.734,79.735,,,,,,,,,,,,,80.836,80.835,,,81.247,81.246,81.347,81.348,,,,,,,,,,,81.392,81.393,81.505,81.506,81.687,81.688,80.58,80.581


<br />
<br />
<br />

In [11]:
# Header line with the count, then the whole list on the next line.
print(f"Territories without noticeable changes ({len(ls_kept)}):", ls_kept, sep='\n')

Territories without noticeable changes (38):
['Australia', 'Austria', 'Belgium', 'Bermuda', 'Bulgaria', 'Canada', 'Cayman Islands', 'Curacao', 'Czechia', 'Denmark', 'Dominica', 'Faroe Islands', 'Finland', 'France', 'Greenland', 'Hungary', 'Israel', 'Italy', 'Japan', 'Latvia', 'Liechtenstein', 'Lithuania', 'Marshall Islands', 'Netherlands', 'New Zealand', 'Norway', 'Palau', 'Poland', 'Serbia', 'Seychelles', 'Sint Maarten (Dutch part)', 'Slovenia', 'Spain', 'St. Kitts and Nevis', 'Sweden', 'Switzerland', 'USA', 'United Kingdom']
