In [1]:
%matplotlib inline

In [2]:
import os

import glob
import pycountry
import difflib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd

from fuzzywuzzy import process, fuzz
from shapely.geometry import Point

In [3]:
%store -r constants_unique_countries constants_fuzzy_matches

In [4]:
%reload_ext autoreload
from utils.cleanup_helpers import add_fuzzy_matches_key_column

In [5]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.expand_frame_repr', False)

# ## Unique countries list/countries of interest 
# unique_countries_df = pd.DataFrame({'Unique_Countries': constants_unique_countries})

In [6]:
## Adding Alpha 3 matches to the fuzzy dict

def alpha3_matches(country_dict):
    
    for country_name in country_dict.keys():
        country = pycountry.countries.get(name=country_name)
        if country:
            country_dict[country_name].append(country.alpha_3)
            
    return country_dict

In [7]:
alpha_dict = alpha3_matches(constants_fuzzy_matches)
alpha_dict

{'Afghanistan': ['Afghanistan', 'AFG'],
 'Albania': ['Albania', 'ALB'],
 'Algeria': ['Algeria', 'DZA'],
 'Argentina': ['Argentina', 'ARG'],
 'Armenia': ['Armenia', 'ARM'],
 'Australia': ['Australia', 'AUS'],
 'Austria': ['Austria', 'AUT'],
 'Azerbaijan': ['Azerbaijan*', 'Azerbaijan', 'AZE'],
 'Bahrain': ['Bahrain', 'BHR'],
 'Bangladesh': ['Bangladesh', 'BGD'],
 'Belarus': ['Belarus*', 'Belarus', 'BLR'],
 'Belgium': ['Belgium', 'BEL'],
 'Benin': ['Benin', 'BEN'],
 'Bolivia': ['Bolivia'],
 'Bosnia and Herzegovina': ['Bosnia and Herzegovina', 'BIH'],
 'Botswana': ['Botswana*', 'Botswana', 'BWA'],
 'Brazil': ['Brazil', 'BRA'],
 'Bulgaria': ['Bulgaria', 'BGR'],
 'Burkina Faso': ['Burkina Faso', 'BFA'],
 'Cambodia': ['Cambodia', 'KHM'],
 'Cameroon': ['Cameroon', 'CMR'],
 'Canada': ['Canada', 'CAN'],
 'Chad': ['Chad', 'Chad*', 'TCD'],
 'Chile': ['Chile', 'CHL'],
 'China': ['China', 'CHN'],
 'Colombia': ['Colombia', 'COL'],
 'Congo': ['Congo', 'Congo (Kinshasa)', 'Congo (Brazzaville)', 'COG'],

In [8]:
## Cleaning ot the counties name within the IMF GDP df

def clean_country_name(df):
    cleaned_names = []
    
    for i in df['Country']:
        if "," in i:
            i = i.split(",")[0].strip()
        cleaned_names.append(i)
        
    df['Country'] = cleaned_names
    
    return df

In [9]:
## Cleaning missing values

def drop_nodata(df):
    
    df2 = df.replace("no data", pd.NA).dropna()
    
    return df2

In [10]:
## Mean to get rid of duplicates - duplicate rows are removed, only one average row remains

def mean_and_drop_duplicates(df):
    
    if df['alpha_dict_key'].duplicated().any():
        duplicated_keys = df[df['alpha_dict_key'].duplicated(keep=False)]['alpha_dict_key'].unique()
        
        for key in duplicated_keys:
            dup_rows = df[df['alpha_dict_key'] == key]
            year_cols = [col for col in df.columns if str(col).isdigit()]
            mean_values = dup_rows[year_cols].mean()
      
            mean_values['Country'] = dup_rows['Country'].iloc[0]
            mean_values['alpha_dict_key'] = key
            
            df = df[df['alpha_dict_key'] != key]
            df = pd.concat([df, mean_values.to_frame().T], ignore_index=True)

    return df

In [11]:
cwd = os.getcwd()
data_path = '\\data\\gdp_data_pv\\imf-dm-export-20230810.xlsx'
path = cwd + data_path

gdp_df = pd.read_excel(path, usecols=[0, 36,37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49 ], skiprows=lambda x: x in [1])

gdp_df_2 = gdp_df[0:196]
gdp_df_2.rename(columns={"GDP per capita, current prices\n (U.S. dollars per capita)": "Country"}, inplace=True)

gdp_clear = clean_country_name(gdp_df_2)
gdp_matched = add_fuzzy_matches_key_column(gdp_clear, alpha_dict, 'alpha_dict_key')
gdp_filtered = drop_nodata(gdp_matched)
dup_removed = mean_and_drop_duplicates(gdp_filtered)
dup_removed.to_excel("gdp_output.xlsx")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdp_df_2.rename(columns={"GDP per capita, current prices\n (U.S. dollars per capita)": "Country"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Country'] = cleaned_names
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_key] = np.nan
