In [86]:
%matplotlib inline

In [111]:
import os

import glob
import difflib
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from fuzzywuzzy import process, fuzz
from shapely.geometry import Point

In [112]:
# Restore `constants_unique_countries` from IPython's %store database (-r = refresh).
# The variable must have been saved beforehand with `%store constants_unique_countries`
# (presumably from another notebook — verify); on a fresh kernel this cell is the
# only place it comes from. It is used below as the list of names to fuzzy-match.
%store -r constants_unique_countries

In [113]:
# (Re)load IPython's autoreload extension so edits to project modules can be picked up.
# NOTE(review): no `%autoreload <mode>` line is visible in this cell — confirm that a
# mode (e.g. `%autoreload 2`) is set somewhere, otherwise modules may not be reloaded.
%reload_ext autoreload
# Project-local helpers (utils/cleanup_helpers.py); going by their names they load the
# per-year data frames and normalise their columns — see that module for details.
from utils.cleanup_helpers import rename_and_retain_cols_for_all_dfs, import_all_dfs

In [114]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.expand_frame_repr', False)

# ## Unique countries list/countries of interest 
# unique_countries_df = pd.DataFrame({'Unique_Countries': constants_unique_countries})

In [115]:
## Extracting all unique country names across all years into a de-duplicated list

def extract_all_unique_countries(dfs_dict):
    """Collect every distinct value of the 'Country' column across all frames.

    Parameters
    ----------
    dfs_dict : dict
        Mapping whose values support ``frame['Country'].tolist()`` (e.g. pandas
        DataFrames). The keys are not used by this function.

    Returns
    -------
    list
        The de-duplicated country values. The order is that of a Python set,
        i.e. arbitrary; sort the result if a stable order is needed.
    """
    distinct_names = set()

    for yearly_frame in dfs_dict.values():
        # Feeding each column straight into the set removes duplicates both
        # within a single frame and across frames.
        distinct_names.update(yearly_frame['Country'].tolist())

    return list(distinct_names)

In [116]:
## Finding fuzzy matches between country names and unique countries

def find_fuzzy_matches(extracted_unique_countries, unique_countries,
                       default_threshold=92, keyword_thresholds=None):
    """Map each query name to the candidate names that fuzzy-match it.

    Every name in ``extracted_unique_countries`` is scored against all of
    ``unique_countries`` with ``fuzzywuzzy.process.extract``; candidates whose
    score reaches the applicable threshold are kept.

    Parameters
    ----------
    extracted_unique_countries : iterable of str
        Names to look up; each becomes a key of the result. (Despite the
        parameter name, the call in this notebook passes the canonical
        country list here.)
    unique_countries : list of str
        Candidate names each query is scored against.
    default_threshold : int, optional
        Minimum score (inclusive) for names that match no keyword. Default 92.
    keyword_thresholds : dict[str, int], optional
        Ordered mapping of substring -> minimum score. The first substring
        found in the query name decides its threshold. Defaults to the looser
        cut-offs previously hard-coded for names with several spellings.

    Returns
    -------
    dict[str, list[str]]
        Query name -> matching candidates, in the order returned by
        ``process.extract``.
    """
    if keyword_thresholds is None:
        # Order matters: it mirrors the original if/elif chain, first hit wins.
        keyword_thresholds = {
            "Czech": 65,
            "Hong Kong": 65,
            "Taiwan": 75,
            "Congo": 75,
            "Cyprus": 75,
            "Macedonia": 75,
        }

    fuzzy_matches = {}

    for country in extracted_unique_countries:
        # Pick the cut-off of the first keyword contained in the name,
        # falling back to the strict default when none applies.
        threshold = next(
            (cutoff for keyword, cutoff in keyword_thresholds.items()
             if keyword in country),
            default_threshold,
        )

        # limit=None scores every candidate instead of only the top few.
        matches = process.extract(country, unique_countries, limit=None)
        fuzzy_matches[country] = [match[0] for match in matches if match[1] >= threshold]

    return fuzzy_matches

In [117]:
# range() excludes its end value, so this covers 2015 through 2022.
years = range(2015, 2023)
cwd = os.getcwd()
# Build the data sub-path with the platform separator instead of hard-coded
# backslashes. On Windows this is the same string as before
# ('\\data\\happiness_score_2015_2022'); on other systems it now resolves
# to a real directory path instead of a single odd file name.
data_path = os.sep + os.path.join('data', 'happiness_score_2015_2022')
path = cwd + data_path

countries_dfs_dict = import_all_dfs(years, path)
renamed_dfs_dicts = rename_and_retain_cols_for_all_dfs(countries_dfs_dict)
all_unique_contries = extract_all_unique_countries(renamed_dfs_dicts)
# The stored canonical names are the queries and the names extracted from the
# data are the candidates, so each key of the result is a canonical name and
# its value lists the spellings found in the yearly files.
fuzzy_matches = find_fuzzy_matches(constants_unique_countries, all_unique_contries)

fuzzy_matches

{'Afghanistan': ['Afghanistan'],
 'Albania': ['Albania'],
 'Algeria': ['Algeria'],
 'Argentina': ['Argentina'],
 'Armenia': ['Armenia'],
 'Australia': ['Australia'],
 'Austria': ['Austria'],
 'Azerbaijan': ['Azerbaijan', 'Azerbaijan*'],
 'Bahrain': ['Bahrain'],
 'Bangladesh': ['Bangladesh'],
 'Belarus': ['Belarus*', 'Belarus'],
 'Belgium': ['Belgium'],
 'Benin': ['Benin'],
 'Bolivia': ['Bolivia'],
 'Bosnia and Herzegovina': ['Bosnia and Herzegovina'],
 'Botswana': ['Botswana*', 'Botswana'],
 'Brazil': ['Brazil'],
 'Bulgaria': ['Bulgaria'],
 'Burkina Faso': ['Burkina Faso'],
 'Cambodia': ['Cambodia'],
 'Cameroon': ['Cameroon'],
 'Canada': ['Canada'],
 'Chad': ['Chad*', 'Chad'],
 'Chile': ['Chile'],
 'China': ['China'],
 'Colombia': ['Colombia'],
 'Congo': ['Congo', 'Congo (Brazzaville)', 'Congo (Kinshasa)'],
 'Costa Rica': ['Costa Rica'],
 'Croatia': ['Croatia'],
 'Cyprus': ['Cyprus', 'North Cyprus*', 'Northern Cyprus', 'North Cyprus'],
 'Czech': ['Czech Republic', 'Czechia'],
 'Denmark