In [None]:
%matplotlib inline

In [37]:
import os

import glob
import pycountry
import difflib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import geopandas as gpd

from fuzzywuzzy import process, fuzz
from shapely.geometry import Point
from sklearn.linear_model import LogisticRegression

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# pd.set_option('display.max_rows', None)

In [None]:
## Unique countries list/countries of interest 

unique_countries = [
"Afghanistan","Albania","Algeria","Argentina","Armenia","Australia","Austria","Azerbaijan","Bahrain","Bangladesh","Belarus","Belgium","Benin","Bolivia","Bosnia and Herzegovina","Botswana","Brazil","Bulgaria","Burkina Faso","Cambodia","Cameroon",
"Canada","Chad","Chile","China","Colombia","Congo","Costa Rica","Croatia","Cyprus","Czech","Denmark","Dominican Republic","Ecuador","Egypt","El Salvador","Estonia","United Kingdom","Ethiopia","Finland","France","Gabon",
"Georgia","Germany","Ghana","Greece","Guatemala","Guinea","Honduras","Hong Kong","Hungary","Iceland","India","Indonesia","Iran","Iraq","Ireland","Israel","Italy","Ivory Coast","Jamaica","Japan","Jordan","Kazakhstan","Kenya","Kosovo","Kuwait","Kyrgyzstan",
"Latvia","Lebanon","Liberia","Libya","Lithuania","Luxembourg","Macedonia","Madagascar","Malawi","Malaysia","Mali","Malta","Mauritania","Mauritius","Mexico","Moldova","Mongolia","Montenegro","Morocco","Myanmar","Namibia","Nepal","Netherlands",
"New Zealand","Nicaragua","Niger","Nigeria","Norway","Pakistan","Panama","Paraguay","Peru","Philippines","Poland","Portugal","Romania","Russia","Rwanda","Saudi Arabia","Senegal","Serbia","Sierra Leone",
"Singapore","Slovakia","Slovenia","South Africa","South Korea","Spain","Sri Lanka","Sweden","Switzerland","Taiwan","Tajikistan","Tanzania","Thailand","Togo","Tunisia","Turkey","Turkmenistan","Uganda","Ukraine",
"United Arab Emirates", "United Kingdom","United States","Uruguay","Uzbekistan","Venezuela","Vietnam","Yemen","Zambia","Zimbabwe"]
unique_countries_df =pd.DataFrame({'Unique_Countries': unique_countries})

In [9]:
## Adding Alpha 3 matches to the fuzzy dict

def alpha3_matches(country_dict):
    
    for country_name in country_dict.keys():
        country = pycountry.countries.get(name=country_name)
        if country:
            country_dict[country_name].append(country.alpha_3)
            
    return country_dict

In [71]:
fuzzy_dict_ref = {
 'Afghanistan': ['Afghanistan'],
 'Albania': ['Albania'],
 'Algeria': ['Algeria'],
 'Argentina': ['Argentina'],
 'Armenia': ['Armenia'],
 'Australia': ['Australia'],
 'Austria': ['Austria'],
 'Azerbaijan': ['Azerbaijan', 'Azerbaijan*'],
 'Bahrain': ['Bahrain'],
 'Bangladesh': ['Bangladesh'],
 'Belarus': ['Belarus*', 'Belarus'],
 'Belgium': ['Belgium'],
 'Benin': ['Benin'],
 'Bolivia': ['Bolivia'],
 'Bosnia and Herzegovina': ['Bosnia and Herzegovina'],
 'Botswana': ['Botswana*', 'Botswana'],
 'Brazil': ['Brazil'],
 'Bulgaria': ['Bulgaria'],
 'Burkina Faso': ['Burkina Faso'],
 'Cambodia': ['Cambodia'],
 'Cameroon': ['Cameroon'],
 'Canada': ['Canada'],
 'Chad': ['Chad*', 'Chad'],
 'Chile': ['Chile'],
 'China': ['China'],
 'Colombia': ['Colombia'],
 'Congo': ['Congo', 'Congo (Brazzaville)', 'Congo (Kinshasa)'],
 'Costa Rica': ['Costa Rica'],
 'Croatia': ['Croatia'],
 'Cyprus': ['Cyprus', 'Northern Cyprus', 'North Cyprus*', 'North Cyprus'],
 'Czech': ['Czech Republic', 'Czechia'],
 'Denmark': ['Denmark'],
 'Dominican Republic': ['Dominican Republic'],
 'Ecuador': ['Ecuador'],
 'Egypt': ['Egypt'],
 'El Salvador': ['El Salvador'],
 'Estonia': ['Estonia'],
 'United Kingdom': ['United Kingdom'],
 'Ethiopia': ['Ethiopia'],
 'Finland': ['Finland'],
 'France': ['France'],
 'Gabon': ['Gabon'],
 'Georgia': ['Georgia'],
 'Germany': ['Germany'],
 'Ghana': ['Ghana'],
 'Greece': ['Greece'],
 'Guatemala': ['Guatemala*', 'Guatemala'],
 'Guinea': ['Guinea'],
 'Honduras': ['Honduras'],
 'Hong Kong S.A': ['Hong Kong', 'Hong Kong S.A.R. of China', 'Hong Kong S.A.R., China', 'Hong Kong SAR'],
 'Hungary': ['Hungary'],
 'Iceland': ['Iceland'],
 'India': ['India'],
 'Indonesia': ['Indonesia'],
 'Iran': ['Iran'],
 'Iraq': ['Iraq'],
 'Ireland': ['Ireland'],
 'Israel': ['Israel'],
 'Italy': ['Italy'],
 'Ivory Coast': ['Ivory Coast'],
 'Jamaica': ['Jamaica'],
 'Japan': ['Japan'],
 'Jordan': ['Jordan'],
 'Kazakhstan': ['Kazakhstan'],
 'Kenya': ['Kenya'],
 'Kosovo': ['Kosovo'],
 'Kuwait': ['Kuwait*', 'Kuwait'],
 'Kyrgyzstan': ['Kyrgyzstan'],
 'Latvia': ['Latvia'],
 'Lebanon': ['Lebanon'],
 'Liberia': ['Liberia', 'Liberia*'],
 'Libya': ['Libya', 'Libya*'],
 'Lithuania': ['Lithuania'],
 'Luxembourg': ['Luxembourg', 'Luxembourg*'],
 'Macedonia': ['Macedonia', 'North Macedonia', 'North Macedonia '],
 'Madagascar': ['Madagascar*', 'Madagascar'],
 'Malawi': ['Malawi'],
 'Malaysia': ['Malaysia'],
 'Mali': ['Mali'],
 'Malta': ['Malta'],
 'Mauritania': ['Mauritania', 'Mauritania*'],
 'Mauritius': ['Mauritius'],
 'Mexico': ['Mexico'],
 'Moldova': ['Moldova'],
 'Mongolia': ['Mongolia'],
 'Montenegro': ['Montenegro'],
 'Morocco': ['Morocco'],
 'Myanmar': ['Myanmar'],
 'Namibia': ['Namibia'],
 'Nepal': ['Nepal'],
 'Netherlands': ['Netherlands'],
 'New Zealand': ['New Zealand'],
 'Nicaragua': ['Nicaragua'],
 'Niger': ['Niger*', 'Niger'],
 'Nigeria': ['Nigeria'],
 'Norway': ['Norway'],
 'Pakistan': ['Pakistan'],
 'Panama': ['Panama'],
 'Paraguay': ['Paraguay'],
 'Peru': ['Peru'],
 'Philippines': ['Philippines'],
 'Poland': ['Poland'],
 'Portugal': ['Portugal'],
 'Romania': ['Romania'],
 'Russia': ['Russia', 'Russian Federation'],
 'Rwanda': ['Rwanda*', 'Rwanda'],
 'Saudi Arabia': ['Saudi Arabia'],
 'Senegal': ['Senegal'],
 'Serbia': ['Serbia'],
 'Sierra Leone': ['Sierra Leone'],
 'Singapore': ['Singapore'],
 'Slovakia': ['Slovakia', 'Slovak Republic'],
 'Slovenia': ['Slovenia'],
 'South Africa': ['South Africa'],
 'South Korea': ['South Korea','Korea'],
 'Spain': ['Spain'],
 'Sri Lanka': ['Sri Lanka'],
 'Sweden': ['Sweden'],
 'Switzerland': ['Switzerland'],
 'Taiwan': ['Taiwan', 'Taiwan Province of China'],
 'Tajikistan': ['Tajikistan'],
 'Tanzania': ['Tanzania'],
 'Thailand': ['Thailand'],
 'Togo': ['Togo'],
 'Tunisia': ['Tunisia'],
 'Turkey': ['Turkey', 'Türkiye'],
 'Turkmenistan': ['Turkmenistan', 'Turkmenistan*'],
 'Uganda': ['Uganda'],
 'Ukraine': ['Ukraine'],
 'United Arab Emirates': ['United Arab Emirates'],
 'United States': ['United States'],
 'Uruguay': ['Uruguay'],
 'Uzbekistan': ['Uzbekistan'],
 'Venezuela': ['Venezuela'],
 'Vietnam': ['Vietnam'],
 'Yemen': ['Yemen*', 'Yemen'],
 'Zambia': ['Zambia'],
 'Zimbabwe': ['Zimbabwe']}

In [72]:
alpha_dict = alpha3_matches(fuzzy_dict_ref)
alpha_dict

{'Afghanistan': ['Afghanistan', 'AFG'],
 'Albania': ['Albania', 'ALB'],
 'Algeria': ['Algeria', 'DZA'],
 'Argentina': ['Argentina', 'ARG'],
 'Armenia': ['Armenia', 'ARM'],
 'Australia': ['Australia', 'AUS'],
 'Austria': ['Austria', 'AUT'],
 'Azerbaijan': ['Azerbaijan', 'Azerbaijan*', 'AZE'],
 'Bahrain': ['Bahrain', 'BHR'],
 'Bangladesh': ['Bangladesh', 'BGD'],
 'Belarus': ['Belarus*', 'Belarus', 'BLR'],
 'Belgium': ['Belgium', 'BEL'],
 'Benin': ['Benin', 'BEN'],
 'Bolivia': ['Bolivia'],
 'Bosnia and Herzegovina': ['Bosnia and Herzegovina', 'BIH'],
 'Botswana': ['Botswana*', 'Botswana', 'BWA'],
 'Brazil': ['Brazil', 'BRA'],
 'Bulgaria': ['Bulgaria', 'BGR'],
 'Burkina Faso': ['Burkina Faso', 'BFA'],
 'Cambodia': ['Cambodia', 'KHM'],
 'Cameroon': ['Cameroon', 'CMR'],
 'Canada': ['Canada', 'CAN'],
 'Chad': ['Chad*', 'Chad', 'TCD'],
 'Chile': ['Chile', 'CHL'],
 'China': ['China', 'CHN'],
 'Colombia': ['Colombia', 'COL'],
 'Congo': ['Congo', 'Congo (Brazzaville)', 'Congo (Kinshasa)', 'COG'],

In [75]:
## Cleaning ot the counties name within the IMF GDP df

def clean_country_name(df):
    cleaned_names = []
    
    for i in df['Country']:
        if "," in i:
            i = i.split(",")[0].strip()
        cleaned_names.append(i)
        
    df['Country'] = cleaned_names
    
    return df

In [76]:
## Adding a new column containing the fuzzy match to the country column 

def add_fuzzy_matches_key_column(df, alpha_dict):

    df['alpha_dict_key'] = np.nan
    
    for key, values in alpha_dict.items():
        for value in values:
            df.loc[df['Country'] == value, 'alpha_dict_key'] = key
    
    return df

In [79]:
## Cleaning missing values

def drop_nodata(df):
    
    df2 = df.replace("no data", pd.NA).dropna()
    
    return df2

In [86]:
## Mean to get rid of duplicates - duplicate rows are removed, only one average row remains

def mean_and_drop_duplicates(df):
    
    if df['alpha_dict_key'].duplicated().any():
        duplicated_keys = df[df['alpha_dict_key'].duplicated(keep=False)]['alpha_dict_key'].unique()
        
        for key in duplicated_keys:
            dup_rows = df[df['alpha_dict_key'] == key]
            year_cols = [col for col in df.columns if str(col).isdigit()]
            mean_values = dup_rows[year_cols].mean()
            
            mean_values['Country'] = dup_rows['Country'].iloc[0]
            mean_values['alpha_dict_key'] = key
            
            df = df[df['alpha_dict_key'] != key]
            df = df.append(mean_values, ignore_index=True)

    return df

In [91]:
gdp_df = pd.read_excel(r'C:/Users/user/OneDrive/Documents/coding/Python/Data Science/Regression Models/06. Regression-Models-Lab/imf-dm-export-20230810.xlsx' , usecols=[0, 36,37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49 ], skiprows=lambda x: x in [1])

gdp_df_2 = gdp_df[0:196]
gdp_df_2.rename(columns={"GDP per capita, current prices\n (U.S. dollars per capita)": "Country"}, inplace=True)

gdp_clear = clean_country_name(gdp_df_2)
gdp_matched = add_fuzzy_matches_key_column(gdp_clear, alpha_dict)
gdp_filtered = drop_nodata(gdp_matched)

dup_removed = mean_and_drop_duplicates(gdp_filtered)
dup_removed.to_excel("gdp_output.xlsx")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdp_df_2.rename(columns={"GDP per capita, current prices\n (U.S. dollars per capita)": "Country"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Country'] = cleaned_names
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['alpha_dict_key'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind