In [2]:
!pip install pandas-datareader

Collecting pandas-datareader
  Downloading pandas_datareader-0.10.0-py3-none-any.whl.metadata (2.9 kB)
Downloading pandas_datareader-0.10.0-py3-none-any.whl (109 kB)
Installing collected packages: pandas-datareader
Successfully installed pandas-datareader-0.10.0


In [3]:
from pandas_datareader import wb
import numpy as np
import pandas as pd
pd.set_option("display.width", 0)

# World Bank indicator codes to download, keyed to a human-readable label.
indicator_labels = {
    "NE.EXP.GNFS.CD": "Exports of goods and services (current US$)",
    "NE.IMP.GNFS.CD": "Imports of goods and services (current US$)",
    "NV.AGR.TOTL.CD": "Agriculture, forestry, and fishing, value added (current US$)",
    "NY.GDP.MKTP.CD": "GDP (current US$)",
    "NE.RSB.GNFS.CD": "External balance on goods and services (current US$)",
}
names = list(indicator_labels)  # codes only, in the order declared above

# Fetch the 2010 values of every indicator for all World Bank entities, then
# move the index levels into ordinary columns.
df_indexed = wb.download(country="all", indicator=names, start=2010, end=2010)
df = df_indexed.reset_index()

# Keep individual countries only: discard entries whose region is
# "Aggregates", then drop every row that still has a missing value.
countries = wb.get_countries()
is_country = countries["region"].ne("Aggregates")
non_aggregates = countries.loc[is_country, "name"]
is_listed = df["country"].isin(non_aggregates)
df_nonagg = df.loc[is_listed].dropna()

# Extract the indicator vector for each country.
# All vectors are built in one pass as float64 arrays. Iterating with
# iterrows() yields rows that mix the text columns with the numbers, so slicing
# the indicators out of such a row produces an object-dtype array instead of a
# numeric one.
indicator_matrix = df_nonagg[names].to_numpy(dtype=float)
vectors = dict(zip(df_nonagg["country"], indicator_matrix))

# Compute the Euclidean and cosine distances from the target country to every
# country in `vectors` (the target itself is included in the comparison).
target = "Australia"
if target not in vectors:
    # The target may have been removed upstream (for example, dropped for a
    # missing value); report that explicitly rather than a bare key error.
    raise KeyError(f"target country {target!r} has no indicator vector")

# The target's vector and its norm are the same on every iteration, so they
# are looked up and computed once, before the loop.
vecA = vectors[target]
normA = np.linalg.norm(vecA)

euclid = {}
cosine = {}
for country, vecB in vectors.items():
    cos = (vecA @ vecB) / (normA * np.linalg.norm(vecB))  # cosine similarity
    euclid[country] = np.linalg.norm(vecA - vecB)   # Euclidean distance
    cosine[country] = 1 - cos                       # cosine distance

# Combine both distance measures into one table indexed by country and show
# the first rows of each ranking, smallest distance first.
df_distance = pd.DataFrame({"euclid": euclid, "cos": cosine})

print("Closest by Euclidean distance:")
ranked_by_euclid = df_distance.sort_values(by="euclid")
print(ranked_by_euclid.head())
print()
print("Closest by Cosine distance:")
ranked_by_cosine = df_distance.sort_values(by="cos")
print(ranked_by_cosine.head())

# Show the underlying indicator rows for a fixed set of countries.
# NOTE(review): this list is hard-coded; update it if `target` changes.
print()
print("Detail metrics:")
detail_countries = ["Mexico", "Colombia", "Australia"]
is_detail_row = df_nonagg["country"].isin(detail_countries)
print(df_nonagg[is_detail_row])

Closest by Euclidean distance:
                 euclid           cos
Australia  0.000000e+00  2.220446e-16
Mexico     1.413113e+11  6.959944e-03
Spain      3.452526e+11  3.009764e-03
Turkiye    3.825043e+11  3.576441e-03
Indonesia  4.107555e+11  7.403175e-03

Closest by Cosine distance:
                    euclid           cos
Australia     0.000000e+00  2.220446e-16
Colombia      9.006747e+11  1.691545e-03
South Africa  7.538171e+11  2.300379e-03
Cuba          1.133493e+12  2.400544e-03
Italy         1.095467e+12  2.592310e-03

Detail metrics:
       country  year  NE.EXP.GNFS.CD  NE.IMP.GNFS.CD  NV.AGR.TOTL.CD  \
59   Australia  2010    2.274270e+11    2.381005e+11    2.532545e+10   
91    Colombia  2010    4.681628e+10    5.135131e+10    1.812061e+10   
176     Mexico  2010    3.207660e+11    3.344563e+11    3.417981e+10   

     NY.GDP.MKTP.CD  NE.RSB.GNFS.CD  
59     1.148838e+12   -1.067345e+10  
91     2.864985e+11   -4.535024e+09  
176    1.105424e+12   -1.369028e+10  
