<a href="https://colab.research.google.com/github/MarciaFG/EconComplex/blob/main/brown_fitness_algo_servedio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Economic complexity algorithm by Vito Servedio
Implementation Marcia R. Ferreira
"""
#! /usr/bin/python

from math import fabs,sqrt
import numpy
import random
import pandas as pd
from sys import exit, argv
import os.path
import sys
print(sys.path)
os.chdir("YOUR PATH") 

EPS = 1e-6 #convergence parameter

# the following values will be used if there is no Mcp file
# and a stochastic matrix will be generated:
erdos_p = .1
C = 100 # nr of authors
P = 100 # nr of journals

# it is possible to input the erdos param from command line
# to generate the stoch matrix
if(len(argv)>1):
   erdos_p = float((argv[1]))

# if the file matrix for time t exists than load it otherwise create a stochastic matrix
if(not os.path.exists("./mat_1980")):
   Mcp = numpy.zeros((C,P))
   for i in range(C):
      for j in range(P):
        Mcp[i,j] = 1.0 if(random.random()<erdos_p) else 0.0
else:
   print ("Mcp exists, downloading. "),
   Mcp = numpy.loadtxt("./mat_1980")
   C = len(Mcp)
   P = len(Mcp[0])
   print ("Dimensions: ",C,P)

Mcp = Mcp.transpose()

# save the stochastic Mcp in case of its use
file = "Mcp"
if(len(argv)>1):
   file = file[:]+("_p=%f" % erdos_p)
if(not os.path.exists(file)): numpy.savetxt(file,Mcp,"%g")

# Start of iterations

# initialize all to one
Simplicity = [1.0]*P
Fitness = [1.0]*C
Simplicity0 = Simplicity[:]
Fitness0 = Fitness[:]

count = 0
while(True):
   count += 1
   for c in range(C):
      Fitness[c] = 1.0
      for p in range(P):
        Fitness[c] += Mcp[c,p]/Simplicity0[p]

   for p in range(P):
      Simplicity[p] = 1.0
      for c in range(C):
        Simplicity[p] += Mcp[c,p]/Fitness0[c]

   # check the error for the first country only
   err = fabs(Fitness[0]-Fitness0[0])
   #err = fabs(Fitness[0]-Fitness0[0])/Fitness[0] # in case of relative error check
   # if for the first country the error is small, then check all others to find the maximum
   if(err<EPS):
      for c in range(C):
        err = max(err, fabs(Fitness[c]-Fitness0[c]))
      for p in range(p):
        err = max(err, fabs(Simplicity[p]-Simplicity0[p]))
      if(err<EPS):
        print ("Iteration: ", count, " Final error: ", err)
        break

   print ("Iteration: ", count, " Error: ", err)

   Fitness0 = Fitness[:]
   Simplicity0 = Simplicity[:]

# End of iterations

# Output section follows
file1 = "Fitness_1980.dat"
file2 = "Quality_1980.dat"

# if there was an erdos param as input create special file names
if(len(argv)>1):
   file1 = file1[:]+("_p=%f" % erdos_p)
   file2 = file2[:]+("_p=%f" % erdos_p)

out = open(file1,"w")
out.write("# Iterations:"+str(count)+" Final error:"+str(err)+"\n")

count = 0
for i in Fitness:
   count += 1
   out.write(str(count)+" "+str(i)+"\n")
out.close()

out = open(file2,"w")
count = 0
for i in Simplicity:
   count += 1
   # we print the complexity of products rather than their simplicity
   q = 1.0/(i-1.0)
   out.write(str(count)+" "+str(q)+"\n")
out.close()

exit("end")


In [1]:
# this code move to the shared drive contents
import os
from google.colab import drive
drive.mount('/content/drive')
#os.listdir('drive/MyDrive/Group5/data')
os.listdir(os.getcwd())

os.chdir('drive/MyDrive/Group5/data')

os.listdir()

Mounted at /content/drive


['colors.csv',
 'ECUtils.py',
 'employment',
 'synthetic',
 'network',
 'economic',
 'science',
 'technology',
 'export',
 'Copia di ECUtils.py']

In [5]:
from math import fabs,sqrt
import numpy
import random
import pandas as pd
from sys import exit, argv
import os.path
import sys
print(sys.path)

['/content', '/env/python', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.10/dist-packages/IPython/extensions', '/root/.ipython']


In [6]:
import os

dir_path = '/content/drive/MyDrive/Group5/data/technology/countries'

csv_files = [pos_csv for pos_csv in os.listdir(dir_path) if pos_csv.endswith('-mat_ah.csv')]

data = {}
for file in csv_files:
    dict_key = file[:-4]
    df = pd.read_csv(os.path.join(dir_path, file))
    df = df.rename(columns={'Unnamed: 0': 'country'})
    data[dict_key] = df

reversed_keys = []
for key in data:
    parts = key.split('-')
    new_key = '_'.join(reversed(parts))
    globals()[new_key] = data[key]
    reversed_keys.append(new_key)

for name in reversed_keys:
    print(name)

periods = [(2010, 2012), (2012, 2014), (2014, 2016), (2016, 2018)]

aggregated_data = {}

for start, end in periods:
    dfs = []
    for year in range(start, end + 1):
        key = f'mat_ah_{year}'
        if key in reversed_keys:
            dfs.append(globals()[key])
    df = pd.concat(dfs, ignore_index=True)
    df = df.groupby('country').sum().reset_index()
    agg_key = f'agg_ah_{start}_{end}'
    aggregated_data[agg_key] = df
    print(f'For the period {start}-{end}, the aggregated data is stored in: {agg_key}')
    print(df.head(2))
    print()

print("Names of the aggregated datasets:")
print(list(aggregated_data.keys()))

for key, df in aggregated_data.items():
    globals()[key] = df

mat_ah_1984
mat_ah_1987
mat_ah_1982
mat_ah_1980
mat_ah_1978
mat_ah_1981
mat_ah_1985
mat_ah_1983
mat_ah_1988
mat_ah_1979
mat_ah_1986
mat_ah_2018
mat_ah_1991
mat_ah_2004
mat_ah_2007
mat_ah_2015
mat_ah_1995
mat_ah_2013
mat_ah_2009
mat_ah_1989
mat_ah_2002
mat_ah_1999
mat_ah_1997
mat_ah_2008
mat_ah_1998
mat_ah_2014
mat_ah_2003
mat_ah_2006
mat_ah_2005
mat_ah_2011
mat_ah_2010
mat_ah_2012
mat_ah_1993
mat_ah_1996
mat_ah_2000
mat_ah_1994
mat_ah_1990
mat_ah_1992
mat_ah_2001
mat_ah_2017
mat_ah_2016
mat_ah_2021
mat_ah_2020
mat_ah_2019
For the period 2010-2012, the aggregated data is stored in: agg_ah_2010_2012
  country        A01        A21  A22        A23        A24       A41   A42  \
0     AUS  54.336290   3.500000  1.5  33.618720   0.000000  5.823810  3.00   
1     AUT  70.901343  10.833333  1.4  17.536905  12.699444  8.142857  7.45   

         A43        A44  ...    G12        G16       G21  G99         H01  \
0   6.833333   0.873333  ...  0.000  15.037466  3.058333  0.0   78.729486   
1  17.

In [7]:
for agg_key in list(aggregated_data.keys()):
    df = aggregated_data[agg_key]
    total_patents_per_country = df.iloc[:, 1:].sum(axis=1)
    total_patents_per_technology_class = df.iloc[:, 1:].sum(axis=0)
    total_patents = total_patents_per_country.sum()
    rca_df = df.copy()
    for technology_class in df.columns[1:]:
        patents_per_country = df[technology_class]
        patents_per_technology_class = total_patents_per_technology_class[technology_class]
        rca = (patents_per_country / total_patents_per_country) / ((patents_per_technology_class + 1e-8) / total_patents)
        rca_df[technology_class] = rca
    rca_df = rca_df.fillna(0)
    rca_key = agg_key.replace('agg', 'rca')
    aggregated_data[rca_key] = rca_df
    globals()[rca_key] = rca_df
    print(f'RCA values for {agg_key} calculated and saved in {rca_key}')

RCA values for agg_ah_2010_2012 calculated and saved in rca_ah_2010_2012
RCA values for agg_ah_2012_2014 calculated and saved in rca_ah_2012_2014
RCA values for agg_ah_2014_2016 calculated and saved in rca_ah_2014_2016
RCA values for agg_ah_2016_2018 calculated and saved in rca_ah_2016_2018


In [8]:
for rca_key in ['rca_ah_2010_2012', 'rca_ah_2012_2014', 'rca_ah_2014_2016', 'rca_ah_2016_2018']:
    df = globals()[rca_key]
    countries = df['country']
    df = df.drop(columns=['country'])
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.fillna(0)
    binary_df = (df > 1).astype(int)
    binary_df.insert(loc=0, column='country', value=countries)
    binary_key = rca_key.replace('rca', 'binary')
    globals()[binary_key] = binary_df
    print(f'Binary matrix for {rca_key} calculated and saved in {binary_key}')

Binary matrix for rca_ah_2010_2012 calculated and saved in binary_ah_2010_2012
Binary matrix for rca_ah_2012_2014 calculated and saved in binary_ah_2012_2014
Binary matrix for rca_ah_2014_2016 calculated and saved in binary_ah_2014_2016
Binary matrix for rca_ah_2016_2018 calculated and saved in binary_ah_2016_2018


In [9]:
import pandas as pd
import numpy as np
from math import fabs

def apply_algorithm(df, EPS=1e-6):
    Mcp = df.values
    C, P = Mcp.shape

    # Start of iterations
    Simplicity = np.ones(P)
    Fitness = np.ones(C)
    Simplicity0 = Simplicity.copy()
    Fitness0 = Fitness.copy()

    count = 0
    while True:
        count += 1
        for c in range(C):
            Fitness[c] = 1.0
            for p in range(P):
                Fitness[c] += Mcp[c,p]/Simplicity0[p]

        for p in range(P):
            Simplicity[p] = 1.0
            for c in range(C):
                Simplicity[p] += Mcp[c,p]/Fitness0[c]

        # Check the error for the first country only
        err = fabs(Fitness[0] - Fitness0[0])

        # If for the first country the error is small, then check all others to find the maximum
        if err < EPS:
            for c in range(C):
                err = max(err, fabs(Fitness[c] - Fitness0[c]))
            for p in range(P):
                err = max(err, fabs(Simplicity[p] - Simplicity0[p]))
            if err < EPS:
                print("Iteration: ", count, " Final error: ", err)
                break

        print("Iteration: ", count, " Error: ", err)

        Fitness0 = Fitness.copy()
        Simplicity0 = Simplicity.copy()

    # Convert results to pandas Series and return them
    Fitness = pd.Series(Fitness, index=df.index)
    Simplicity = pd.Series(1.0 / (Simplicity - 1.0), index=df.columns)  # Complexity of products

    return Fitness, Simplicity

In [10]:
# Drop the 'country' column and apply the function to each dataframe
Fitness1, Simplicity1 = apply_algorithm(binary_ah_2010_2012.drop(columns='country'))
Fitness2, Simplicity2 = apply_algorithm(binary_ah_2012_2014.drop(columns='country'))
Fitness3, Simplicity3 = apply_algorithm(binary_ah_2014_2016.drop(columns='country'))
Fitness4, Simplicity4 = apply_algorithm(binary_ah_2016_2018.drop(columns='country'))

# Now Fitness1, Simplicity1, Fitness2, Simplicity2, Fitness3, Simplicity3, Fitness4, and Simplicity4
# are pandas Series with the country or product names as the index, and the fitness or simplicity
# as the values.

Iteration:  1  Error:  55.0
Iteration:  2  Error:  52.014225552964355
Iteration:  3  Error:  35.64954056803023
Iteration:  4  Error:  29.263065445566937
Iteration:  5  Error:  25.2773630378747
Iteration:  6  Error:  16.46844373360569
Iteration:  7  Error:  15.17520799592343
Iteration:  8  Error:  7.794926094493572
Iteration:  9  Error:  7.338980620953489
Iteration:  10  Error:  3.2195639413817396
Iteration:  11  Error:  3.0542672919449494
Iteration:  12  Error:  1.2411664819159043
Iteration:  13  Error:  1.1806470510148799
Iteration:  14  Error:  0.4649694361640542
Iteration:  15  Error:  0.4427328746566275
Iteration:  16  Error:  0.17229086203271038
Iteration:  17  Error:  0.16410996743632467
Iteration:  18  Error:  0.06358240443867658
Iteration:  19  Error:  0.060571212672464014
Iteration:  20  Error:  0.023429610350376606
Iteration:  21  Error:  0.022321071848729446
Iteration:  22  Error:  0.008628927199929137
Iteration:  23  Error:  0.008220804724246022
Iteration:  24  Error:  0.00

  Simplicity = pd.Series(1.0 / (Simplicity - 1.0), index=df.columns)  # Complexity of products


Iteration:  6  Error:  17.15999283151595
Iteration:  7  Error:  15.847588974107232
Iteration:  8  Error:  8.205489421903138
Iteration:  9  Error:  7.743319076953831
Iteration:  10  Error:  3.405733235610384
Iteration:  11  Error:  3.2383753018104358
Iteration:  12  Error:  1.3145084119927617
Iteration:  13  Error:  1.253308551275282
Iteration:  14  Error:  0.4922307922931637
Iteration:  15  Error:  0.4697723541266541
Iteration:  16  Error:  0.1822056508745149
Iteration:  17  Error:  0.17395366139885482
Iteration:  18  Error:  0.0671596206761862
Iteration:  19  Error:  0.06412617055171665
Iteration:  20  Error:  0.024716240819302016
Iteration:  21  Error:  0.023600952219965166
Iteration:  22  Error:  0.009091023751082616
Iteration:  23  Error:  0.008680948180490589
Iteration:  24  Error:  0.0033431416820235427
Iteration:  25  Error:  0.003192359483051632
Iteration:  26  Error:  0.0012293193361259114
Iteration:  27  Error:  0.0011738772299594302
Iteration:  28  Error:  0.0004520255709223

  Simplicity = pd.Series(1.0 / (Simplicity - 1.0), index=df.columns)  # Complexity of products


Iteration:  2  Error:  48.44492015244578
Iteration:  3  Error:  32.815339723703424
Iteration:  4  Error:  27.387951948783936
Iteration:  5  Error:  23.6766551127114
Iteration:  6  Error:  15.801602254080809
Iteration:  7  Error:  14.619320111156963
Iteration:  8  Error:  7.6266718414381565
Iteration:  9  Error:  7.21584349223814
Iteration:  10  Error:  3.167203379152692
Iteration:  11  Error:  3.0201570940219824
Iteration:  12  Error:  1.2156797351087434
Iteration:  13  Error:  1.1624917995985236
Iteration:  14  Error:  0.45138032621417423
Iteration:  15  Error:  0.43206876691702334
Iteration:  16  Error:  0.16547117402367206
Iteration:  17  Error:  0.15844987402876853
Iteration:  18  Error:  0.06037343826683639
Iteration:  19  Error:  0.0578193607928128
Iteration:  20  Error:  0.021989576196943972
Iteration:  21  Error:  0.02106033338282387
Iteration:  22  Error:  0.008004120507823842
Iteration:  23  Error:  0.0076660144684659315
Iteration:  24  Error:  0.0029127998321314408
Iteration

  Simplicity = pd.Series(1.0 / (Simplicity - 1.0), index=df.columns)  # Complexity of products


Iteration:  3  Error:  32.55819009773655
Iteration:  4  Error:  27.274085261331297
Iteration:  5  Error:  23.482634794079484
Iteration:  6  Error:  15.774661405073545
Iteration:  7  Error:  14.562569156765342
Iteration:  8  Error:  7.662709674647992
Iteration:  9  Error:  7.239359343241087
Iteration:  10  Error:  3.20744921007935
Iteration:  11  Error:  3.0549993703202176
Iteration:  12  Error:  1.2408979418409665
Iteration:  13  Error:  1.1853943380296492
Iteration:  14  Error:  0.4642169863191832
Iteration:  15  Error:  0.44392839130734885
Iteration:  16  Error:  0.171407239944795
Iteration:  17  Error:  0.16398015860657011
Iteration:  18  Error:  0.06298077799685586
Iteration:  19  Error:  0.06026047544872881
Iteration:  20  Error:  0.02309932909150092
Iteration:  21  Error:  0.02210277340025968
Iteration:  22  Error:  0.008466443537034962
Iteration:  23  Error:  0.00810133900921528
Iteration:  24  Error:  0.0031023895340922536
Iteration:  25  Error:  0.002968623969024975
Iteration:

  Simplicity = pd.Series(1.0 / (Simplicity - 1.0), index=df.columns)  # Complexity of products


In [16]:
Simplicity1.head(10)

A01    0.827514
A21    1.330315
A22    2.137710
A23    0.788672
A24    2.112353
A41    1.044039
A42    2.435310
A43    2.250615
A44    4.139650
A45    1.279778
dtype: float64