In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import save_npz
from scipy.sparse import load_npz
import scipy.sparse as sps
from tqdm import tqdm 
import gc


## **Load Similarity Matrix**

In [2]:
loaded_sparse_matrix = load_npz("../../src/similarities.npz")

In [3]:
sim_matrix = loaded_sparse_matrix.toarray()
sim_matrix.shape

(35575, 35575)

## **Load Project DF**

In [4]:
project_df = pd.read_csv("../../src/merged_orgas.csv")
project_df.shape

(35575, 23)

In [5]:
project_df.head()

Unnamed: 0,iati_id,iati_orga_id,orga_abbreviation,orga_full_name,title_en,title_other,title_main,country_code_list,country,country_name,...,description_main,status,crs_5_code,crs_5_name,crs_3_code,crs_3_name,title_and_description,sgd_pred_code,sgd_pred_str,client
0,DE-1-201920016-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,Strengthening of Metrology for the Improvement...,Stärkung des Messwesens in Ägypten zur Verbess...,Strengthening of Metrology for the Improvement...,['AG'],AG;,Antigua and Barbuda,...,With a rapidly growing population and a promis...,Implementation,14010;,Water sector policy and administrative managem...,140;,Water Supply & Sanitation;,Strengthening of Metrology for the Improvement...,9,"8 9. Build resilient infrastructure, promot...",bmz
1,DE-1-201721877-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,Strengthening regional integration and coopera...,Stärkung der regionalen Integration und Zusamm...,Strengthening regional integration and coopera...,,,,...,The module supports intra-regional partnership...,Finalisation,33130;,Regional trade agreements (RTAs);,331;,Trade Policies & Regulations;,Strengthening regional integration and coopera...,9,"8 9. Build resilient infrastructure, promot...",bmz
2,DE-1-201822287-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,Strengthening quality infrastructure for trade...,Stärkung der Qualitätsinfrastruktur für den Ha...,Strengthening quality infrastructure for trade...,,,,...,Enhances the efficiency of the SADC Directorat...,Implementation,33130;,Regional trade agreements (RTAs);,331;,Trade Policies & Regulations;,Strengthening quality infrastructure for trade...,9,"8 9. Build resilient infrastructure, promot...",bmz
3,DE-1-197966252,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,,Programm zur Förderung erneuerbarer Energien,Programm zur Förderung erneuerbarer Energien,,,,...,"Ziel der FZ-Maßnahme ist es, die BOAD bei der ...",Finalisation,24030;,Formal sector financial intermediaries;,240;,Banking & Financial Services;,Programm zur Förderung erneuerbarer Energien. ...,4,3 4. Ensure inclusive and equitable quality...,bmz
4,DE-1-199535527,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,Studies and Experts Fund,Studien- und Fachkräftefonds,Studies and Experts Fund,,,,...,Preparation and appraisal of technical coopera...,Finalisation,43010;,Multisector aid;,430;,Other Multisector;,Studies and Experts Fund. Preparation and appr...,9,"8 9. Build resilient infrastructure, promot...",bmz


### Create crs3 and crs5 list columns

SHOULD BE INTEGRATED IN TRANSFORMATION PROCESS

In [6]:
project_df['crs_3_list'] = project_df['crs_3_code'].apply(
    lambda x: [element.strip() for element in x.split(';') if element.strip()] if pd.notna(x) else []
)
project_df['crs_3_list'].head()


0    [140]
1    [331]
2    [331]
3    [240]
4    [430]
Name: crs_3_list, dtype: object

In [7]:
filtered_df = project_df[project_df['crs_3_list'].apply(lambda x: len(x) > 1)]

print(filtered_df["crs_3_list"])

55            [140, 430]
87            [140, 430]
88            [240, 311]
93            [140, 430]
118           [152, 430]
              ...       
31939    [151, 230, 230]
31940         [121, 160]
31941         [151, 151]
31942    [111, 121, 160]
31943         [160, 160]
Name: crs_3_list, Length: 5475, dtype: object


In [8]:
project_df['crs_5_list'] = project_df['crs_5_code'].apply(
    lambda x: [element.strip() for element in x.split(';') if element.strip()] if pd.notna(x) else []
)
project_df['crs_5_list'].head()

0    [14010]
1    [33130]
2    [33130]
3    [24030]
4    [43010]
Name: crs_5_list, dtype: object

In [9]:
filtered_df = project_df[project_df['crs_5_list'].apply(lambda x: len(x) > 1)]

print(filtered_df["crs_5_list"])

55              [14020, 43030]
87              [14020, 43030]
88              [24030, 31193]
93              [14020, 43030]
118             [15220, 43040]
                 ...          
31939    [15110, 23020, 23040]
31940           [12110, 16010]
31941           [15110, 15111]
31942    [11110, 12110, 16010]
31943           [16010, 16010]
Name: crs_5_list, Length: 5475, dtype: object


In [10]:
project_df.columns

Index(['iati_id', 'iati_orga_id', 'orga_abbreviation', 'orga_full_name',
       'title_en', 'title_other', 'title_main', 'country_code_list', 'country',
       'country_name', 'country_flag', 'description_en', 'description_other',
       'description_main', 'status', 'crs_5_code', 'crs_5_name', 'crs_3_code',
       'crs_3_name', 'title_and_description', 'sgd_pred_code', 'sgd_pred_str',
       'client', 'crs_3_list', 'crs_5_list'],
      dtype='object')

## **Extend Similarity Matrix**

In [11]:
n = len(project_df)
final_similarity_matrix = np.zeros((n, n)) 

### **CRS 3**

In [12]:
n = len(project_df)
crs3_sim_matrix = np.zeros((n, n)) 

# search for matches 
matches = project_df["crs_3_code"].values[:, None] == project_df["crs_3_code"].values
crs3_sim_matrix += matches.astype(int) * 0.1

print(crs3_sim_matrix)

[[0.1 0.  0.  ... 0.  0.  0. ]
 [0.  0.1 0.1 ... 0.  0.  0. ]
 [0.  0.1 0.1 ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.1 0.1 0.1]
 [0.  0.  0.  ... 0.1 0.1 0.1]
 [0.  0.  0.  ... 0.1 0.1 0.1]]


In [13]:
final_similarity_matrix += crs3_sim_matrix

In [14]:
# extract all indexes where multiple crs codes appear
multi_code_df = project_df[project_df['crs_3_list'].apply(lambda x: len(x) > 1)]

multi_code_indices = multi_code_df.index.to_list()

n = len(project_df)  
crs3_mc_sim_matrix = np.zeros((n, n))  

list_sets_multi = [set(project_df['crs_3_list'].iloc[idx]) for idx in multi_code_indices]
list_sets_all = project_df['crs_3_list'].apply(set).tolist()

def is_match(set1, set2):
    return not set1.isdisjoint(set2)

for idx, i in enumerate(tqdm(multi_code_indices, desc='Processing')):
    for j in range(n):
        if list_sets_multi[idx] != list_sets_all[j]: 
            if is_match(list_sets_multi[idx], list_sets_all[j]):
                crs3_mc_sim_matrix[i, j] += 0.1

print(crs3_mc_sim_matrix)

Processing: 100%|██████████| 5475/5475 [02:10<00:00, 41.94it/s]

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]





In [15]:
final_similarity_matrix += crs3_mc_sim_matrix

In [16]:
del crs3_sim_matrix, crs3_mc_sim_matrix
gc.collect()

9

### **CRS5**

In [17]:
n = len(project_df)
crs5_sim_matrix = np.zeros((n, n)) 

# search for matches 
matches = project_df["crs_5_code"].values[:, None] == project_df["crs_5_code"].values
crs5_sim_matrix += matches.astype(int) * 0.15

print(crs5_sim_matrix)

[[0.15 0.   0.   ... 0.   0.   0.  ]
 [0.   0.15 0.15 ... 0.   0.   0.  ]
 [0.   0.15 0.15 ... 0.   0.   0.  ]
 ...
 [0.   0.   0.   ... 0.15 0.15 0.15]
 [0.   0.   0.   ... 0.15 0.15 0.15]
 [0.   0.   0.   ... 0.15 0.15 0.15]]


In [18]:
final_similarity_matrix += crs5_sim_matrix

In [19]:
# extract all indexes where multiple crs codes appear
multi_code_df = project_df[project_df['crs_5_list'].apply(lambda x: len(x) > 1)]

multi_code_indices = multi_code_df.index.to_list()

n = len(project_df)  
crs5_mc_sim_matrix = np.zeros((n, n))  

list_sets_multi = [set(project_df['crs_5_list'].iloc[idx]) for idx in multi_code_indices]
list_sets_all = project_df['crs_5_list'].apply(set).tolist()

def is_match(set1, set2):
    return not set1.isdisjoint(set2)

for idx, i in enumerate(tqdm(multi_code_indices, desc='Processing')):
    for j in range(n):
        if list_sets_multi[idx] != list_sets_all[j]: 
            if is_match(list_sets_multi[idx], list_sets_all[j]):
                crs5_mc_sim_matrix[i, j] += 0.15  

print(crs5_mc_sim_matrix)

Processing: 100%|██████████| 5475/5475 [01:48<00:00, 50.26it/s]

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]





In [20]:
final_similarity_matrix += crs5_mc_sim_matrix

In [21]:
del crs5_mc_sim_matrix, crs5_sim_matrix
gc.collect()

9

### **SDG**

In [22]:
n = len(project_df)
sdg_sim_matrix = np.zeros((n, n)) 

# search for matches 
matches = project_df["sgd_pred_code"].values[:, None] == project_df["sgd_pred_code"].values
sdg_sim_matrix += matches.astype(int) * 0.2 

print(sdg_sim_matrix)

[[0.2 0.2 0.2 ... 0.  0.  0. ]
 [0.2 0.2 0.2 ... 0.  0.  0. ]
 [0.2 0.2 0.2 ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.2 0.  0. ]
 [0.  0.  0.  ... 0.  0.2 0. ]
 [0.  0.  0.  ... 0.  0.  0.2]]


In [23]:
final_similarity_matrix += sdg_sim_matrix

### **Add text similarity to final matrix**

In [24]:
del sdg_sim_matrix
gc.collect()

0

In [25]:
final_similarity_matrix += sim_matrix

In [26]:
print(final_similarity_matrix)

[[1.45       0.60987325 0.2        ... 0.         0.         0.        ]
 [0.60987325 1.45000024 1.05729301 ... 0.         0.         0.        ]
 [0.2        1.05729301 1.45000012 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.45       0.81515884 0.25      ]
 [0.         0.         0.         ... 0.81515884 1.45       0.25      ]
 [0.         0.         0.         ... 0.25       0.25       1.45000012]]


In [27]:
np.amax(final_similarity_matrix)

1.4500007152557373

## **Normalize new matrix**

In [28]:
max_value = final_similarity_matrix.max()
print(max_value)

# Normalize the matrix to scale values between 0 and 1
normalized_matrix = np.round(final_similarity_matrix / max_value, 4)

print(normalized_matrix)


1.4500007152557373
[[1.     0.4206 0.1379 ... 0.     0.     0.    ]
 [0.4206 1.     0.7292 ... 0.     0.     0.    ]
 [0.1379 0.7292 1.     ... 0.     0.     0.    ]
 ...
 [0.     0.     0.     ... 1.     0.5622 0.1724]
 [0.     0.     0.     ... 0.5622 1.     0.1724]
 [0.     0.     0.     ... 0.1724 0.1724 1.    ]]


## **Drop 1 values**

In [29]:
# Drop 1 values because there is no perfect match and its most likely a doublicate
normalized_matrix[normalized_matrix >= 1] = 0

## **Create Sparse Matrix**

In [30]:
threshold = 0.35
mask = normalized_matrix > threshold

In [31]:
normalized_matrix

array([[0.    , 0.4206, 0.1379, ..., 0.    , 0.    , 0.    ],
       [0.4206, 0.    , 0.7292, ..., 0.    , 0.    , 0.    ],
       [0.1379, 0.7292, 0.    , ..., 0.    , 0.    , 0.    ],
       ...,
       [0.    , 0.    , 0.    , ..., 0.    , 0.5622, 0.1724],
       [0.    , 0.    , 0.    , ..., 0.5622, 0.    , 0.1724],
       [0.    , 0.    , 0.    , ..., 0.1724, 0.1724, 0.    ]])

In [32]:
sparse_matrix = sps.coo_matrix(normalized_matrix * mask)

## **Export Extended Matrix**

In [33]:
save_npz("../../synergy-app/src/extended_similarities.npz", sparse_matrix)

## **Create sim matrix for mateches between only different orgas**

In [34]:
non_org_sim_matrix = final_similarity_matrix.copy()
non_org_sim_matrix

array([[1.45      , 0.60987325, 0.2       , ..., 0.        , 0.        ,
        0.        ],
       [0.60987325, 1.45000024, 1.05729301, ..., 0.        , 0.        ,
        0.        ],
       [0.2       , 1.05729301, 1.45000012, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.45      , 0.81515884,
        0.25      ],
       [0.        , 0.        , 0.        , ..., 0.81515884, 1.45      ,
        0.25      ],
       [0.        , 0.        , 0.        , ..., 0.25      , 0.25      ,
        1.45000012]])

### **Remove similarities if projects with similar orga**

In [35]:
# search for matches 
matches = project_df["orga_abbreviation"].values[:, None] == project_df["orga_abbreviation"].values
non_org_sim_matrix[matches] = 0 # Set all orga matches similarity to 0

print(non_org_sim_matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [36]:
max_value = non_org_sim_matrix.max()
max_value

1.4500003576278686

### **Normalize**

In [37]:
del final_similarity_matrix, max_value, sparse_matrix, normalized_matrix
gc.collect()

0

In [38]:
max_value = non_org_sim_matrix.max()
print(max_value)

# Normalize the matrix to scale values between 0 and 1
norm_non_orga_matrix = np.round(non_org_sim_matrix / max_value, 4)

print(norm_non_orga_matrix)

1.4500003576278686
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### **Drop 1 values**

In [39]:
# Drop 1 values because there is no perfect match and its most likely a doublicate
norm_non_orga_matrix[norm_non_orga_matrix >= 1] = 0

### **Create sparse**

In [40]:
threshold = 0.3
mask = norm_non_orga_matrix > threshold

In [41]:
sparse_non_orga_matrix = sps.coo_matrix(norm_non_orga_matrix * mask)

### **Export**

In [42]:
save_npz("../../synergy-app/src/extended_similarities_nonsimorga.npz", sparse_non_orga_matrix)