In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import save_npz
from scipy.sparse import load_npz
import scipy.sparse as sps
from tqdm import tqdm 
import gc


## **Load Similarity Matrix**

In [2]:
# Read the stored sparse project-to-project similarity matrix from disk.
similarity_npz_path = "../../src/similarities.npz"
loaded_sparse_matrix = load_npz(similarity_npz_path)

In [3]:
# Densify the sparse similarities; the dense form is what gets added to
# final_similarity_matrix further down. The shape is displayed as a sanity check.
sim_matrix = loaded_sparse_matrix.toarray()
sim_matrix.shape

(27397, 27397)

## **Load Project DF**

In [4]:
# Project metadata; its row count is compared by eye with the similarity matrix shape.
projects_csv_path = "../../src/merged_orgas.csv"
project_df = pd.read_csv(projects_csv_path)
project_df.shape

(27397, 32)

In [5]:
# Show the first rows to eyeball the loaded columns.
project_df.head()

Unnamed: 0,iati_id,iati_orga_id,orga_abbreviation,orga_full_name,client,title_en,title_other,title_main,organization,country_code_list,...,actual_end,last_update,crs_5_code,crs_5_name,crs_3_code,crs_3_name,docs,title_and_description,sgd_pred_code,sgd_pred_str
0,DE-1-201822287-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,BMZ,Strengthening quality infrastructure for trade...,Stärkung der Qualitätsinfrastruktur für den Ha...,Strengthening quality infrastructure for trade...,Bundesministerium für wirtschaftliche Zusammen...,,...,2016-03-14T00:00:00Z,2024-02-29T00:00:00Z,33130;,Regional trade agreements (RTAs);,331;,Trade Policies & Regulations;,,Strengthening quality infrastructure for trade...,9,"8 9. Build resilient infrastructure, promot..."
1,DE-1-201920016-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,BMZ,Strengthening of Metrology for the Improvement...,Stärkung des Messwesens in Ägypten zur Verbess...,Strengthening of Metrology for the Improvement...,Bundesministerium für wirtschaftliche Zusammen...,['AG'],...,2016-03-14T00:00:00Z,2024-02-29T00:00:00Z,14010;,Water sector policy and administrative managem...,140;,Water Supply & Sanitation;,,Strengthening of Metrology for the Improvement...,8,"7 8. Promote sustained, inclusive and susta..."
2,DE-1-201721877-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,BMZ,Strengthening regional integration and coopera...,Stärkung der regionalen Integration und Zusamm...,Strengthening regional integration and coopera...,Bundesministerium für wirtschaftliche Zusammen...,,...,2016-03-14T00:00:00Z,2024-02-29T00:00:00Z,33130;,Regional trade agreements (RTAs);,331;,Trade Policies & Regulations;,,Strengthening regional integration and coopera...,9,"8 9. Build resilient infrastructure, promot..."
3,DE-1-201276351-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,BMZ,Strengthening Non-Violent Popular Movements in...,Kapazitätsentwicklung für gewaltfreie Basisbew...,Strengthening Non-Violent Popular Movements in...,Bundesministerium für wirtschaftliche Zusammen...,['VU'],...,2016-03-14T00:00:00Z,2024-03-20T00:00:00Z,15160;,Human rights;,151;,Government & Civil Society-general;,,Strengthening Non-Violent Popular Movements in...,10,9 10. Reduce inequality within and among co...
4,DE-1-201676584-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,BMZ,Rebuilding Further Arts after cyclone,Wiederaufbau von Further Arts nach Wirbelsturm,Rebuilding Further Arts after cyclone,Bundesministerium für wirtschaftliche Zusammen...,['VU'],...,2016-03-14T00:00:00Z,2024-03-20T00:00:00Z,73010;,Immediate post-emergency reconstruction and re...,730;,Reconstruction Relief & Rehabilitation;,,Rebuilding Further Arts after cyclone. Rebuild...,15,"14 15. Protect, restore and promote sustain..."


### Create crs3 and crs5 list columns

**TODO:** This step should be integrated into the data transformation process instead of being done in this notebook.

In [6]:
def _split_crs3_codes(raw_codes):
    """Split a ';'-separated code string into stripped, non-empty codes ([] when missing)."""
    if not pd.notna(raw_codes):
        return []
    stripped_parts = (part.strip() for part in raw_codes.split(';'))
    return [part for part in stripped_parts if part]


project_df['crs_3_list'] = project_df['crs_3_code'].apply(_split_crs3_codes)
project_df['crs_3_list'].head()


0    [331]
1    [140]
2    [331]
3    [151]
4    [730]
Name: crs_3_list, dtype: object

In [7]:
# Keep only the projects whose CRS-3 list holds more than one entry.
has_several_crs3_codes = project_df['crs_3_list'].apply(len) > 1
filtered_df = project_df.loc[has_several_crs3_codes]

print(filtered_df["crs_3_list"])

23                      [140, 140, 140]
36                      [250, 113, 430]
37                      [231, 231, 321]
38                           [140, 140]
44                           [121, 231]
                      ...              
24242                        [130, 160]
24243                        [232, 250]
24244    [111, 121, 140, 151, 151, 160]
24245                   [121, 122, 130]
24246              [111, 121, 140, 160]
Name: crs_3_list, Length: 4365, dtype: object


In [8]:
def _split_crs5_codes(raw_codes):
    """Split a ';'-separated code string into stripped, non-empty codes ([] when missing)."""
    if not pd.notna(raw_codes):
        return []
    stripped_parts = (part.strip() for part in raw_codes.split(';'))
    return [part for part in stripped_parts if part]


project_df['crs_5_list'] = project_df['crs_5_code'].apply(_split_crs5_codes)
project_df['crs_5_list'].head()

0    [33130]
1    [14010]
2    [33130]
3    [15160]
4    [73010]
Name: crs_5_list, dtype: object

In [9]:
# Keep only the projects whose CRS-5 list holds more than one entry.
has_several_crs5_codes = project_df['crs_5_list'].apply(len) > 1
filtered_df = project_df.loc[has_several_crs5_codes]

print(filtered_df["crs_5_list"])

23                            [14015, 14021, 14010]
36                            [25010, 11330, 43040]
37                            [23110, 23181, 32130]
38                                   [14021, 14015]
44                                   [12191, 23183]
                            ...                    
24242                                [13010, 16062]
24243                                [23220, 25010]
24244    [11110, 12110, 14010, 15112, 15130, 16010]
24245                         [12110, 12240, 13020]
24246                  [11110, 12110, 14010, 16010]
Name: crs_5_list, Length: 4365, dtype: object


In [10]:
# List all columns, including the two list columns added above.
project_df.columns

Index(['iati_id', 'iati_orga_id', 'orga_abbreviation', 'orga_full_name',
       'client', 'title_en', 'title_other', 'title_main', 'organization',
       'country_code_list', 'country', 'country_name', 'country_flag',
       'region', 'location', 'description_en', 'description_other',
       'description_main', 'status', 'planned_start', 'actual_start',
       'planned_end', 'actual_end', 'last_update', 'crs_5_code', 'crs_5_name',
       'crs_3_code', 'crs_3_name', 'docs', 'title_and_description',
       'sgd_pred_code', 'sgd_pred_str', 'crs_3_list', 'crs_5_list'],
      dtype='object')

## **Extend Similarity Matrix**

In [11]:
# Accumulator for all similarity components: one row and one column per project.
n = project_df.shape[0]
final_similarity_matrix = np.zeros(shape=(n, n), dtype=np.float64)

### **CRS 3**

In [12]:
n = len(project_df)

# Bonus of 0.1 for every pair of projects whose raw CRS-3 code string is identical.
# NOTE(review): the raw 'crs_3_code' strings are compared, so the same codes written
# in a different order or with repeats (e.g. "140;140;" vs "140;") do not match here.
crs3_codes = project_df["crs_3_code"].values
matches = crs3_codes[:, None] == crs3_codes

# Build the weighted matrix in one step. The previous zeros + astype(int) + multiply
# sequence allocated three extra n x n arrays for the same values.
crs3_sim_matrix = np.where(matches, 0.1, 0.0)

print(crs3_sim_matrix)

[[0.1 0.  0.1 ... 0.  0.  0. ]
 [0.  0.1 0.  ... 0.  0.  0. ]
 [0.1 0.  0.1 ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.1 0.1 0.1]
 [0.  0.  0.  ... 0.1 0.1 0.1]
 [0.  0.  0.  ... 0.1 0.1 0.1]]


In [13]:
# Fold the CRS-3 exact-match bonus into the accumulator (in place).
# Not idempotent: running this cell twice adds the bonus twice.
np.add(final_similarity_matrix, crs3_sim_matrix, out=final_similarity_matrix)

In [14]:
# extract all indexes where multiple crs codes appear
multi_code_df = project_df[project_df['crs_3_list'].apply(lambda x: len(x) > 1)]

multi_code_indices = multi_code_df.index.to_list()

n = len(project_df)  
crs3_mc_sim_matrix = np.zeros((n, n))  

list_sets_multi = [set(project_df['crs_3_list'].iloc[idx]) for idx in multi_code_indices]
list_sets_all = project_df['crs_3_list'].apply(set).tolist()

def is_match(set1, set2):
    return not set1.isdisjoint(set2)

for idx, i in enumerate(tqdm(multi_code_indices, desc='Processing')):
    for j in range(n):
        if list_sets_multi[idx] != list_sets_all[j]: 
            if is_match(list_sets_multi[idx], list_sets_all[j]):
                crs3_mc_sim_matrix[i, j] += 0.1

print(crs3_mc_sim_matrix)

Processing: 100%|██████████| 4365/4365 [01:12<00:00, 59.93it/s]

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]





In [15]:
# Fold the CRS-3 partial-overlap bonus into the accumulator (in place).
# Not idempotent: running this cell twice adds the bonus twice.
np.add(final_similarity_matrix, crs3_mc_sim_matrix, out=final_similarity_matrix)

### **CRS5**

In [16]:
n = len(project_df)

# Bonus of 0.15 for every pair of projects whose raw CRS-5 code string is identical.
# NOTE(review): the raw 'crs_5_code' strings are compared, so the same codes written
# in a different order or with repeats do not match here.
crs5_codes = project_df["crs_5_code"].values
matches = crs5_codes[:, None] == crs5_codes

# Build the weighted matrix in one step. The previous zeros + astype(int) + multiply
# sequence allocated three extra n x n arrays for the same values.
crs5_sim_matrix = np.where(matches, 0.15, 0.0)

print(crs5_sim_matrix)

[[0.15 0.   0.15 ... 0.   0.   0.  ]
 [0.   0.15 0.   ... 0.   0.   0.  ]
 [0.15 0.   0.15 ... 0.   0.   0.  ]
 ...
 [0.   0.   0.   ... 0.15 0.15 0.15]
 [0.   0.   0.   ... 0.15 0.15 0.15]
 [0.   0.   0.   ... 0.15 0.15 0.15]]


In [17]:
# Fold the CRS-5 exact-match bonus into the accumulator (in place).
# Not idempotent: running this cell twice adds the bonus twice.
np.add(final_similarity_matrix, crs5_sim_matrix, out=final_similarity_matrix)

In [18]:
# extract all indexes where multiple crs codes appear
multi_code_df = project_df[project_df['crs_5_list'].apply(lambda x: len(x) > 1)]

multi_code_indices = multi_code_df.index.to_list()

n = len(project_df)  
crs5_mc_sim_matrix = np.zeros((n, n))  

list_sets_multi = [set(project_df['crs_5_list'].iloc[idx]) for idx in multi_code_indices]
list_sets_all = project_df['crs_5_list'].apply(set).tolist()

def is_match(set1, set2):
    return not set1.isdisjoint(set2)

for idx, i in enumerate(tqdm(multi_code_indices, desc='Processing')):
    for j in range(n):
        if list_sets_multi[idx] != list_sets_all[j]: 
            if is_match(list_sets_multi[idx], list_sets_all[j]):
                crs5_mc_sim_matrix[i, j] += 0.15  

print(crs5_mc_sim_matrix)

Processing: 100%|██████████| 4365/4365 [01:13<00:00, 59.60it/s]

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]





In [19]:
# Fold the CRS-5 partial-overlap bonus into the accumulator (in place).
# Not idempotent: running this cell twice adds the bonus twice.
np.add(final_similarity_matrix, crs5_mc_sim_matrix, out=final_similarity_matrix)

### **SDG**

In [20]:
n = len(project_df)

# Bonus of 0.2 for every pair of projects with the same predicted SDG code
# (column 'sgd_pred_code').
sdg_codes = project_df["sgd_pred_code"].values
matches = sdg_codes[:, None] == sdg_codes

# Build the weighted matrix in one step. The previous zeros + astype(int) + multiply
# sequence allocated three extra n x n arrays for the same values.
sdg_sim_matrix = np.where(matches, 0.2, 0.0)

print(sdg_sim_matrix)

[[0.2 0.  0.2 ... 0.  0.  0. ]
 [0.  0.2 0.  ... 0.  0.  0. ]
 [0.2 0.  0.2 ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.2 0.  0. ]
 [0.  0.  0.  ... 0.  0.2 0. ]
 [0.  0.  0.  ... 0.  0.  0.2]]


In [21]:
# Fold the SDG bonus into the accumulator (in place).
# Not idempotent: running this cell twice adds the bonus twice.
np.add(final_similarity_matrix, sdg_sim_matrix, out=final_similarity_matrix)

### **Add text similarity to final matrix**

In [22]:
# Release the per-criterion matrices now that they are folded into final_similarity_matrix.
del sdg_sim_matrix, crs3_sim_matrix, crs3_mc_sim_matrix, crs5_mc_sim_matrix, crs5_sim_matrix
# `matches` is an n x n boolean array left over from the SDG cell. It is released too;
# the same-organisation cell further down rebuilds it before using it.
del matches
gc.collect()

9

In [23]:
# Add the text similarity on top of the categorical bonuses (in place).
# Not idempotent: running this cell twice adds the text similarity twice.
np.add(final_similarity_matrix, sim_matrix, out=final_similarity_matrix)

In [24]:
# Inspect the combined (not yet normalised) scores.
print(final_similarity_matrix)

[[1.45000012 0.         1.05729301 ... 0.         0.         0.        ]
 [0.         1.45       0.         ... 0.         0.         0.        ]
 [1.05729301 0.         1.45000024 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.45       0.81515884 0.25      ]
 [0.         0.         0.         ... 0.81515902 1.45000012 0.25      ]
 [0.         0.         0.         ... 0.25       0.25       1.45000036]]


In [25]:
# Largest combined score; it is the divisor used for normalisation below.
final_similarity_matrix.max()

1.4500007152557373

## **Normalize new matrix**

In [26]:
max_value = np.max(final_similarity_matrix)
print(max_value)

# Divide by the largest score so the maximum becomes 1, keeping four decimals
normalized_matrix = (final_similarity_matrix / max_value).round(4)

print(normalized_matrix)


1.4500007152557373
[[1.     0.     0.7292 ... 0.     0.     0.    ]
 [0.     1.     0.     ... 0.     0.     0.    ]
 [0.7292 0.     1.     ... 0.     0.     0.    ]
 ...
 [0.     0.     0.     ... 1.     0.5622 0.1724]
 [0.     0.     0.     ... 0.5622 1.     0.1724]
 [0.     0.     0.     ... 0.1724 0.1724 1.    ]]


## **Drop 1 values**

In [27]:
# Drop values of 1: a perfect score is most likely the project itself or a duplicate.
normalized_matrix[normalized_matrix >= 1] = 0
# A project must never be exported as similar to itself. The ">= 1" rule only clears
# a diagonal entry when the project reaches the maximum score against itself. The
# exact-match cells rely on `==`, which is False for NaN vs NaN, so a project with a
# missing CRS code would score below 1 against itself and survive the rule above.
np.fill_diagonal(normalized_matrix, 0)

## **Create Sparse Matrix**

In [28]:
# Only scores strictly above this cut-off are kept for the exported matrix.
threshold = 0.3
mask = np.greater(normalized_matrix, threshold)

In [29]:
# Display the normalised matrix after the values of 1 were zeroed.
normalized_matrix

array([[0.    , 0.    , 0.7292, ..., 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       [0.7292, 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       ...,
       [0.    , 0.    , 0.    , ..., 0.    , 0.5622, 0.1724],
       [0.    , 0.    , 0.    , ..., 0.5622, 0.    , 0.1724],
       [0.    , 0.    , 0.    , ..., 0.1724, 0.1724, 0.    ]])

In [30]:
# Zero every entry outside the mask and store the result in COO sparse format.
sparse_matrix = sps.coo_matrix(np.multiply(normalized_matrix, mask))

## **Export Extended Matrix**

In [31]:
# Write the thresholded, extended similarity matrix where the app reads it.
extended_matrix_path = "../../synergy-app/src/extended_similarities.npz"
save_npz(extended_matrix_path, sparse_matrix)

## **Create similarity matrix for matches between different organisations only**

In [32]:
# Work on an independent copy so final_similarity_matrix itself stays untouched.
non_org_sim_matrix = np.copy(final_similarity_matrix)
non_org_sim_matrix

array([[1.45000012, 0.        , 1.05729301, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.45      , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.05729301, 0.        , 1.45000024, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.45      , 0.81515884,
        0.25      ],
       [0.        , 0.        , 0.        , ..., 0.81515902, 1.45000012,
        0.25      ],
       [0.        , 0.        , 0.        , ..., 0.25      , 0.25      ,
        1.45000036]])

### **Remove similarities between projects of the same organisation**

In [33]:
# Pairs of projects whose organisation abbreviation is identical.
orga_codes = project_df["orga_abbreviation"].values
matches = orga_codes[:, np.newaxis] == orga_codes[np.newaxis, :]
non_org_sim_matrix[matches] = 0  # blank out every same-organisation pair

print(non_org_sim_matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [34]:
# Largest score that remains between projects of different organisations.
max_value = np.max(non_org_sim_matrix)
max_value

1.4500003576278686

### **Normalize**

In [35]:
max_value = np.max(non_org_sim_matrix)
print(max_value)

# Divide by the largest score so the maximum becomes 1, keeping four decimals
norm_non_orga_matrix = (non_org_sim_matrix / max_value).round(4)

print(norm_non_orga_matrix)

1.4500003576278686
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### **Drop 1 values**

In [36]:
# Drop values of 1: a perfect score is most likely a duplicate.
# NOTE(review): the matrix was scaled by its own maximum, so its largest entry is
# always 1 and is dropped here whether or not it is a true duplicate. Confirm intent.
norm_non_orga_matrix[norm_non_orga_matrix >= 1] = 0
# A project must never be exported as similar to itself. The same-organisation mask
# relies on `==`, which is False for NaN vs NaN, so a project without an organisation
# abbreviation would keep its diagonal entry; clear the diagonal explicitly.
np.fill_diagonal(norm_non_orga_matrix, 0)

### **Create sparse**

In [37]:
# Only scores strictly above this cut-off are kept for the cross-organisation export.
threshold = 0.2
mask = np.greater(norm_non_orga_matrix, threshold)

In [38]:
# Zero every entry outside the mask and store the result in COO sparse format.
sparse_non_orga_matrix = sps.coo_matrix(np.multiply(norm_non_orga_matrix, mask))

### **Export**

In [39]:
# Write the cross-organisation similarity matrix where the app reads it.
nonsimorga_matrix_path = "../../synergy-app/src/extended_similarities_nonsimorga.npz"
save_npz(nonsimorga_matrix_path, sparse_non_orga_matrix)