In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import save_npz
from scipy.sparse import load_npz
import scipy.sparse as sps

## **Load Similarity Matrix**

In [2]:
# Read the stored similarity matrix from its .npz file (still in sparse form here).
loaded_sparse_matrix = load_npz(
    file="../../app/src/similarities.npz",
)

In [3]:
# Expand to a dense ndarray; the cells below add to it with NumPy broadcasting.
sim_matrix = loaded_sparse_matrix.toarray()
np.shape(sim_matrix)

(31700, 31700)

## **Load Project DF**

In [4]:
# Project metadata table. The cells below compare its rows pairwise, so row i is
# presumably the same project as row/column i of sim_matrix -- TODO confirm.
project_df = pd.read_csv(
    filepath_or_buffer="../../src/merged_orgas.csv",
)
project_df.shape

(31700, 30)

In [5]:
# List the available columns; the cells below use crs_3_code, crs_5_code and sgd_pred_code.
project_df.columns

Index(['iati_id', 'iati_orga_id', 'orga_abbreviation', 'orga_full_name',
       'client', 'title_en', 'title_other', 'title_main', 'organization',
       'country_code', 'country', 'region', 'location', 'description_en',
       'description_other', 'description_main', 'status', 'planned_start',
       'actual_start', 'planned_end', 'actual_end', 'last_update',
       'crs_5_code', 'crs_5_name', 'crs_3_code', 'crs_3_name', 'docs',
       'title_and_description', 'sgd_pred_code', 'sgd_pred_str'],
      dtype='object')

## **Extend Similarity Matrix**

### **CRS 3**

In [6]:
# Bonus added to the similarity of every project pair that shares a CRS-3 code.
CRS3_MATCH_BONUS = 0.5

# Pairwise equality via broadcasting: matches[i, j] is True when rows i and j of
# project_df carry the same crs_3_code. If missing codes are stored as float NaN,
# those rows never match anything (NaN != NaN), not even themselves.
crs3_codes = project_df["crs_3_code"].to_numpy()
matches = crs3_codes[:, None] == crs3_codes

# Add the bonus in place, only where the mask is True. This avoids the two extra
# full-size (n, n) temporaries that `matches.astype(int) * 0.5` would allocate.
# NOTE: sim_matrix is mutated in place, so re-running this cell adds the bonus a
# second time. Re-run from the cell that builds sim_matrix instead.
np.add(sim_matrix, CRS3_MATCH_BONUS, out=sim_matrix, where=matches)

print(sim_matrix)

[[1.4999999 0.        1.2788637 ... 0.        0.        0.       ]
 [0.        1.5000002 0.        ... 0.        0.        0.       ]
 [1.2788637 0.        1.5000001 ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 1.5000002 1.0651588 0.5      ]
 [0.        0.        0.        ... 1.0651588 1.5000001 0.5      ]
 [0.        0.        0.        ... 0.5       0.5       1.5000002]]


### **CRS 5**

In [7]:
# Bonus added to the similarity of every project pair that shares a CRS-5 code.
CRS5_MATCH_BONUS = 1.0

# Pairwise equality via broadcasting: matches[i, j] is True when rows i and j of
# project_df carry the same crs_5_code. If missing codes are stored as float NaN,
# those rows never match anything (NaN != NaN), not even themselves.
crs5_codes = project_df["crs_5_code"].to_numpy()
matches = crs5_codes[:, None] == crs5_codes

# Add the bonus in place, only where the mask is True. This avoids the extra
# full-size (n, n) temporaries that `matches.astype(int) * 1` would allocate.
# NOTE: sim_matrix is mutated in place, so re-running this cell adds the bonus a
# second time. Re-run from the cell that builds sim_matrix instead.
np.add(sim_matrix, CRS5_MATCH_BONUS, out=sim_matrix, where=matches)

print(sim_matrix)

[[2.5       0.        2.2788637 ... 0.        0.        0.       ]
 [0.        2.5000002 0.        ... 0.        0.        0.       ]
 [2.2788637 0.        2.5       ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 2.5000002 2.0651588 1.5      ]
 [0.        0.        0.        ... 2.0651588 2.5       1.5      ]
 [0.        0.        0.        ... 1.5       1.5       2.5000002]]


### **SDG**

In [8]:
# Bonus added to the similarity of every project pair with the same predicted SDG code.
SDG_MATCH_BONUS = 0.5

# The column really is spelled "sgd_pred_code" in the CSV (see project_df.columns).
# Pairwise equality via broadcasting: matches[i, j] is True when rows i and j carry
# the same code. If missing codes are stored as float NaN, those rows never match
# anything (NaN != NaN), not even themselves.
sdg_codes = project_df["sgd_pred_code"].to_numpy()
matches = sdg_codes[:, None] == sdg_codes

# Add the bonus in place, only where the mask is True. This avoids the two extra
# full-size (n, n) temporaries that `matches.astype(int) * 0.5` would allocate.
# NOTE: sim_matrix is mutated in place, so re-running this cell adds the bonus a
# second time. Re-run from the cell that builds sim_matrix instead.
np.add(sim_matrix, SDG_MATCH_BONUS, out=sim_matrix, where=matches)

print(sim_matrix)

[[3.        0.        2.7788637 ... 0.        0.        0.       ]
 [0.        3.0000002 0.        ... 0.        0.        0.       ]
 [2.7788637 0.        3.        ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 3.0000002 2.0651588 1.5      ]
 [0.        0.        0.        ... 2.0651588 3.        1.5      ]
 [0.        0.        0.        ... 1.5       1.5       3.0000002]]


## **Normalize new matrix**

In [9]:
# Largest entry of the extended matrix; used as the scaling denominator.
max_value = np.max(sim_matrix)
print(max_value)

# Divide by the maximum so the largest entry becomes 1, then keep four decimals.
normalized_matrix = (sim_matrix / max_value).round(4)

print(normalized_matrix)

3.0000007
[[1.     0.     0.9263 ... 0.     0.     0.    ]
 [0.     1.     0.     ... 0.     0.     0.    ]
 [0.9263 0.     1.     ... 0.     0.     0.    ]
 ...
 [0.     0.     0.     ... 1.     0.6884 0.5   ]
 [0.     0.     0.     ... 0.6884 1.     0.5   ]
 [0.     0.     0.     ... 0.5    0.5    1.    ]]


## **Create Sparse Matrix**

In [19]:
# Entries must be strictly greater than this value to be kept.
# With the recorded maximum of about 3.0, a single 0.5 bonus on its own normalizes
# to roughly 0.1667 and so falls below the cut-off -- presumably intentional, confirm.
threshold = 0.17
mask = np.greater(normalized_matrix, threshold)

In [20]:
# Multiply by the boolean mask to zero out below-threshold entries, then convert the
# result to COO format. The product is kept inline so the dense temporary is not
# held alive in the notebook namespace.
sparse_matrix = sps.coo_matrix(
    np.multiply(normalized_matrix, mask)
)

## **Export Extended Matrix**

In [21]:
# Write the thresholded, normalized matrix to disk in .npz format.
save_npz(
    file="../../synergy-app/src/extended_similarities.npz",
    matrix=sparse_matrix,
)