In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import save_npz
from scipy.sparse import load_npz
import scipy.sparse as sps
import tqdm

## **Load Similarity Matrix**

In [3]:
# Read the stored similarity matrix back from its SciPy sparse .npz archive.
loaded_sparse_matrix = sps.load_npz("../../app/src/similarities.npz")

In [4]:
# Expand the sparse matrix into a dense ndarray; the cells below update it in place.
sim_matrix = loaded_sparse_matrix.toarray()
np.shape(sim_matrix)

(31700, 31700)

## **Load Project DF**

In [5]:
# Load the project metadata that the similarity matrix refers to.
project_df = pd.read_csv("../../src/merged_orgas.csv")

# Every cell below compares project_df columns pairwise and applies the result to
# sim_matrix, so row i of the frame must correspond to row/column i of the matrix.
# Fail here with a clear message instead of a broadcasting error further down.
assert sim_matrix.shape == (len(project_df), len(project_df)), (
    f"similarity matrix {sim_matrix.shape} does not match {len(project_df)} project rows"
)

project_df.shape

(31700, 30)

In [6]:
# List the available columns; the cells below use crs_3_code, crs_5_code,
# sgd_pred_code and orga_abbreviation.
project_df.columns

Index(['iati_id', 'iati_orga_id', 'orga_abbreviation', 'orga_full_name',
       'client', 'title_en', 'title_other', 'title_main', 'organization',
       'country_code', 'country', 'region', 'location', 'description_en',
       'description_other', 'description_main', 'status', 'planned_start',
       'actual_start', 'planned_end', 'actual_end', 'last_update',
       'crs_5_code', 'crs_5_name', 'crs_3_code', 'crs_3_name', 'docs',
       'title_and_description', 'sgd_pred_code', 'sgd_pred_str'],
      dtype='object')

## **Extend Similarity Matrix**

### **CRS 3**

In [7]:
# Boost the similarity of every project pair that shares the same `crs_3_code`.
CRS3_MATCH_BONUS = 0.15  # amount added to each matching pair

crs3_codes = project_df["crs_3_code"].values
# Broadcasting a column vector against a row vector yields an (n, n) boolean mask.
# NOTE(review): missing codes (NaN) are not expected to compare equal, so such projects
# would get no bonus, not even on the diagonal - confirm this is the intended handling.
matches = crs3_codes[:, None] == crs3_codes

# Add the bonus in place, only where the mask is set. The former
# `matches.astype(int) * 0.15` materialised full int64 and float64 copies of the
# (n, n) mask before adding, which is several GB each at this matrix size.
# NOTE: sim_matrix is mutated, so re-running this cell adds the bonus a second time.
np.add(sim_matrix, CRS3_MATCH_BONUS, out=sim_matrix, where=matches)

print(sim_matrix)

[[1.1499999 0.        0.9288637 ... 0.        0.        0.       ]
 [0.        1.1500002 0.        ... 0.        0.        0.       ]
 [0.9288637 0.        1.1500001 ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 1.1500002 0.7151588 0.15     ]
 [0.        0.        0.        ... 0.7151588 1.1500001 0.15     ]
 [0.        0.        0.        ... 0.15      0.15      1.1500002]]


### **CRS 5**

In [8]:
# Boost the similarity of every project pair that shares the same `crs_5_code`.
CRS5_MATCH_BONUS = 0.25  # amount added to each matching pair

crs5_codes = project_df["crs_5_code"].values
# Broadcasting a column vector against a row vector yields an (n, n) boolean mask.
# NOTE(review): missing codes (NaN) are not expected to compare equal, so such projects
# would get no bonus, not even on the diagonal - confirm this is the intended handling.
matches = crs5_codes[:, None] == crs5_codes

# Add the bonus in place, only where the mask is set. The former
# `matches.astype(int) * 0.25` materialised full int64 and float64 copies of the
# (n, n) mask before adding, which is several GB each at this matrix size.
# NOTE: sim_matrix is mutated, so re-running this cell adds the bonus a second time.
np.add(sim_matrix, CRS5_MATCH_BONUS, out=sim_matrix, where=matches)

print(sim_matrix)

[[1.3999999 0.        1.1788638 ... 0.        0.        0.       ]
 [0.        1.4000002 0.        ... 0.        0.        0.       ]
 [1.1788638 0.        1.4000001 ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 1.4000002 0.9651588 0.4      ]
 [0.        0.        0.        ... 0.9651588 1.4000001 0.4      ]
 [0.        0.        0.        ... 0.4       0.4       1.4000002]]


### **SDG**

In [9]:
# Boost the similarity of every project pair with the same predicted SDG code.
# The column really is spelled `sgd_pred_code` in the CSV.
SDG_MATCH_BONUS = 0.25  # amount added to each matching pair

sdg_codes = project_df["sgd_pred_code"].values
# Broadcasting a column vector against a row vector yields an (n, n) boolean mask.
# NOTE(review): missing codes (NaN) are not expected to compare equal, so such projects
# would get no bonus, not even on the diagonal - confirm this is the intended handling.
matches = sdg_codes[:, None] == sdg_codes

# Add the bonus in place, only where the mask is set. The former
# `matches.astype(int) * 0.25` materialised full int64 and float64 copies of the
# (n, n) mask before adding, which is several GB each at this matrix size.
# NOTE: sim_matrix is mutated, so re-running this cell adds the bonus a second time.
np.add(sim_matrix, SDG_MATCH_BONUS, out=sim_matrix, where=matches)

print(sim_matrix)

[[1.6499999 0.        1.4288638 ... 0.        0.        0.       ]
 [0.        1.6500002 0.        ... 0.        0.        0.       ]
 [1.4288638 0.        1.6500001 ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 1.6500002 0.9651588 0.4      ]
 [0.        0.        0.        ... 0.9651588 1.6500001 0.4      ]
 [0.        0.        0.        ... 0.4       0.4       1.6500002]]


## **Normalize new matrix**

In [10]:
# Largest score after all bonuses; used as the divisor below.
max_value = np.max(sim_matrix)
print(max_value)

# Rescale so that the largest entry becomes 1, keeping four decimal places.
normalized_matrix = (sim_matrix / max_value).round(4)

print(normalized_matrix)


1.6500007
[[1.     0.     0.866  ... 0.     0.     0.    ]
 [0.     1.     0.     ... 0.     0.     0.    ]
 [0.866  0.     1.     ... 0.     0.     0.    ]
 ...
 [0.     0.     0.     ... 1.     0.5849 0.2424]
 [0.     0.     0.     ... 0.5849 1.     0.2424]
 [0.     0.     0.     ... 0.2424 0.2424 1.    ]]


## **Drop 1 values**

In [11]:
# A project must never be reported as similar to itself, so clear the diagonal
# explicitly. The `>= 1` test below only removes a self-match when that project's
# diagonal score rounds to 1.0, which fails for any project that did not receive every
# bonus on its own diagonal (e.g. a code value that does not compare equal to itself).
np.fill_diagonal(normalized_matrix, 0)

# Drop the remaining 1 values: a perfect match between two different projects is
# most likely a duplicate entry.
normalized_matrix[normalized_matrix >= 1] = 0

## **Create Sparse Matrix**

In [12]:
# Only scores strictly greater than this cutoff are kept in the exported matrix.
threshold = 0.3
mask = np.greater(normalized_matrix, threshold)

In [13]:
# Inspect the normalized matrix after the 1 values were set to 0 (the mask is not applied yet).
normalized_matrix

array([[0.    , 0.    , 0.866 , ..., 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       [0.866 , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       ...,
       [0.    , 0.    , 0.    , ..., 0.    , 0.5849, 0.2424],
       [0.    , 0.    , 0.    , ..., 0.5849, 0.    , 0.2424],
       [0.    , 0.    , 0.    , ..., 0.2424, 0.2424, 0.    ]],
      dtype=float32)

In [14]:
# Multiplying by the boolean mask zeroes every score at or below the threshold;
# the COO conversion then keeps only the surviving entries.
sparse_matrix = sps.coo_matrix(np.multiply(normalized_matrix, mask))

## **Export Extended Matrix**

In [15]:
# Write the thresholded sparse matrix to disk as an .npz archive.
sps.save_npz("../../synergy-app/src/extended_similarities.npz", sparse_matrix)

## **Create sim matrix for matches between different orgas only**

In [16]:
# Start from an independent copy of the un-normalized scores so that sim_matrix
# itself is not modified when same-orga pairs are zeroed below.
non_org_sim_matrix = sim_matrix.copy(order="C")
non_org_sim_matrix

array([[1.6499999, 0.       , 1.4288638, ..., 0.       , 0.       ,
        0.       ],
       [0.       , 1.6500002, 0.       , ..., 0.       , 0.       ,
        0.       ],
       [1.4288638, 0.       , 1.6500001, ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 1.6500002, 0.9651588,
        0.4      ],
       [0.       , 0.       , 0.       , ..., 0.9651588, 1.6500001,
        0.4      ],
       [0.       , 0.       , 0.       , ..., 0.4      , 0.4      ,
        1.6500002]], dtype=float32)

### **Remove similarities between projects of the same orga**

In [17]:
# Boolean (n, n) mask that is True wherever two projects carry the same orga abbreviation.
orga_labels = project_df["orga_abbreviation"].values
matches = orga_labels.reshape(-1, 1) == orga_labels

# Pairs from the same orga are excluded by setting their similarity to 0.
non_org_sim_matrix[matches] = 0

print(non_org_sim_matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [18]:
# Highest score that remains once same-orga pairs have been zeroed.
max_value = np.max(non_org_sim_matrix)
max_value

1.6500006

### **Normalize**

In [19]:
# Largest cross-orga score; used as the divisor below.
max_value = np.max(non_org_sim_matrix)
print(max_value)

# Rescale so that the largest entry becomes 1, keeping four decimal places.
norm_non_orga_matrix = (non_org_sim_matrix / max_value).round(4)

print(norm_non_orga_matrix)

1.6500006
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### **Drop 1 values**

In [20]:
# A project must never be reported as similar to itself, so clear the diagonal
# explicitly rather than relying on the same-orga mask or the `>= 1` test to hit it
# (neither does if a row's orga abbreviation does not compare equal to itself and its
# diagonal score is below the maximum).
np.fill_diagonal(norm_non_orga_matrix, 0)

# Drop the remaining 1 values: a perfect match between two different projects is
# most likely a duplicate entry.
norm_non_orga_matrix[norm_non_orga_matrix >= 1] = 0

### **Create sparse**

In [21]:
# Only cross-orga scores strictly greater than this cutoff are kept in the export.
threshold = 0.2
mask = np.greater(norm_non_orga_matrix, threshold)

In [22]:
# Multiplying by the boolean mask zeroes every score at or below the threshold;
# the COO conversion then keeps only the surviving entries.
sparse_non_orga_matrix = sps.coo_matrix(np.multiply(norm_non_orga_matrix, mask))

### **Export**

In [23]:
# Write the thresholded cross-orga sparse matrix to disk as an .npz archive.
sps.save_npz("../../synergy-app/src/extended_similarities_nonsimorga.npz", sparse_non_orga_matrix)