In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import save_npz
from scipy.sparse import load_npz
import scipy.sparse as sps
from tqdm import tqdm 
import gc


## **Load Similarity Matrix**

In [None]:
loaded_sparse_matrix = load_npz("../../src/similarities.npz")

In [None]:
sim_matrix = loaded_sparse_matrix.toarray()
sim_matrix.shape

## **Load Project DF**

In [None]:
project_df = pd.read_csv("../../src/merged_orgas.csv")
project_df.shape

In [None]:
project_df.head()

### Create crs3 and crs5 list columns

SHOULD BE INTEGRATED IN TRANSFORMATION PROCESS

In [49]:
project_df['crs_3_list'] = project_df['crs_3_code'].apply(
    lambda x: [element.strip() for element in x.split(';') if element.strip()] if pd.notna(x) else []
)
project_df['crs_3_list'].head()


0    [140]
1    [331]
2    [331]
3    [240]
4    [430]
Name: crs_3_list, dtype: object

In [50]:
filtered_df = project_df[project_df['crs_3_list'].apply(lambda x: len(x) > 1)]

print(filtered_df["crs_3_list"])

55            [140, 430]
87            [140, 430]
88            [240, 311]
93            [140, 430]
118           [152, 430]
              ...       
31939    [151, 230, 230]
31940         [121, 160]
31941         [151, 151]
31942    [111, 121, 160]
31943         [160, 160]
Name: crs_3_list, Length: 5475, dtype: object


In [51]:
project_df['crs_5_list'] = project_df['crs_5_code'].apply(
    lambda x: [element.strip() for element in x.split(';') if element.strip()] if pd.notna(x) else []
)
project_df['crs_5_list'].head()

0    [14010]
1    [33130]
2    [33130]
3    [24030]
4    [43010]
Name: crs_5_list, dtype: object

In [52]:
filtered_df = project_df[project_df['crs_5_list'].apply(lambda x: len(x) > 1)]

print(filtered_df["crs_5_list"])

55              [14020, 43030]
87              [14020, 43030]
88              [24030, 31193]
93              [14020, 43030]
118             [15220, 43040]
                 ...          
31939    [15110, 23020, 23040]
31940           [12110, 16010]
31941           [15110, 15111]
31942    [11110, 12110, 16010]
31943           [16010, 16010]
Name: crs_5_list, Length: 5475, dtype: object


In [53]:
project_df.columns

Index(['iati_id', 'iati_orga_id', 'orga_abbreviation', 'orga_full_name',
       'title_en', 'title_other', 'title_main', 'country_code_list', 'country',
       'country_name', 'country_flag', 'description_en', 'description_other',
       'description_main', 'status', 'crs_5_code', 'crs_5_name', 'crs_3_code',
       'crs_3_name', 'title_and_description', 'sgd_pred_code', 'sgd_pred_str',
       'client', 'crs_3_list', 'crs_5_list'],
      dtype='object')

## **Extend Similarity Matrix**

In [55]:
n = len(project_df)
final_similarity_matrix = np.zeros((n, n)) 
final_similarity_matrix

MemoryError: Unable to allocate 9.43 GiB for an array with shape (35575, 35575) and data type float64

### **CRS 3**

In [None]:
n = len(project_df)
crs3_sim_matrix = np.zeros((n, n)) 

# search for matches 
matches = project_df["crs_3_code"].values[:, None] == project_df["crs_3_code"].values
crs3_sim_matrix += matches.astype(int) * 0.1

print(crs3_sim_matrix)

In [None]:
final_similarity_matrix += crs3_sim_matrix

In [None]:
multi_code_df = project_df[project_df['crs_3_list'].apply(lambda x: len(x) > 1)]

multi_code_indices = multi_code_df.index.to_list()

n = len(project_df)  
crs3_mc_sim_matrix = np.zeros((n, n))  

list_sets_multi = [set(project_df['crs_3_list'].iloc[idx]) for idx in multi_code_indices]
list_sets_all = project_df['crs_3_list'].apply(set).tolist()

def is_match(set1, set2):
    return not set1.isdisjoint(set2)

for idx, i in enumerate(tqdm(multi_code_indices, desc='Processing')):
    for j in range(n):
        if list_sets_multi[idx] != list_sets_all[j]: 
            if is_match(list_sets_multi[idx], list_sets_all[j]):
                crs3_mc_sim_matrix[i, j] += 0.1

print(crs3_mc_sim_matrix)

In [None]:
final_similarity_matrix += crs3_mc_sim_matrix

In [None]:
del crs3_sim_matrix, crs3_mc_sim_matrix
gc.collect()

### **CRS5**

In [None]:
n = len(project_df)
crs5_sim_matrix = np.zeros((n, n)) 

# search for matches 
matches = project_df["crs_5_code"].values[:, None] == project_df["crs_5_code"].values
crs5_sim_matrix += matches.astype(int) * 0.15

print(crs5_sim_matrix)

In [None]:
final_similarity_matrix += crs5_sim_matrix

In [None]:
multi_code_df = project_df[project_df['crs_5_list'].apply(lambda x: len(x) > 1)]

multi_code_indices = multi_code_df.index.to_list()

n = len(project_df)  
crs5_mc_sim_matrix = np.zeros((n, n))  

list_sets_multi = [set(project_df['crs_5_list'].iloc[idx]) for idx in multi_code_indices]
list_sets_all = project_df['crs_5_list'].apply(set).tolist()

def is_match(set1, set2):
    return not set1.isdisjoint(set2)

for idx, i in enumerate(tqdm(multi_code_indices, desc='Processing')):
    for j in range(n):
        if list_sets_multi[idx] != list_sets_all[j]: 
            if is_match(list_sets_multi[idx], list_sets_all[j]):
                crs5_mc_sim_matrix[i, j] += 0.15  

print(crs5_mc_sim_matrix)

In [None]:
final_similarity_matrix += crs5_mc_sim_matrix

In [None]:
del crs5_mc_sim_matrix, crs5_sim_matrix
gc.collect()

### **SDG**

In [None]:
n = len(project_df)
sdg_sim_matrix = np.zeros((n, n)) 

# search for matches 
matches = project_df["sgd_pred_code"].values[:, None] == project_df["sgd_pred_code"].values
sdg_sim_matrix += matches.astype(int) * 0.2 

print(sdg_sim_matrix)

In [None]:
final_similarity_matrix += sdg_sim_matrix

### **Add text similarity to final matrix**

In [None]:
del sdg_sim_matrix
gc.collect()

In [None]:
final_similarity_matrix += sim_matrix

In [None]:
print(final_similarity_matrix)

In [None]:
np.amax(final_similarity_matrix)

## **Normalize new matrix**

In [None]:
max_value = final_similarity_matrix.max()
print(max_value)

# Normalize the matrix to scale values between 0 and 1
normalized_matrix = np.round(final_similarity_matrix / max_value, 4)

print(normalized_matrix)


## **Drop 1 values**

In [None]:
# Drop 1 values because there is no perfect match and its most likely a doublicate
normalized_matrix[normalized_matrix >= 1] = 0

## **Create Sparse Matrix**

In [None]:
threshold = 0.35
mask = normalized_matrix > threshold

In [None]:
normalized_matrix

In [None]:
sparse_matrix = sps.coo_matrix(normalized_matrix * mask)

## **Export Extended Matrix**

In [None]:
save_npz("../../synergy-app/src/extended_similarities.npz", sparse_matrix)

## **Create sim matrix for mateches between only different orgas**

In [None]:
non_org_sim_matrix = final_similarity_matrix.copy()
non_org_sim_matrix

### **Remove similarities if projects with similar orga**

In [None]:
# search for matches 
matches = project_df["orga_abbreviation"].values[:, None] == project_df["orga_abbreviation"].values
non_org_sim_matrix[matches] = 0 # Set all orga matches similarity to 0

print(non_org_sim_matrix)

In [None]:
max_value = non_org_sim_matrix.max()
max_value

### **Normalize**

In [None]:
del final_similarity_matrix, max_value, sparse_matrix, normalized_matrix
gc.collect()

In [None]:
max_value = non_org_sim_matrix.max()
print(max_value)

# Normalize the matrix to scale values between 0 and 1
norm_non_orga_matrix = np.round(non_org_sim_matrix / max_value, 4)

print(norm_non_orga_matrix)

### **Drop 1 values**

In [None]:
# Drop 1 values because there is no perfect match and its most likely a doublicate
norm_non_orga_matrix[norm_non_orga_matrix >= 1] = 0

### **Create sparse**

In [None]:
threshold = 0.3
mask = norm_non_orga_matrix > threshold

In [None]:
sparse_non_orga_matrix = sps.coo_matrix(norm_non_orga_matrix * mask)

### **Export**

In [None]:
save_npz("../../synergy-app/src/extended_similarities_nonsimorga.npz", sparse_non_orga_matrix)