# **MiniLM Similarity Model**

## **Dependencies**

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import pickle
import numpy as np
import scipy.sparse as sps
from scipy.sparse import save_npz
import gc
from scipy.sparse import load_npz


## **Load Model**

MiniLM was specifically designed to retain most of the performance of large language models while reducing the model size and thus the required computational resources. This is achieved through techniques such as knowledge distillation, where a smaller model (the "student") is trained to mimic the behaviour of a larger, pre-trained model (the "teacher").

https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

### Load all-MiniLM Model

In [4]:
# Instantiate the sentence-embedding model by its model name
# (model card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2).
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

## **Read Data**

### Load csv

In [5]:
# Read the merged organisations CSV into a DataFrame, then print its schema
# summary (columns, non-null counts, dtypes).
df = pd.read_csv(filepath_or_buffer="../../src/merged_orgas.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31700 entries, 0 to 31699
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   iati_id                31700 non-null  object
 1   iati_orga_id           31700 non-null  object
 2   orga_abbreviation      31700 non-null  object
 3   orga_full_name         31700 non-null  object
 4   client                 31700 non-null  object
 5   title_en               31326 non-null  object
 6   title_other            23675 non-null  object
 7   title_main             31700 non-null  object
 8   organization           31700 non-null  object
 9   country_code           26589 non-null  object
 10  country                26589 non-null  object
 11  region                 5117 non-null   object
 12  location               19454 non-null  object
 13  description_en         30676 non-null  object
 14  description_other      23578 non-null  object
 15  description_main   

In [6]:
# List every column name of the loaded frame.
df.columns

Index(['iati_id', 'iati_orga_id', 'orga_abbreviation', 'orga_full_name',
       'client', 'title_en', 'title_other', 'title_main', 'organization',
       'country_code', 'country', 'region', 'location', 'description_en',
       'description_other', 'description_main', 'status', 'planned_start',
       'actual_start', 'planned_end', 'actual_end', 'last_update',
       'crs_5_code', 'crs_5_name', 'crs_3_code', 'crs_3_name', 'docs',
       'title_and_description', 'sgd_pred_code', 'sgd_pred_str'],
      dtype='object')

In [7]:
# Preview the first record to see what the individual fields contain.
df.head(1)

Unnamed: 0,iati_id,iati_orga_id,orga_abbreviation,orga_full_name,client,title_en,title_other,title_main,organization,country_code,...,actual_end,last_update,crs_5_code,crs_5_name,crs_3_code,crs_3_name,docs,title_and_description,sgd_pred_code,sgd_pred_str
0,DE-1-201401124-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,BMZ,Kenia Protected Relief and Recovery Programme ...,Kenia Protected Relief and Recovery Programme ...,Kenia Protected Relief and Recovery Programme ...,Bundesministerium für wirtschaftliche Zusammen...,['KE'],...,2019-02-15T00:00:00Z,2024-03-06T00:00:00Z,52010;,Food assistance;,520;,Development Food Assistance;,"['https://www.bmz.de/de/laender/kenia', 'https...",Kenia Protected Relief and Recovery Programme ...,2,"1 2. End hunger, achieve food security and ..."


### remove duplicates from df

In [8]:
# Row count before de-duplication (compare with the count after dropping duplicates).
len(df)

31700

In [9]:
# Drop rows that are exact duplicates across all columns, keeping the first
# occurrence of each.
# The index is rebuilt afterwards so row labels stay 0..n-1: the texts,
# embeddings and similarity matrix computed later are addressed purely by
# position, and a gappy index would stop lining up with those positions as
# soon as a single row is removed. When nothing is dropped this is a no-op.
df = df.drop_duplicates(keep="first").reset_index(drop=True)
len(df)

31700

## **Embeddings**

### calculate embeddings (last run: 31:23 min)

In [10]:
# for test purposes, work on a random subset only (the sample size below is 10,000 rows)
# NOTE(review): `test_df` is not referenced by any other visible cell — the
# encoding step reads from `df` directly, so enabling this line alone has no effect.
#test_df = df.sample(10000)

In [11]:
# Input texts for the encoder: the `title_and_description` column as a plain
# Python list, in row order.
sentences = df["title_and_description"].to_list()

In [12]:
# Encode every text into a dense vector (slow: see the run time noted in the
# section heading). `batch_size` and `convert_to_numpy` are written out at
# their library defaults so the batch count and the NumPy return type are
# explicit; the result is unchanged.
embeddings = model.encode(
    sentences,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
)

Batches:   0%|          | 0/991 [00:00<?, ?it/s]

In [13]:
# Inspect the freshly computed embeddings (NumPy abbreviates the printout).
embeddings

array([[-0.00542458,  0.07612374, -0.01190117, ..., -0.01088327,
        -0.08634929, -0.0527638 ],
       [ 0.00396193,  0.05668392, -0.02865329, ...,  0.01807695,
         0.01017955, -0.02630395],
       [-0.05679462,  0.04753661,  0.0053708 , ..., -0.05990424,
        -0.08364748, -0.0485591 ],
       ...,
       [-0.01351139,  0.08917006,  0.02691875, ...,  0.07068525,
        -0.0103591 , -0.0200639 ],
       [ 0.01089143,  0.04269382,  0.0635475 , ..., -0.01490735,
        -0.0214653 ,  0.01009444],
       [ 0.00055788,  0.07568503,  0.0187108 , ..., -0.10134399,
        -0.08061431, -0.03309685]], dtype=float32)

### save embeddings as pickle

In [14]:
# Persist the texts together with their vectors so the slow encoding step
# does not have to be repeated in a later session.
with open("../../src/transformer/embeddings.pkl", "wb") as pkl_file:
    pickle.dump(
        {
            "sentences": sentences,
            "embeddings": embeddings,
        },
        pkl_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

### load embeddings from local storage if needed

In [3]:
# Restore the cached texts and vectors instead of re-encoding.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open("../../src/transformer/embeddings.pkl", "rb") as pkl_file:
    stored_data = pickle.load(pkl_file)

# Unpack after the file has been closed; the dict keys mirror the save cell.
sentences, embeddings = stored_data["sentences"], stored_data["embeddings"]

In [4]:
# Confirm the embeddings restored from the pickle file (NumPy abbreviates the printout).
embeddings

array([[-0.00542458,  0.07612374, -0.01190117, ..., -0.01088327,
        -0.08634929, -0.0527638 ],
       [ 0.00396193,  0.05668392, -0.02865329, ...,  0.01807695,
         0.01017955, -0.02630395],
       [-0.05679462,  0.04753661,  0.0053708 , ..., -0.05990424,
        -0.08364748, -0.0485591 ],
       ...,
       [-0.01351139,  0.08917006,  0.02691875, ...,  0.07068525,
        -0.0103591 , -0.0200639 ],
       [ 0.01089143,  0.04269382,  0.0635475 , ..., -0.01490735,
        -0.0214653 ,  0.01009444],
       [ 0.00055788,  0.07568503,  0.0187108 , ..., -0.10134399,
        -0.08061431, -0.03309685]], dtype=float32)

## **Cosine Similarity**

### calculate similarity matrix

In [5]:
# Pairwise cosine similarity of every embedding with every other embedding.
# The result is a dense square torch tensor (31700 x 31700 in the recorded run),
# so it is memory-hungry — roughly 4 GB assuming float32 entries.
similarity_matrix = util.cos_sim(embeddings, embeddings)

In [18]:
# Sanity check: expect one row and one column per input text.
similarity_matrix.shape

torch.Size([31700, 31700])

### create sparse matrix

In [6]:
# collect garbage variables and delete to clear memory
# Run a garbage-collection pass before the memory-heavy steps that follow.
# Note: gc.collect() only reclaims objects that are no longer reachable; it
# does not delete variables. Anything still bound to a name (e.g. `embeddings`,
# `sentences`) stays in memory until it is explicitly `del`-ed.
# The displayed return value is the number of unreachable objects found.
gc.collect()

40

In [7]:
# Similarity cutoff: only pairs scoring strictly above this value are kept.
# NOTE(review): the recorded contents of similarities.npz further down include
# values below 0.5 (e.g. 0.479), so that output was not produced with this
# cutoff — re-run the save/load cells or confirm the intended threshold.
threshold = 0.5

# Boolean tensor of the same shape, True where similarity > threshold.
mask = similarity_matrix.gt(threshold)

In [8]:
# Build the sparse matrix directly from the coordinates that passed the
# threshold. Multiplying the dense similarity matrix by the mask would first
# allocate a second full-size dense array (about 4 GB for 31700 x 31700
# float32) only to throw most of it away; here the extra memory is
# proportional to the number of kept entries.
# For a non-negative threshold and finite similarities this stores the same
# entries, in the same row-major order, as the dense product would.
kept_rows, kept_cols = np.nonzero(mask.numpy())
sparse_matrix = sps.coo_matrix(
    (similarity_matrix.numpy()[kept_rows, kept_cols], (kept_rows, kept_cols)),
    shape=tuple(similarity_matrix.shape),
)

### save matrix

In [9]:
# Write the thresholded similarities into the app's source directory.
# `compressed=True` is the library default, spelled out for clarity.
save_npz(
    file="../../app/src/similarities.npz",
    matrix=sparse_matrix,
    compressed=True,
)

### load matrix

In [23]:
# Read the stored matrix back from disk to verify the saved file.
loaded_sparse_matrix = sps.load_npz("../../app/src/similarities.npz")

In [24]:
# Print the stored entries as "(row, col) value" lines.
# NOTE(review): the recorded output lists values below 0.5 (e.g. 0.479 at
# (0, 10)), which a strict `> 0.5` cutoff would have removed — the file on
# disk appears to come from a run with a lower threshold; confirm and regenerate.
print(loaded_sparse_matrix)

  (0, 0)	0.9999999
  (0, 2)	0.7788637
  (0, 10)	0.47926852
  (0, 33)	0.47682035
  (0, 45)	0.5269123
  (0, 48)	0.5594749
  (0, 61)	0.5211487
  (0, 68)	0.48016948
  (0, 95)	0.5202849
  (0, 97)	0.5202849
  (0, 108)	0.5770738
  (0, 112)	0.5399726
  (0, 134)	0.49919048
  (0, 138)	0.5660058
  (0, 156)	0.5028298
  (0, 159)	0.53406537
  (0, 188)	0.5956906
  (0, 189)	0.61039144
  (0, 191)	0.6847315
  (0, 198)	0.5413348
  (0, 205)	0.52982235
  (0, 208)	0.5719239
  (0, 236)	0.61851823
  (0, 252)	0.5134039
  (0, 261)	0.5660058
  :	:
  (31699, 30996)	0.50494766
  (31699, 30998)	0.63078773
  (31699, 31000)	0.48562074
  (31699, 31003)	0.53423905
  (31699, 31052)	0.52794164
  (31699, 31053)	0.5302916
  (31699, 31067)	0.5296182
  (31699, 31081)	0.5550677
  (31699, 31085)	0.6148765
  (31699, 31096)	0.6461128
  (31699, 31136)	0.49704528
  (31699, 31137)	0.6598016
  (31699, 31268)	0.5693135
  (31699, 31421)	0.58891875
  (31699, 31426)	0.4885249
  (31699, 31553)	0.56600904
  (31699, 31608)	0.5368564
  (316

In [25]:
# Expand the sparse matrix back into a dense NumPy array for inspection.
# Caution: this materialises the full square matrix again (31700 x 31700
# float32 in the recorded run, about 4 GB), giving up the memory savings of
# the sparse format.
dense_matrix = loaded_sparse_matrix.toarray()
dense_matrix

array([[0.9999999 , 0.        , 0.7788637 , ..., 0.        , 0.4758543 ,
        0.        ],
       [0.        , 1.0000002 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.7788637 , 0.        , 1.0000001 , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.0000002 , 0.56515884,
        0.        ],
       [0.47585428, 0.        , 0.        , ..., 0.56515884, 1.0000001 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.0000002 ]], dtype=float32)