# **MiniLM Similarity Model**

## **Dependencies**

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import pickle
import numpy as np
import scipy.sparse as sps
from scipy.sparse import save_npz
import gc
from scipy.sparse import load_npz


## **Load Model**

MiniLM was specifically designed to maintain the performance of large language models while reducing the model size and thus the required computational resources. This is achieved through techniques such as knowledge distillation, where a smaller model (the "student") is trained to mimic the behaviour of a larger, pre-trained model (the "teacher").

https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

### Load all-MiniLM Model

In [2]:
# Load the all-MiniLM-L6-v2 sentence-embedding model by its published name.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

## **Read Data**

### Load csv

In [3]:
# Read merged_orgas.csv into a DataFrame and print its column / non-null / dtype summary.
df = pd.read_csv(filepath_or_buffer="../../src/merged_orgas.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35575 entries, 0 to 35574
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   iati_id                35575 non-null  object
 1   iati_orga_id           35575 non-null  object
 2   orga_abbreviation      35575 non-null  object
 3   orga_full_name         35575 non-null  object
 4   title_en               35116 non-null  object
 5   title_other            26789 non-null  object
 6   title_main             35575 non-null  object
 7   country_code_list      30493 non-null  object
 8   country                30493 non-null  object
 9   country_name           30473 non-null  object
 10  country_flag           35575 non-null  object
 11  description_en         34488 non-null  object
 12  description_other      26712 non-null  object
 13  description_main       35477 non-null  object
 14  status                 35575 non-null  object
 15  crs_5_code         

In [4]:
# List every column label of the loaded frame.
df.columns

Index(['iati_id', 'iati_orga_id', 'orga_abbreviation', 'orga_full_name',
       'title_en', 'title_other', 'title_main', 'country_code_list', 'country',
       'country_name', 'country_flag', 'description_en', 'description_other',
       'description_main', 'status', 'crs_5_code', 'crs_5_name', 'crs_3_code',
       'crs_3_name', 'title_and_description', 'sgd_pred_code', 'sgd_pred_str',
       'client'],
      dtype='object')

In [5]:
# Preview a single record to check what one row looks like.
df.head(n=1)

Unnamed: 0,iati_id,iati_orga_id,orga_abbreviation,orga_full_name,title_en,title_other,title_main,country_code_list,country,country_name,...,description_main,status,crs_5_code,crs_5_name,crs_3_code,crs_3_name,title_and_description,sgd_pred_code,sgd_pred_str,client
0,DE-1-201920016-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,Strengthening of Metrology for the Improvement...,Stärkung des Messwesens in Ägypten zur Verbess...,Strengthening of Metrology for the Improvement...,['AG'],AG;,Antigua and Barbuda,...,With a rapidly growing population and a promis...,Implementation,14010;,Water sector policy and administrative managem...,140;,Water Supply & Sanitation;,Strengthening of Metrology for the Improvement...,9,"8 9. Build resilient infrastructure, promot...",bmz


## **Embeddings**

### calculate embeddings (last run: 31:23 min)

In [8]:
# For test purposes: uncomment to draw a random sample of 10,000 rows instead of using the full frame
#test_df = df.sample(10000)

In [4]:
# One text per row: the `title_and_description` column as a plain Python list.
sentences = df["title_and_description"].to_list()

In [5]:
# Encode every text into an embedding vector, showing a progress bar over the batches.
embeddings = model.encode(
    sentences=sentences,
    show_progress_bar=True,
)

Batches:   0%|          | 0/1112 [00:00<?, ?it/s]

In [11]:
# Display the (truncated) embedding array returned by the model.
embeddings

array([[-0.05611133, -0.02287832, -0.01245939, ..., -0.01833626,
         0.01593846,  0.00362427],
       [ 0.01444224, -0.0499334 , -0.00599655, ..., -0.02526089,
         0.00581959, -0.06996275],
       [-0.04930571, -0.02229353, -0.01424212, ..., -0.05813498,
         0.05638228,  0.01333214],
       ...,
       [-0.01351141,  0.08917006,  0.02691877, ...,  0.07068525,
        -0.01035911, -0.02006389],
       [ 0.01089143,  0.04269382,  0.06354749, ..., -0.01490738,
        -0.0214653 ,  0.0100944 ],
       [ 0.00055788,  0.07568505,  0.01871078, ..., -0.10134395,
        -0.08061433, -0.03309683]], dtype=float32)

### save embeddings as pickle

In [12]:
# Persist the texts together with their embeddings so the encoding step can be skipped later.
with open("../../src/transformer/embeddings.pkl", mode="wb") as fOut:
    pickle.dump(
        {"sentences": sentences, "embeddings": embeddings},
        fOut,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

### load embeddings from local storage if needed

In [13]:
# NOTE: pickle.load can execute arbitrary code — only load a file this notebook wrote itself.
with open("../../src/transformer/embeddings.pkl", mode="rb") as fIn:
    stored_data = pickle.load(fIn)

# Unpack once the file handle is closed; the lookups do not need the open file.
sentences = stored_data["sentences"]
embeddings = stored_data["embeddings"]

In [14]:
# Display the embeddings that were read back from the pickle file.
embeddings

array([[-0.05611133, -0.02287832, -0.01245939, ..., -0.01833626,
         0.01593846,  0.00362427],
       [ 0.01444224, -0.0499334 , -0.00599655, ..., -0.02526089,
         0.00581959, -0.06996275],
       [-0.04930571, -0.02229353, -0.01424212, ..., -0.05813498,
         0.05638228,  0.01333214],
       ...,
       [-0.01351141,  0.08917006,  0.02691877, ...,  0.07068525,
        -0.01035911, -0.02006389],
       [ 0.01089143,  0.04269382,  0.06354749, ..., -0.01490738,
        -0.0214653 ,  0.0100944 ],
       [ 0.00055788,  0.07568505,  0.01871078, ..., -0.10134395,
        -0.08061433, -0.03309683]], dtype=float32)

## **Cosine Similarity**

### calculate similarity matrix

In [15]:
# Pairwise cosine similarity of every embedding against every embedding (N x N result).
similarity_matrix = util.cos_sim(a=embeddings, b=embeddings)

In [16]:
# Check the dimensions of the similarity tensor (one row and one column per text).
similarity_matrix.size()

torch.Size([35575, 35575])

### create sparse matrix

In [17]:
# Drop this notebook's `embeddings` name, then force a garbage-collection pass.
# NOTE(review): if the load cell was run, `stored_data["embeddings"]` still refers to
# the same array, so its memory is only reclaimable once that reference is gone too — confirm.
del embeddings
gc.collect()

0

In [18]:
# Keep only pairs whose cosine similarity is strictly above this cut-off.
threshold = 0.35
# Boolean tensor: True wherever the similarity exceeds the cut-off.
mask = similarity_matrix.gt(threshold)

In [19]:
# Build the sparse matrix from the surviving entries only.
# Multiplying the dense similarity matrix by the mask would first allocate another
# full N x N float array just to be scanned for non-zeros; gathering the masked
# coordinates directly avoids that temporary. The stored entries are the same as
# long as the threshold is positive (no kept value is exactly zero).
dense_similarity = similarity_matrix.numpy()  # view on the tensor's memory, no copy
kept_rows, kept_cols = np.nonzero(mask.numpy())  # row-major coordinates of kept pairs
sparse_matrix = sps.coo_matrix(
    (dense_similarity[kept_rows, kept_cols], (kept_rows, kept_cols)),
    shape=dense_similarity.shape,
)
# The helper names are no longer needed once the sparse matrix exists.
del dense_similarity, kept_rows, kept_cols

### save matrix

In [20]:
# Write the thresholded similarities to disk in SciPy's .npz sparse format.
sps.save_npz(file="../../src/similarities.npz", matrix=sparse_matrix)

### load matrix

In [21]:
# Read the sparse similarity matrix back from disk.
loaded_sparse_matrix = sps.load_npz(file="../../src/similarities.npz")

In [22]:
# Print the stored (row, column) -> similarity entries as a sanity check of the reload.
print(loaded_sparse_matrix)

  (0, 0)	1.0
  (0, 1)	0.40987324714660645
  (0, 9)	0.4692996144294739
  (0, 87)	0.4334986209869385
  (0, 212)	0.5124557614326477
  (0, 580)	0.5096009969711304
  (0, 655)	0.5890508890151978
  (0, 773)	0.47568202018737793
  (0, 782)	0.4000920057296753
  (0, 985)	0.5248297452926636
  (0, 1366)	0.4316326379776001
  (0, 1411)	0.45594102144241333
  (0, 1445)	0.40196606516838074
  (0, 1447)	0.42272526025772095
  (0, 1676)	0.49519771337509155
  (0, 1851)	0.4958919882774353
  (0, 1916)	0.4569401741027832
  (0, 1957)	0.4001171588897705
  (0, 2489)	0.5611311197280884
  (0, 2814)	0.41348257660865784
  (0, 2889)	0.4781516492366791
  (0, 3654)	0.5267900228500366
  (0, 3914)	0.409831166267395
  (0, 4046)	0.4335138201713562
  (0, 4140)	0.4178239405155182
  :	:
  (35574, 35451)	0.4346131682395935
  (35574, 35453)	0.5107957124710083
  (35574, 35454)	0.47138547897338867
  (35574, 35460)	0.5328255295753479
  (35574, 35461)	0.4587545096874237
  (35574, 35463)	0.49336934089660645
  (35574, 35467)	0.53458774

In [23]:
# Expand the sparse matrix to a dense NumPy array and display it for inspection.
# NOTE(review): at 35575 x 35575 float32 this is roughly 5 GB of RAM — confirm it is needed.
dense_matrix = loaded_sparse_matrix.toarray()
dense_matrix

array([[1.        , 0.40987325, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.40987325, 1.0000002 , 0.607293  , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.607293  , 1.0000001 , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.56515884,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.56515884, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.0000001 ]], dtype=float32)