# **MiniLM Similarity Model**

## **Dependecies**

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import pickle
import numpy as np
import scipy.sparse as sps
from scipy.sparse import save_npz
import gc
from scipy.sparse import load_npz


## **Load Model**

MiniLM was specifically designed to maintain the performance of the large language models while reducing the model dimensions and thus the required computational resources. This is achieved through various techniques such as knowledge distillation, where a smaller model (the "student") is trained to mimic the behaviour of a larger, pre-trained model (the "teacher").

https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

### Load all-MiniLM Model

In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## **Read Data**

### Load csv

In [3]:
df = pd.read_csv("../../src/merged_orgas.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27397 entries, 0 to 27396
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   iati_id                27397 non-null  object
 1   iati_orga_id           27397 non-null  object
 2   orga_abbreviation      27397 non-null  object
 3   orga_full_name         27397 non-null  object
 4   client                 27397 non-null  object
 5   title_en               27101 non-null  object
 6   title_other            20113 non-null  object
 7   title_main             27397 non-null  object
 8   organization           27397 non-null  object
 9   country_code_list      23273 non-null  object
 10  country                23273 non-null  object
 11  country_name           23252 non-null  object
 12  country_flag           27397 non-null  object
 13  region                 4130 non-null   object
 14  location               16883 non-null  object
 15  description_en     

In [4]:
df.columns

Index(['iati_id', 'iati_orga_id', 'orga_abbreviation', 'orga_full_name',
       'client', 'title_en', 'title_other', 'title_main', 'organization',
       'country_code_list', 'country', 'country_name', 'country_flag',
       'region', 'location', 'description_en', 'description_other',
       'description_main', 'status', 'planned_start', 'actual_start',
       'planned_end', 'actual_end', 'last_update', 'crs_5_code', 'crs_5_name',
       'crs_3_code', 'crs_3_name', 'docs', 'title_and_description',
       'sgd_pred_code', 'sgd_pred_str'],
      dtype='object')

In [5]:
df.head(1)

Unnamed: 0,iati_id,iati_orga_id,orga_abbreviation,orga_full_name,client,title_en,title_other,title_main,organization,country_code_list,...,actual_end,last_update,crs_5_code,crs_5_name,crs_3_code,crs_3_name,docs,title_and_description,sgd_pred_code,sgd_pred_str
0,DE-1-201822287-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,BMZ,Strengthening quality infrastructure for trade...,Stärkung der Qualitätsinfrastruktur für den Ha...,Strengthening quality infrastructure for trade...,Bundesministerium für wirtschaftliche Zusammen...,,...,2016-03-14T00:00:00Z,2024-02-29T00:00:00Z,33130;,Regional trade agreements (RTAs);,331;,Trade Policies & Regulations;,,Strengthening quality infrastructure for trade...,9,"8 9. Build resilient infrastructure, promot..."


### remove duplicates from df

In [6]:
len(df)

27397

In [7]:
df = df.drop_duplicates(keep="first")
len(df)

27397

## **Embeddings**

### calculate embeddings (last run: 31:23 min)

In [8]:
# for test purposes select just 100 random samples
#test_df = df.sample(10000)

In [9]:
sentences = df['title_and_description'].tolist()

In [10]:
embeddings = model.encode(sentences, show_progress_bar=True)

Batches:   0%|          | 0/857 [00:00<?, ?it/s]

In [11]:
embeddings

array([[-0.04930571, -0.02229353, -0.01424212, ..., -0.05813498,
         0.05638228,  0.01333214],
       [-0.05611133, -0.02287832, -0.01245939, ..., -0.01833626,
         0.01593846,  0.00362427],
       [ 0.01444224, -0.0499334 , -0.00599655, ..., -0.02526089,
         0.00581959, -0.06996275],
       ...,
       [-0.01351143,  0.08917006,  0.02691879, ...,  0.07068525,
        -0.01035912, -0.02006389],
       [ 0.01089144,  0.04269383,  0.0635475 , ..., -0.01490737,
        -0.02146529,  0.0100944 ],
       [ 0.00055788,  0.07568505,  0.01871078, ..., -0.10134395,
        -0.08061433, -0.03309683]], dtype=float32)

### save embeddings as pickle

In [12]:
with open("../../src/transformer/embeddings.pkl", "wb") as fOut:
    pickle.dump({"sentences": sentences, "embeddings": embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

### load embeddings from local storage if needed

In [13]:
with open("../../src/transformer/embeddings.pkl", "rb") as fIn:
    stored_data = pickle.load(fIn)
    sentences = stored_data["sentences"]
    embeddings = stored_data["embeddings"]

In [14]:
embeddings

array([[-0.04930571, -0.02229353, -0.01424212, ..., -0.05813498,
         0.05638228,  0.01333214],
       [-0.05611133, -0.02287832, -0.01245939, ..., -0.01833626,
         0.01593846,  0.00362427],
       [ 0.01444224, -0.0499334 , -0.00599655, ..., -0.02526089,
         0.00581959, -0.06996275],
       ...,
       [-0.01351143,  0.08917006,  0.02691879, ...,  0.07068525,
        -0.01035912, -0.02006389],
       [ 0.01089144,  0.04269383,  0.0635475 , ..., -0.01490737,
        -0.02146529,  0.0100944 ],
       [ 0.00055788,  0.07568505,  0.01871078, ..., -0.10134395,
        -0.08061433, -0.03309683]], dtype=float32)

## **Cosine Similarity**

### calculate similarity matrix

In [15]:
similarity_matrix = util.cos_sim(embeddings, embeddings)

In [16]:
similarity_matrix.shape

torch.Size([27397, 27397])

### create sparse matrix

In [17]:
# collect garbage variables and delete to clear memory
gc.collect()

18

In [18]:
threshold = 0.5
mask = similarity_matrix > threshold

In [19]:
sparse_matrix = sps.coo_matrix(similarity_matrix.numpy() * mask.numpy())

### save matrix

In [20]:
save_npz("../../src/similarities.npz", sparse_matrix)

### load matrix

In [21]:
loaded_sparse_matrix = load_npz("../../app/src/similarities.npz")

In [22]:
print(loaded_sparse_matrix)

  (0, 0)	0.9999998807907104
  (0, 2)	0.7788637280464172
  (0, 45)	0.526912271976471
  (0, 48)	0.5594748854637146
  (0, 61)	0.521148681640625
  (0, 95)	0.52028489112854
  (0, 97)	0.52028489112854
  (0, 108)	0.5770738124847412
  (0, 112)	0.5399726033210754
  (0, 138)	0.5660058259963989
  (0, 156)	0.5028297901153564
  (0, 159)	0.5340653657913208
  (0, 188)	0.5956906080245972
  (0, 189)	0.6103914380073547
  (0, 191)	0.6847314834594727
  (0, 198)	0.5413348078727722
  (0, 205)	0.5298223495483398
  (0, 208)	0.5719239115715027
  (0, 236)	0.6185182332992554
  (0, 252)	0.5134038925170898
  (0, 261)	0.5660058259963989
  (0, 276)	0.5863873958587646
  (0, 278)	0.5632790923118591
  (0, 290)	0.6717886924743652
  (0, 294)	0.5548366904258728
  :	:
  (31699, 30904)	0.5664306282997131
  (31699, 30913)	0.5089055299758911
  (31699, 30914)	0.6269343495368958
  (31699, 30967)	0.5254554748535156
  (31699, 30969)	0.518643856048584
  (31699, 30974)	0.516819953918457
  (31699, 30996)	0.5049476623535156
  (31699,

In [23]:
dense_matrix = loaded_sparse_matrix.toarray()
dense_matrix

array([[0.9999999 , 0.        , 0.7788637 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.0000002 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.7788637 , 0.        , 1.0000001 , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.0000002 , 0.56515884,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.56515884, 1.0000001 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.0000002 ]], dtype=float32)