# **MiniLM Similarity Model**

## **Dependencies**

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import pickle
import numpy as np
import scipy.sparse as sps
from scipy.sparse import save_npz
import gc
from scipy.sparse import load_npz


## **Load Model**

MiniLM was specifically designed to maintain the performance of the large language models while reducing the model dimensions and thus the required computational resources. This is achieved through various techniques such as knowledge distillation, where a smaller model (the "student") is trained to mimic the behaviour of a larger, pre-trained model (the "teacher").

https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

### Load all-MiniLM Model

In [2]:
# Checkpoint name passed to sentence-transformers (model card linked above).
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

## **Read Data**

### Load csv

In [3]:
# Load ../../src/non_closed.csv into `df` and print a per-column summary.
# NOTE(review): the recorded output ends with an echo of the read_csv line, which looks like
# the tail of a pandas warning (possibly about mixed column dtypes) — confirm and, if so,
# pass explicit dtypes.
CSV_PATH = "../../src/non_closed.csv"
df = pd.read_csv(CSV_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34523 entries, 0 to 34522
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Unnamed: 0             34523 non-null  int64 
 1   iati_id                34523 non-null  object
 2   iati_orga_id           34523 non-null  object
 3   orga_abbreviation      34523 non-null  object
 4   orga_full_name         34523 non-null  object
 5   client                 34523 non-null  object
 6   title_en               34083 non-null  object
 7   title_other            26056 non-null  object
 8   title_main             34523 non-null  object
 9   organization           34523 non-null  object
 10  country_code           28454 non-null  object
 11  country                28454 non-null  object
 12  region                 6079 non-null   object
 13  location               21264 non-null  object
 14  description_en         33449 non-null  object
 15  description_other  

  df =pd.read_csv("../../src/non_closed.csv")


In [4]:
# show the column labels of the loaded frame
df.columns

Index(['Unnamed: 0', 'iati_id', 'iati_orga_id', 'orga_abbreviation',
       'orga_full_name', 'client', 'title_en', 'title_other', 'title_main',
       'organization', 'country_code', 'country', 'region', 'location',
       'description_en', 'description_other', 'description_main', 'status',
       'planned_start', 'actual_start', 'planned_end', 'actual_end',
       'last_update', 'crs_5_code', 'crs_5_name', 'crs_3_code', 'crs_3_name',
       'docs', 'title_and_description', 'sgd_pred_code', 'sgd_pred_str'],
      dtype='object')

In [5]:
# preview the first row to see what the values look like
df.head(1)

Unnamed: 0.1,Unnamed: 0,iati_id,iati_orga_id,orga_abbreviation,orga_full_name,client,title_en,title_other,title_main,organization,...,actual_end,last_update,crs_5_code,crs_5_name,crs_3_code,crs_3_name,docs,title_and_description,sgd_pred_code,sgd_pred_str
0,0,DE-1-201420207,DE-1,giz,Deutsche Gesellschaft für Internationale Zusam...,BMZ,Environmental and climate governance,Umwelt- und Klimagovernance,Environmental and climate governance,Bundesministerium für wirtschaftliche Zusammen...,...,2022-06-30T00:00:00Z,2024-02-29T00:00:00Z,41010;,Environmental policy and administrative manage...,410;,General Environment Protection;,['https://www.giz.de/projektdaten/projects.act...,"Environmental and climate governance. Public, ...",9,"8 Goal 9. Build resilient infrastructure, p..."


### remove duplicates from df

In [6]:
# row count before de-duplication
len(df)

34523

In [7]:
# Drop rows that are exact duplicates across ALL columns, keeping the first occurrence
# (pandas' default), then show the remaining row count.
# NOTE(review): the recorded count is unchanged (34523). The 'Unnamed: 0' column presumably
# holds a unique row id, which would prevent any two rows from ever matching — confirm
# whether a `subset=` of content columns was intended.
df = df.drop_duplicates()
len(df)

34523

## **Embeddings**

### calculate embeddings (last run: 31:23 min)

In [8]:
# for test purposes select just a random subset (10,000 samples) instead of the full data
#test_df = df.sample(10000)

In [9]:
# Pull the combined title/description text of every row into a plain Python list,
# which is what the encoder is given in the next step.
text_column = df['title_and_description']
sentences = text_column.to_list()

In [10]:
# Encode all sentences with the loaded model; the progress bar reports the batches processed.
embeddings = model.encode(
    sentences,
    show_progress_bar=True,
)

Batches:   0%|          | 0/1079 [00:00<?, ?it/s]

In [11]:
# inspect the embedding array (NumPy prints a truncated view)
embeddings

array([[ 0.04446795,  0.06397948, -0.00366975, ...,  0.04417085,
        -0.01882802,  0.0103603 ],
       [ 0.08771117,  0.05170955,  0.05286624, ...,  0.10659182,
        -0.01310572,  0.10529935],
       [ 0.00639981,  0.05616608, -0.04642351, ...,  0.05253979,
         0.03234456, -0.01128971],
       ...,
       [ 0.05137816,  0.08635731,  0.04291049, ...,  0.02325561,
         0.00975689,  0.04211039],
       [ 0.06849658,  0.06803955, -0.02111158, ..., -0.02403513,
        -0.02469418, -0.0532915 ],
       [-0.04827396,  0.01469501,  0.03287731, ...,  0.03966878,
         0.02058095,  0.07663082]], dtype=float32)

### save embeddings as pickle

In [12]:
# Persist the sentences together with their embeddings so the encoding step
# does not have to be repeated on the next run.
embedding_cache = {"sentences": sentences, "embeddings": embeddings}
with open("../../src/transformer/non_closed_embeddings.pkl", "wb") as out_file:
    pickle.dump(embedding_cache, out_file, protocol=pickle.HIGHEST_PROTOCOL)

### load embeddings from local storage if needed

In [13]:
# Restore the cached sentences and embeddings from the pickle file.
# NOTE: pickle.load can execute arbitrary code contained in the file — only load
# files that were produced by this notebook / a trusted source.
with open("../../src/transformer/non_closed_embeddings.pkl", "rb") as in_file:
    stored_data = pickle.load(in_file)

sentences, embeddings = stored_data["sentences"], stored_data["embeddings"]

In [4]:
# inspect the embeddings currently in memory (e.g. after loading them from disk)
embeddings

array([[ 0.04446795,  0.06397948, -0.00366975, ...,  0.04417085,
        -0.01882802,  0.0103603 ],
       [ 0.08771117,  0.05170955,  0.05286624, ...,  0.10659182,
        -0.01310572,  0.10529935],
       [ 0.00639981,  0.05616608, -0.04642351, ...,  0.05253979,
         0.03234456, -0.01128971],
       ...,
       [ 0.05137816,  0.08635731,  0.04291049, ...,  0.02325561,
         0.00975689,  0.04211039],
       [ 0.06849658,  0.06803955, -0.02111158, ..., -0.02403513,
        -0.02469418, -0.0532915 ],
       [-0.04827396,  0.01469501,  0.03287731, ...,  0.03966878,
         0.02058095,  0.07663082]], dtype=float32)

## **Cosine Similarity**

### calculate similarity matrix

In [14]:
# Pairwise cosine similarity of every embedding with every other embedding.
# The result has one row and one column per sentence, so for tens of thousands of
# sentences it occupies several GB of RAM.
similarity_matrix = util.cos_sim(
    embeddings,
    embeddings,
)

In [6]:
# expect a square shape: one row and one column per input sentence
similarity_matrix.shape

torch.Size([34523, 34523])

### create sparse matrix

In [15]:
# Run a full garbage collection to release unreachable objects before the next
# memory-heavy step; the value displayed is gc.collect()'s return value.
# NOTE: no variable is deleted here, so objects that are still referenced
# (e.g. `embeddings`, `similarity_matrix`) stay in memory — `del` them first if they are no longer needed.
gc.collect()

1006

In [27]:
# Similarity cut-off: only pairs scoring strictly above it are kept in the sparse matrix.
threshold = 0.47
# Boolean tensor, True where the pairwise similarity exceeds the cut-off.
mask = similarity_matrix.gt(threshold)

In [28]:
# Build the COO matrix directly from the entries selected by `mask` instead of
# multiplying two dense arrays first: the dense product would allocate another full
# n x n float array only to discard most of it again.
# For a positive threshold the stored entries are the same as with the dense product
# (every selected value is > 0, so nothing is dropped or added).
dense_similarities = similarity_matrix.numpy()  # view on the tensor's memory, no copy
keep_rows, keep_cols = np.nonzero(mask.numpy())  # row-major coordinates of kept pairs
sparse_matrix = sps.coo_matrix(
    (dense_similarities[keep_rows, keep_cols], (keep_rows, keep_cols)),
    shape=dense_similarities.shape,
)

### save matrix

In [29]:
# Write the thresholded similarity matrix under ../../app/src in SciPy's .npz format.
save_npz(
    file="../../app/src/similarities.npz",
    matrix=sparse_matrix,
)

### load matrix

In [18]:
# Read the stored sparse similarity matrix back from disk to verify the round trip.
loaded_sparse_matrix = load_npz(file="../../app/src/similarities.npz")

In [19]:
# print the stored (row, col) -> similarity entries
# NOTE(review): the recorded output contains values below the current threshold of 0.47
# (e.g. ~0.40), so it was presumably produced from a file saved with a lower threshold —
# re-run the notebook top to bottom to refresh it.
print(loaded_sparse_matrix)

  (0, 0)	1.0
  (0, 36)	0.4761685
  (0, 51)	0.51603603
  (0, 70)	0.45514828
  (0, 104)	0.45296735
  (0, 115)	0.45514226
  (0, 123)	0.41041362
  (0, 134)	0.41039714
  (0, 143)	0.53401893
  (0, 144)	0.414064
  (0, 198)	0.4214546
  (0, 204)	0.41871434
  (0, 215)	0.58279383
  (0, 223)	0.4570006
  (0, 224)	0.49696803
  (0, 244)	0.42374802
  (0, 257)	0.41315815
  (0, 278)	0.4374295
  (0, 287)	0.41041365
  (0, 291)	0.41458076
  (0, 322)	0.45986035
  (0, 406)	0.4420184
  (0, 414)	0.41622907
  (0, 419)	0.43624574
  (0, 424)	0.52522236
  :	:
  (34522, 32383)	0.40193433
  (34522, 32538)	0.46140018
  (34522, 32799)	0.40708384
  (34522, 32850)	0.4127875
  (34522, 32871)	0.42146295
  (34522, 32899)	0.45997784
  (34522, 32964)	0.49078763
  (34522, 32968)	0.40979627
  (34522, 32974)	0.40607744
  (34522, 33004)	0.4160214
  (34522, 33062)	0.4076392
  (34522, 33111)	0.4019813
  (34522, 33133)	0.46026075
  (34522, 33197)	0.4334693
  (34522, 33312)	0.4359737
  (34522, 33331)	0.4110967
  (34522, 33530)	0.413

In [6]:
# Expand the sparse matrix back into a full NumPy array for inspection.
# NOTE: toarray() materialises every entry, zeros included — for the 34523 x 34523 float32
# matrix shown in this notebook that is roughly 4.8 GB, so only do this when enough RAM is available.
dense_matrix = loaded_sparse_matrix.toarray()
dense_matrix

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.45697343, 0.        ,
        0.        ],
       [0.        , 0.        , 1.0000002 , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.45697355, 0.        , ..., 1.0000001 , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.0000002 ]], dtype=float32)