## **Dependencies**

In [22]:
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
from scipy.spatial.distance import cosine
import pandas as pd
import numpy as np

## **Load Data**

In [23]:
df = pd.read_csv("../../src/merged_orgas.csv")
df.head(3)

Unnamed: 0,iati_id,iati_orga_id,orga_abbreviation,orga_full_name,client,title_en,title_other,title_main,organization,country_code,...,actual_end,last_update,crs_5_code,crs_5_name,crs_3_code,crs_3_name,docs,title_and_description,sgd_pred_code,sgd_pred_str
0,DE-1-201420207,DE-1,giz,Deutsche Gesellschaft für Internationale Zusam...,BMZ,Environmental and climate governance,Umwelt- und Klimagovernance,Environmental and climate governance,Bundesministerium für wirtschaftliche Zusammen...,['MA'],...,2022-06-30T00:00:00Z,2024-02-29T00:00:00Z,41010;,Environmental policy and administrative manage...,410;,General Environment Protection;,['https://www.giz.de/projektdaten/projects.act...,"Environmental and climate governance. Public, ...",9,"8 Goal 9. Build resilient infrastructure, p..."
1,DE-1-201516970-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,BMZ,Acquisition of a chateau d´eau for water stock,Anschaffung eines Wasserturms zur Wasserspeich...,Acquisition of a chateau d´eau for water stock,Bundesministerium für wirtschaftliche Zusammen...,['SN'],...,2022-06-30T00:00:00Z,2024-03-06T00:00:00Z,14030;,Basic drinking water supply and basic sanitati...,140;,Water Supply & Sanitation;,"['https://www.bmz.de/de/laender/senegal', 'htt...",Acquisition of a chateau d´eau for water stock...,6,5 Goal 6. Ensure availability and sustainab...
2,DE-1-201601228-1705,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,BMZ,Multisectoral food and nutrition security for ...,Multisektorale Ernährungssicherung für junge K...,Multisectoral food and nutrition security for ...,Bundesministerium für wirtschaftliche Zusammen...,['MW'],...,2022-06-30T00:00:00Z,2024-03-06T00:00:00Z,31120;,Agricultural development;,311;,Agriculture;,"['https://www.bmz.de/de/laender/malawi', 'http...",Multisectoral food and nutrition security for ...,2,"1 Goal 2. End hunger, achieve food security..."


In [24]:
sample_df = df.sample(n=1000)
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 17430 to 34964
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   iati_id                1000 non-null   object
 1   iati_orga_id           1000 non-null   object
 2   orga_abbreviation      1000 non-null   object
 3   orga_full_name         1000 non-null   object
 4   client                 1000 non-null   object
 5   title_en               980 non-null    object
 6   title_other            829 non-null    object
 7   title_main             1000 non-null   object
 8   organization           1000 non-null   object
 9   country_code           809 non-null    object
 10  country                809 non-null    object
 11  region                 191 non-null    object
 12  location               656 non-null    object
 13  description_en         962 non-null    object
 14  description_other      827 non-null    object
 15  description_main     

In [25]:
sample_df.columns

Index(['iati_id', 'iati_orga_id', 'orga_abbreviation', 'orga_full_name',
       'client', 'title_en', 'title_other', 'title_main', 'organization',
       'country_code', 'country', 'region', 'location', 'description_en',
       'description_other', 'description_main', 'status', 'planned_start',
       'actual_start', 'planned_end', 'actual_end', 'last_update',
       'crs_5_code', 'crs_5_name', 'crs_3_code', 'crs_3_name', 'docs',
       'title_and_description', 'sgd_pred_code', 'sgd_pred_str'],
      dtype='object')

## **Initiate Model**

### Load Model & Tokenizer

In [26]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## **Calculate Embeddings**

### create embeddings data frame

In [32]:
embedding_df = pd.DataFrame()
embedding_df["iati_id"] = sample_df["iati_id"]
embedding_df["embedding"] = list

embedding_df.head(3)

Unnamed: 0,iati_id,embedding
17430,DE-1-201921238,<class 'list'>
1862,DE-1-201606003-1415,<class 'list'>
15509,DE-1-200865113,<class 'list'>


### calculate embeddings for all descriptions

In [30]:
# Function to get embeddings
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)

    return outputs.pooler_output.squeeze().detach().numpy()

In [33]:
for index, row in tqdm(sample_df.iterrows(), total=sample_df.shape[0]):
    try:
        embedding_df.at[index, "embedding"] = get_embedding(row["title_and_description"])
    except Exception as e:
        print(f"Error processing index {index}: {e}")


100%|██████████| 1000/1000 [03:25<00:00,  4.87it/s]


In [34]:
embedding_df.head(3)

Unnamed: 0,iati_id,embedding
17430,DE-1-201921238,"[-0.96990985, -0.7947024, -0.9995594, 0.939018..."
1862,DE-1-201606003-1415,"[-0.78278816, -0.66643614, -0.99546945, 0.7873..."
15509,DE-1-200865113,"[-0.60512286, -0.63203555, -0.9918106, 0.69386..."


### export embeddings as csv

In [36]:
embedding_df.to_csv("../../src/transformer/embedding.csv", index=False) 

## **Calculate Similarities**

### calculate matrix

In [39]:
n = len(embedding_df)
similarity_matrix = np.zeros((n, n))

for i in tqdm(range(n), desc="Calculating similarities"):
    for j in range(n):
        if i != j:
            emb_i = embedding_df.iloc[i]['embedding']
            emb_j = embedding_df.iloc[j]['embedding']
            # Calculate the similarity
            similarity_matrix[i, j] = 1 - cosine(emb_i, emb_j)
        else:
            similarity_matrix[i, j] = 1  # Assuming maximum similarity with itself

# Convert the similarity matrix to a DataFrame for better readability
similarity_df = pd.DataFrame(similarity_matrix, columns=embedding_df['iati_id'], index=embedding_df['iati_id'])

Calculating similarities: 100%|██████████| 1000/1000 [02:39<00:00,  6.28it/s]


In [42]:
similarity_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, DE-1-201921238 to XM-DAC-46004-54430-001-LN8434
Columns: 1000 entries, DE-1-201921238 to XM-DAC-46004-54430-001-LN8434
dtypes: float64(1000)
memory usage: 7.6+ MB


### export similarities as csv

In [43]:
similarity_df.to_csv("../../src/transformer/similarities.csv", index=False) 