## **Dependencies**

In [30]:
from transformers import BertTokenizer, BertModel
import torch
from scipy.spatial.distance import cosine
import pandas as pd
import numpy as np

## **Load Data**

In [31]:
df = pd.read_csv("../../src/merged_orgas.csv")
df.head(3)

Unnamed: 0,iati_id,iati_orga_id,orga_abbreviation,orga_full_name,client,title_en,title_other,title_main,organization,country_code,...,actual_end,last_update,crs_5_code,crs_5_name,crs_3_code,crs_3_name,docs,title_and_description,sgd_pred_code,sgd_pred_str
0,DE-1-201420207,DE-1,giz,Deutsche Gesellschaft für Internationale Zusam...,BMZ,Environmental and climate governance,Umwelt- und Klimagovernance,Environmental and climate governance,Bundesministerium für wirtschaftliche Zusammen...,['MA'],...,2022-06-30T00:00:00Z,2024-02-29T00:00:00Z,41010;,Environmental policy and administrative manage...,410;,General Environment Protection;,['https://www.giz.de/projektdaten/projects.act...,"Environmental and climate governance. Public, ...",9,"8 Goal 9. Build resilient infrastructure, p..."
1,DE-1-201516970-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,BMZ,Acquisition of a chateau d´eau for water stock,Anschaffung eines Wasserturms zur Wasserspeich...,Acquisition of a chateau d´eau for water stock,Bundesministerium für wirtschaftliche Zusammen...,['SN'],...,2022-06-30T00:00:00Z,2024-03-06T00:00:00Z,14030;,Basic drinking water supply and basic sanitati...,140;,Water Supply & Sanitation;,"['https://www.bmz.de/de/laender/senegal', 'htt...",Acquisition of a chateau d´eau for water stock...,6,5 Goal 6. Ensure availability and sustainab...
2,DE-1-201601228-1705,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,BMZ,Multisectoral food and nutrition security for ...,Multisektorale Ernährungssicherung für junge K...,Multisectoral food and nutrition security for ...,Bundesministerium für wirtschaftliche Zusammen...,['MW'],...,2022-06-30T00:00:00Z,2024-03-06T00:00:00Z,31120;,Agricultural development;,311;,Agriculture;,"['https://www.bmz.de/de/laender/malawi', 'http...",Multisectoral food and nutrition security for ...,2,"1 Goal 2. End hunger, achieve food security..."


In [25]:
sample_df = df.sample(n=100)
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 4148 to 29877
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   iati_id                1000 non-null   object
 1   iati_orga_id           1000 non-null   object
 2   orga_abbreviation      1000 non-null   object
 3   orga_full_name         1000 non-null   object
 4   client                 1000 non-null   object
 5   title_en               974 non-null    object
 6   title_other            799 non-null    object
 7   title_main             1000 non-null   object
 8   organization           1000 non-null   object
 9   country_code           816 non-null    object
 10  country                816 non-null    object
 11  region                 183 non-null    object
 12  location               626 non-null    object
 13  description_en         958 non-null    object
 14  description_other      794 non-null    object
 15  description_main      

In [26]:
sample_df.columns

Index(['iati_id', 'iati_orga_id', 'orga_abbreviation', 'orga_full_name',
       'client', 'title_en', 'title_other', 'title_main', 'organization',
       'country_code', 'country', 'region', 'location', 'description_en',
       'description_other', 'description_main', 'status', 'planned_start',
       'actual_start', 'planned_end', 'actual_end', 'last_update',
       'crs_5_code', 'crs_5_name', 'crs_3_code', 'crs_3_name', 'docs',
       'title_and_description', 'sgd_pred_code', 'sgd_pred_str'],
      dtype='object')

## **Model**

### Load Model & Tokenizer

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Functions

- getting embeddings
- calculating similarity

In [9]:
# Function to get embeddings
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)

    return outputs.pooler_output.squeeze()

# Calculate similarity
def calculate_similarity(text1, text2):
    embedding1 = get_embedding(text1)
    embedding2 = get_embedding(text2)
    # Ensure the embeddings are 1-D arrays
    embedding1 = embedding1.detach().numpy()
    embedding2 = embedding2.detach().numpy()
    # Calculate cosine similarity
    cos_sim = 1 - cosine(embedding1, embedding2)
    return cos_sim

In [18]:
# Example texts
text1 = "Diversification and improvement of sustainable small-scale aquaculture opportunities for rural communities in Pohnpei; Diversification and improvement of sustainable small-scale aquaculture opportunities for rural communities in Pohnpei"
text2 = "Through procurement of modern fishing technics as well as skills training, the social-economic status of fisher(wo)men in the region, especially of divorced or widowed fisherwomen who generally suffer from marginalisation within the community and society, is being improved. Based on higher and safer income, the fisher(wo)men benefit from better health. Through fishing skills trainings, the community spirit should be strengthened, reciprocal support among the community members should be enhanced. The project follows a sustainable approach: After use of modern fishing technics, the fisher(wo)men will generate bigger capture leading to higher incomes for the fisher(wo)men which they will then spend on maintenance of new fishing technics and procurement of more modern fishing technics."

# Calculate and print similarity
similarity = calculate_similarity(text1, text2)
print(f"Similarity: {similarity}")

Similarity: 0.9707517623901367


## **Run Model**

### calculate embeddings for all descriptions

In [29]:
sample_df['Embedding'] = sample_df['title_and_description'].apply(get_embedding)

RuntimeError: [enforce fail at C:\cb\pytorch_1000000000000\work\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 702768 bytes.

In [27]:
sample_df.head(3)

Unnamed: 0,iati_id,iati_orga_id,orga_abbreviation,orga_full_name,client,title_en,title_other,title_main,organization,country_code,...,actual_end,last_update,crs_5_code,crs_5_name,crs_3_code,crs_3_name,docs,title_and_description,sgd_pred_code,sgd_pred_str
4148,DE-1-201321512,DE-1,giz,Deutsche Gesellschaft für Internationale Zusam...,BMZ,Vocational Education &amp; Training and Employ...,Berufsbildungsreform und Beschäftigungsförderung,Vocational Education &amp; Training and Employ...,Bundesministerium für wirtschaftliche Zusammen...,['KG'],...,2022-06-30T00:00:00Z,2024-02-29T00:00:00Z,11330; 16020; 11120;,Vocational training; Employment creation; Educ...,113; 160; 111;,Secondary Education; Other Social Infrastructu...,['https://www.giz.de/projektdaten/projects.act...,Vocational Education &amp Training and Employ...,8,"7 Goal 8. Promote sustained, inclusive and ..."
1598,DE-1-201518158-0,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,BMZ,Support for Syrian Health Infrastructure in th...,Stärkung der syrischen Gesundheitsinfrastruktu...,Support for Syrian Health Infrastructure in th...,Bundesministerium für wirtschaftliche Zusammen...,['SY'],...,2022-06-30T00:00:00Z,2024-03-06T00:00:00Z,73010;,Immediate post-emergency reconstruction and re...,730;,Reconstruction Relief & Rehabilitation;,"['https://www.bmz.de/de/laender/syrien', 'http...",Support for Syrian Health Infrastructure in th...,3,2 Goal 3. Ensure healthy lives and promote ...
10495,DE-1-201021104,DE-1,giz,Deutsche Gesellschaft für Internationale Zusam...,BMZ,"Promotion of Microfinance in Central Asia, reg...",Förderung des Mikrofinanzwesens in Zentralasie...,"Promotion of Microfinance in Central Asia, reg...",Bundesministerium für wirtschaftliche Zusammen...,,...,2022-06-30T00:00:00Z,2024-02-29T00:00:00Z,24010; 25010;,Financial policy and administrative management...,240; 250;,Banking & Financial Services; Business & Other...,['https://www.giz.de/projektdaten/projects.act...,"Promotion of Microfinance in Central Asia, reg...",1,0 Goal 1. End poverty in all its forms ever...


In [None]:
n = len(sample_df)
similarity_matrix = np.zeros((n, n))

for i in range(n):
    for j in range(n):
        if i != j:
            similarity_matrix[i, j] = 1 - cosine(sample_df.iloc[i]['Embedding'], sample_df.iloc[j]['Embedding'])
        else:
            similarity_matrix[i, j] = 1  # Assuming maximum similarity with itself

# Convert the similarity matrix to a DataFrame for better readability
similarity_df = pd.DataFrame(similarity_matrix, columns=sample_df['title_and_description'], index=sample_df['title_and_description'])

similarity_df