# **SDG Prediction**

## **Dependencies**

In [30]:
from transformers import pipeline
import pandas as pd
from tqdm import tqdm

## **SDG Classifier**

### Load Model

The model only predicts the first 15 SDGs.

https://huggingface.co/jonas/sdg_classifier_osdg

In [31]:
# Hugging Face text-classification pipeline used for all SDG predictions below.
# NOTE(review): the markdown link above points to `jonas/sdg_classifier_osdg`,
# while the checkpoint loaded here is `jonas/roberta-base-finetuned-sdg` —
# confirm which model card is the intended reference.
sdg_model_name = "jonas/roberta-base-finetuned-sdg"
pipe = pipeline(task="text-classification", model=sdg_model_name)

### Load CSV

In [32]:
# Read the transformed BMZ table (relative path) into the working frame `df`.
bmz_csv_path = "../../../src/transformed/transformed_bmz.csv"
df = pd.read_csv(bmz_csv_path)

# Preview a single row to keep the cell output small.
df.head(n=1)

Unnamed: 0,iati_id,iati_orga_id,orga_abbreviation,orga_full_name,title_en,title_other,title_main,organization,country_code,country,...,planned_end,actual_end,last_update,crs_5_code,crs_5_name,crs_3_code,crs_3_name,docs,sgd_pred_code,sgd_pred_str
0,DE-1-198670044,DE-1,bmz,Bundesministerium für wirtschaftliche Zusammen...,,Studien- und Fachkräftefonds I; Fonds d&apos;E...,Studien- und Fachkräftefonds I; Fonds d&apos;E...,Bundesministerium für wirtschaftliche Zusammen...,['CD'],CD;,...,2026-12-31T00:00:00Z,2024-01-31T00:00:00Z,2024-01-31T00:00:00Z,43010;,Multisector aid;,430;,Other Multisector;,['https://www.kfw-entwicklungsbank.de/Internat...,9,"8 Goal 9. Build resilient infrastructure, p..."


### Load SDG CSV

In [33]:
# Read the SDG code list; the `code` and `name` columns are used further down
# to translate a predicted goal number into its goal title.
sdg_codelist_path = "../../../src/codelists/sdg_goals.csv"
sdg_df = pd.read_csv(sdg_codelist_path)

# Preview a single row to keep the cell output small.
sdg_df.head(n=1)

Unnamed: 0,code,name,description,language,category,category-name,category-description
0,1,Goal 1. End poverty in all its forms everywhere,,en,,,


### Apply Model

In [None]:
# Predict one SDG per row from `description_main` and store the numeric code
# and the goal name on `df`.
#
# The output columns keep their existing "sgd_" spelling because the input CSV
# already carries columns with these names.

# Placeholder written for rows without a usable prediction (same string
# sentinel the columns were initialised with before).
MISSING_PREDICTION = "NaN"
# Descriptions are cut to this many characters before being classified.
MAX_DESCRIPTION_CHARS = 512

df["sgd_pred_code"] = MISSING_PREDICTION
df["sgd_pred_str"] = MISSING_PREDICTION

len_df = len(df)

# Build the code -> goal-name lookup once instead of filtering `sdg_df` on
# every row; `.get` below then always yields a scalar name (the previous
# `.loc[...]` lookup stored a whole Series inside a single cell).
sdg_name_by_code = dict(zip(sdg_df["code"], sdg_df["name"]))

# tqdm already reports progress, so no manual "every 500 rows" print is needed.
for index, descr_row in tqdm(df["description_main"].items(), total=len_df, desc="Processing"):
    # Missing descriptions come back from pandas as float NaN; anything that
    # is not a string keeps the placeholder set above.
    if not isinstance(descr_row, str):
        continue

    descr_row = descr_row[:MAX_DESCRIPTION_CHARS]
    try:
        # `truncation=True` additionally caps the input at the tokenizer's
        # maximum length in case the character cut is not short enough.
        pred = pipe(descr_row, truncation=True)
        pred_int = int(pred[0]["label"])

        # `.at` writes straight into `df`; chained `df[col][index] = ...`
        # assignment may write to a temporary copy instead.
        df.at[index, "sgd_pred_code"] = pred_int
        df.at[index, "sgd_pred_str"] = sdg_name_by_code.get(pred_int, MISSING_PREDICTION)
    except Exception as e:
        # Best effort: report the failing description and continue.
        print(f"{e}: {descr_row}")

df.head()

### With Batch processing

>> Not faster!

In [None]:
# Batched variant: each group of `batch_size` rows is sent to the pipeline in
# a single call (the previous version still called `pipe` once per row, so no
# batching actually took place).

# Placeholder written for rows without a usable prediction.
MISSING_PREDICTION = "NaN"
# Descriptions are cut to this many characters before being classified.
MAX_DESCRIPTION_CHARS = 512

df["sgd_pred_code"] = MISSING_PREDICTION
df["sgd_pred_str"] = MISSING_PREDICTION

batch_size = 8
# Ceiling division: the last batch may hold fewer than `batch_size` rows.
n_batches = len(df) // batch_size + (len(df) % batch_size > 0)

# Build the code -> goal-name lookup once instead of filtering `sdg_df` twice
# per row.
sdg_name_by_code = dict(zip(sdg_df["code"], sdg_df["name"]))

for batch_n in tqdm(range(n_batches), desc="Processing batches"):
    batch_start = batch_n * batch_size
    batch_end = batch_start + batch_size
    batch_descr = df["description_main"].iloc[batch_start:batch_end]

    # Keep only rows that hold text (missing values are float NaN) and cut
    # each description to the character limit.
    batch_items = [
        (index, descr_row[:MAX_DESCRIPTION_CHARS])
        for index, descr_row in batch_descr.items()
        if isinstance(descr_row, str)
    ]
    if not batch_items:
        continue

    batch_index = [index for index, _ in batch_items]
    batch_texts = [text for _, text in batch_items]

    try:
        # One pipeline call per batch; `truncation=True` caps each input at
        # the tokenizer's maximum length.
        preds = pipe(batch_texts, batch_size=batch_size, truncation=True)
    except Exception as e:
        # Best effort: a failing batch keeps its placeholders and the run
        # continues with the next batch.
        print(f"Error in batch {batch_n + 1}/{n_batches} (indices {batch_index}): {e}")
        continue

    for index, pred in zip(batch_index, preds):
        try:
            pred_int = int(pred["label"])
            df.at[index, "sgd_pred_code"] = pred_int
            df.at[index, "sgd_pred_str"] = sdg_name_by_code.get(pred_int, MISSING_PREDICTION)
        except Exception as e:
            print(f"Error at index {index}: {e}")