In [3]:
import os
import time
import deepl
import re as re
import numpy as np
import pandas as pd
import seaborn as sns
import translators as ts
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker 
# check 
%matplotlib inline
import numbers
import string
from langdetect import detect
from tabulate import tabulate
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from pandas.errors import ParserError
from langdetect import DetectorFactory
from stop_words import get_stop_words

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder #maybe dont' need
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, classification_report, hamming_loss, accuracy_score




df = pd.read_csv("grant_dataset_6nov_newheader.csv", dtype=object)  #reading in the data 
print("shape of loaded dataset : ", df.shape)
# Translation - 2 steps

#1. langauge detection
#2. if language not english = save and translate
#We want to translate 3 text types = Titles, Descriptions and ("Receiver + Title")

#_ = ts.preaccelerate_and_speedtest() #preacceleration step for translation
#1 Filter out grant entities with insufficient data
#2 Tranlsation



#1 filter out bad grant entities

def fun_remove_lines2(column):
    """
    This function returns a mask where True indicates the presence of valid data in the column,
    and False indicates missing or invalid data.
    """
    
    mask = ~column.isna() & (column != "#NA") & (column != "-") & (column != "Årstal") & (column.astype(str).str.strip() != "")
    return mask

def higher_remove_function2(amount, year, receiver_1, receiver_name, receiver_title, receiver_inst, descr, title):

    """
    This function incorporates the mask from remove_lines function, and keep lines that have documented:
    1. Grant size and year
    2. A receiver (name, organisation name, profession or institution)
    3. A project description or title (Title, description, or combined text of receiver and title)
    
    it returns two lists, of kept (keep_lines) and removed lines (remove_lines)
    """
    
    amount_mask = fun_remove_lines2(amount)
    year_mask = fun_remove_lines2(year)
    rec_mask = fun_remove_lines2(receiver_1)
    rec_name_mask = fun_remove_lines2(receiver_name)
    rec_title_mask = fun_remove_lines2(receiver_title)
    rec_inst_mask = fun_remove_lines2(receiver_inst)
    descrip_mask = fun_remove_lines2(descr)
    title_mask = fun_remove_lines2(title)
    

    grant_condition = amount_mask & year_mask
    receiver_condition = rec_mask | rec_name_mask | rec_title_mask | rec_inst_mask 
    project_info_condition = descrip_mask | title_mask | rec_title_mask 
    

    keep_mask = grant_condition & receiver_condition & project_info_condition 
    keep_lines2 = list(keep_mask[keep_mask].index)
    remove_lines2 = list(keep_mask[~keep_mask].index)
    
    return keep_lines2, remove_lines2


def detection_fun(masked_list):
    """
    this function takes a list of text strings as input, 
    detects the language of the string, 
    and saves it to a list returned
    """
    detection_list = []
    for num, i in enumerate(masked_list):

        if i != "18+" and i != "34756600" and pd.notna(i):
            d = detect(i) 
            detection_list.append(d)
        else:
            detection_list.append(i) 
        if num % 100 == 0:
            print(num)
    return detection_list

def translater_function(column_name, langdetect_list):
    """
    This function takes a series object with text info that is to be translated. 
    It translates the text and saves the data to a new series object called translated_description
    """
    translated_description_full = []
    not_translated = []
    index = []

    for num, i in enumerate(column_name):
        if num % 100 == 0:
            print(num)
        
        if langdetect_list[num] != "en":
            try:
                item = ts.translate_text(query_text=i, if_use_preacceleration = True, if_ignore_empty_query=True)
                translated_description_full.append(item)

            except Exception as e:
                print(f"Error translating: {e}")
                translated_description_full.append(i)
                not_translated.append(i)
                index.append(num)
                print(num, i)
        else:
            translated_description_full.append(i)
                
    return translated_description_full, not_translated, index


#functions adapted from: https://www.kaggle.com/code/abdmental01/text-preprocessing-nlp-steps-to-process-text

def removing_html_tags(text):
    """This function removes HTML tags in a given string input (text)"""
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', str(text))

def removing_url(text):
    """This function removes URL's in a given string input (text)"""
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

punc = string.punctuation  
def removing_string_punc(text):
    """This function removes string punctuations in a given string input (text)"""
    return text.translate(str.maketrans('', '', punc))

stop_words = get_stop_words('en') #initiating variable with english stop words
def remove_stopwords(text):
    """This function removes stop words from english vocabulary in a given string input (text)"""
    new_text = []
    for word in text.split():
        if word in stop_words:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

def data_processing_function(list_of_interest):
    """This function takes a list or series as input, and applies each datapre-processing steps defined before,
    and return the processed list"""
    
    nl = list_of_interest.astype(str).str.lower()  #Lowercase formatting
    nl_html = nl.apply(removing_html_tags) #removing HTMLs
    nl_url = nl_html.apply(removing_url) #removing URLs
    nl_sp = nl_url.apply(removing_string_punc) #removing string punctuation: '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    nl_stopword = nl_sp.apply(remove_stopwords) #removing stop words
    return nl_stopword


def non_translated_rows(non_translated_list):
    """
    This function takes the "non_translated_list" output from the translater_function and prints and 
    saves rows that were not translated with the Translator module. 
    
    input : non_translated_list
    
    return : list of strings + index that where not translated
    
    """
    non_translated_list = pd.Series(non_translated_list) 
    mask_natitle = pd.notna(non_translated_list)
    count_title = 0
    non_translated = non_translated_list[mask_natitle]
    for i in non_translated:
        s = len(i)
        count_title += s

    print("Rows Not tranlated with Translator module : ", len(non_translated))
    print("Characters Not tranlated with Translator module : ", count_title)
    return non_translated



keeper, remover = higher_remove_function2(df['Grant_size_(DKK)'], df['Year'], df['Receiver'], df['Receiver_Name'], df['Receiver_Title'], df['Institution'], df['Description'], df['Title'])

print("Number of lines kept :", len(keeper))
print("Number of lines removed :", len(remover))


df.loc[remover].to_csv('removed_rows6th_nov.csv', index=True)
na_removed_df = df.loc[remover]  #new df
na_removed_df = na_removed_df.reset_index() 

#Saving kept lines for new dataframe
df = df.loc[keeper]  #new df
df = df.reset_index(drop = True)  #reset index (kept old index for now as new column)




#Language detect function for fast translation step 
detected_descriptionlist = detection_fun(df["Description"])
de_title = detection_fun(df["Title"])
de_receiver_title = detection_fun(df["Receiver_Title"])

#Lower convertion of Strings
df["Description"] = df["Description"].str.lower()  
df["Title"] = df["Title"].str.lower()
df["Receiver_Title"] = df["Receiver_Title"].str.lower()

#Translation of Description, Title and Receiver_Title
translated_description_list, not_tra_desc, ind_desc = translater_function(df["Description"], detected_descriptionlist)  #translation of description
translated_title_list, not_tra_title, ind_title = translater_function(df["Title"], de_title)  #translation of title
translated_rec_n_title_list, not_tra_rec_dec, ind_rec_dec = translater_function(df["Receiver_Title"], de_receiver_title)  #translation of rec + title list

#all translations are saved to this list - except for few rows that failed - later translated with DeepL - API required
translated_list = pd.DataFrame(
    {'desc': translated_description_list,
     'title': translated_title_list,
     'rec_title': translated_rec_n_title_list
    })

#saving to csv to save results
translated_list.to_csv("translated_lists_xth_month.csv")  #saving to csv to save results


#checking number of rows that where not translated in the 3 lists

no_translation_description = non_translated_rows(not_tra_desc)
no_translation_title = non_translated_rows(not_tra_title)
no_translation_rec_title = non_translated_rows(not_tra_rec_dec)



#The DeepL module offers 500.000 free characters translated pr. month, 
#and therefore it's enough to translate the remaining

#it's only in the "Description" list we have untranslated 4816 rows, so we introduce deep L translator
translator = deepl.Translator("caae833d-76d5-40f6-a1a6-b7badb3abd91:fx")


##test - remove hash if deepL is not working, to test API
result = translator.translate_text("Hello, world!", target_lang="FR")
print(result)

#only run once!!
#translate untranslated descrptions: 

deep_l_translations = []
for i in no_translation_description:
    results = translator.translate_text(i, target_lang="EN-US")  #applying EN-US english 
    deep_l_translations.append(results.text)

deepl_arr = np.array(deep_l_translations)

translated_desc = np.array(translated_description_list) #make copy

index_arr = np.array(ind_desc)
index_no_na = index_arr[mask_natitle]  #using the masking list of nan lines from before
print(len(index_no_na))  #Index file
print(len(deepl_arr)) #translated file
translated_desc[index_no_na]=deepl_arr


df.insert(13, "Translated_descriptions", translated_desc, True)  #only run once!
df.insert(11, "Translated_title", translated_title_list, True) #only run once!
df.insert(12, "Translated_receiver_title", translated_rec_n_title_list, True) #only run once!

#Data processing step
df["Translated_descriptions_pro"] = data_processing_function(df["Translated_descriptions"])
df["Translated_title_pro"] = data_processing_function(df["Translated_title"])
df["Translated_receiver_title_pro"] = data_processing_function(df["Translated_receiver_title"])

df.to_csv("df_translated_proc_19thnov.csv")

shape of loaded dataset :  (37380, 16)
Number of lines kept : 37159
Number of lines removed : 221
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800


6500
Error translating: 
6524 nan
Error translating: 
6549 i svanninge bjerge arbejder vi målrettet for verdensmål nr. 15 (livet på land, herunder stoppe tabet af biodiversitet), ligesom vi indtænker co2-absorption i forvaltningen. i 2023 blev vi tildelt den mest omfattende økosystemtjenestecertificering for biodiversitet (esc), der endnu er lavet i danmark. med certificeringen har vi opnået en blåstempling af vores arbejde, ligesom det giver os en mulighed for at kvalitetssikre vores nuværende og fremtidige biodiversitetsfremmende indsats. certificeringen er sket i samarbejde med hededanmark og fsc danmark, og var mulig at gennemføre, fordi vi står på et solidt fundament af biodiversitets-fremmende arbejde siden 2005/2006. i den kommende periode arbejder vi videre med udvikling af det strategiske sigte for biodiversitetsindsatsen. formålet med forvaltningen af svanninge bjerge er: at fremme biodiversiteten, at danne udgangspunkt for et fysisk laboratorium i natur til et godt liv og at

Error translating: 
7470 nan
Error translating: 
7474 nan
Error translating: 
7476 nan
Error translating: 
7478 nan
Error translating: 
7489 nan
Error translating: 
7490 nan
Error translating: 
7492 nan
Error translating: 
7493 nan
Error translating: 
7494 nan
Error translating: 
7495 nan
Error translating: 
7496 nan
Error translating: 
7498 nan
7500
Error translating: 
7508 nan
Error translating: 
7542 nan
Error translating: 
7543 nan
Error translating: 
7546 nan
Error translating: 
7562 nan
Error translating: 
7565 nan
Error translating: 
7569 nan
Error translating: 
7575 nan
Error translating: 
7597 nan
Error translating: 
7598 nan
Error translating: 
7599 nan
7600
Error translating: 
7601 nan
Error translating: 
7602 nan
Error translating: 
7604 nan
Error translating: 
7605 nan
Error translating: 
7612 nan
Error translating: 
7628 nan
Error translating: 
7629 nan
Error translating: 
7634 nan
Error translating: 
7636 nan
Error translating: 
7637 nan
Error translating: 
7639 nan
Erro

15900
16000
16100
16200
16300
Error translating: 
16343 nan
Error translating: 
16344 nan
Error translating: 
16345 nan
Error translating: 
16346 nan
Error translating: 
16347 nan
Error translating: 
16348 nan
Error translating: 
16349 nan
Error translating: 
16350 nan
Error translating: 
16351 nan
Error translating: 
16352 nan
Error translating: 
16353 nan
Error translating: 
16354 nan
Error translating: 
16355 nan
Error translating: 
16356 nan
Error translating: 
16357 nan
Error translating: 
16358 nan
Error translating: 
16359 nan
Error translating: 
16360 nan
Error translating: 
16361 nan
Error translating: 
16362 nan
Error translating: 
16363 nan
Error translating: 
16364 nan
Error translating: 
16365 nan
Error translating: 
16366 nan
Error translating: 
16367 nan
Error translating: 
16368 nan
Error translating: 
16369 nan
Error translating: 
16370 nan
Error translating: 
16371 nan
Error translating: 
16372 nan
Error translating: 
16373 nan
Error translating: 
16374 nan
Error tran

28600
28700
28800
28900
29000
29100
29200
29300
29400
29500
29600
29700
29800
29900
30000
30100
30200
30300
30400
30500
30600
30700
30800
30900
31000
31100
31200
31300
31400
31500
31600
31700
31800
31900
32000
32100
32200
32300
32400
32500
32600
32700
32800
32900
33000
33100
33200
33300
33400
Error translating: 'translations'
33484 vi vil være det sikre område
33500
33600
33700
33800
33900
34000
34100
34200
34300
34400
34500
34600
34700
34800
34900
35000
35100
35200
35300
35400
35500
35600
35700
35800
35900
36000
36100
36200
36300
36400
Error translating: 'translations'
36425 journalistikkens grønne omstilling: den konstruktive vej til klimamotivation 
Error translating: 'translations'
36442 indkøb af coastal både 
Error translating: 'translations'
36477 korspilstræf 24 
Error translating: 'translations'
36487 bogudgivelse: tanker om krig og fred 
Error translating: 'translations'
36494 indkøb af trædrejebænk, der kan betjenes af kørestolsbrugere 
36500
36600
36700
36800
36900
37000
37

6400
Error translating: 
6498 nan
Error translating: 
6499 nan
6500
Error translating: 
6500 nan
Error translating: 
6501 nan
Error translating: 
6502 nan
Error translating: 
6503 nan
Error translating: 
6504 nan
Error translating: 
6505 nan
Error translating: 
6506 nan
Error translating: 
6507 nan
Error translating: 
6508 nan
Error translating: 
6509 nan
Error translating: 
6510 nan
Error translating: 
6511 nan
Error translating: 
6512 nan
Error translating: 
6513 nan
Error translating: 
6514 nan
Error translating: 
6515 nan
Error translating: 
6516 nan
Error translating: 
6517 nan
Error translating: 
6518 nan
Error translating: 
6519 nan
Error translating: 
6520 nan
Error translating: 
6521 nan
Error translating: 
6522 nan
Error translating: 
6523 nan
Error translating: 
6524 nan
Error translating: 
6525 nan
Error translating: 
6526 nan
Error translating: 
6527 nan
Error translating: 
6528 nan
Error translating: 
6529 nan
Error translating: 
6530 nan
Error translating: 
6531 nan
Erro

13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
Error translating: 503 Server Error: Service Unavailable for url: https://www.bing.com/ttranslatev3?isVertical=1&&IG=4911F615EFC34B97B66021EF6FA2C53A&IID=translator.5026
14730 a diamond-encased bose-einstein condensate 
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
Error translating: 
15788 nan
Error translating: 
15789 nan
Error translating: 
15790 nan
Error translating: 
15791 nan
Error translating: 
15792 nan
Error translating: 
15793 nan
Error translating: 
15794 nan
Error translating: 
15795 nan
Error translating: 
15796 nan
Error translating: 
15797 nan
Error translating: 
15798 nan
Error translating: 
15799 nan
15800
Error translating: 
15800 nan
Error translating: 
15801 nan
Error translating: 
15802 nan
Error translating: 
15803 nan
Error translating: 
15804 nan
Error translating: 
15805 nan
Error translating: 
15806 nan
Error translating: 
15807 nan
Error translating: 
15808 nan
Error transl

16200
16300
Error translating: 
16343 nan
Error translating: 
16344 nan
Error translating: 
16345 nan
Error translating: 
16346 nan
Error translating: 
16347 nan
Error translating: 
16348 nan
Error translating: 
16349 nan
Error translating: 
16350 nan
Error translating: 
16351 nan
Error translating: 
16352 nan
Error translating: 
16353 nan
Error translating: 
16354 nan
Error translating: 
16355 nan
Error translating: 
16356 nan
Error translating: 
16357 nan
Error translating: 
16358 nan
Error translating: 
16359 nan
Error translating: 
16360 nan
Error translating: 
16361 nan
Error translating: 
16362 nan
Error translating: 
16363 nan
Error translating: 
16364 nan
Error translating: 
16365 nan
Error translating: 
16366 nan
Error translating: 
16367 nan
Error translating: 
16368 nan
Error translating: 
16369 nan
Error translating: 
16370 nan
Error translating: 
16371 nan
Error translating: 
16372 nan
Error translating: 
16373 nan
Error translating: 
16374 nan
Error translating: 
16375 na

25700 nan
Error translating: 
25701 nan
Error translating: 
25702 nan
Error translating: 
25703 nan
Error translating: 
25704 nan
Error translating: 
25705 nan
Error translating: 
25706 nan
Error translating: 
25707 nan
Error translating: 
25708 nan
Error translating: 
25709 nan
Error translating: 
25710 nan
Error translating: 
25711 nan
Error translating: 
25712 nan
Error translating: 
25713 nan
Error translating: 
25714 nan
Error translating: 
25715 nan
Error translating: 
25716 nan
Error translating: 
25717 nan
Error translating: 
25718 nan
Error translating: 
25719 nan
Error translating: 
25720 nan
Error translating: 
25721 nan
Error translating: 
25722 nan
Error translating: 
25723 nan
Error translating: 
25724 nan
Error translating: 
25725 nan
Error translating: 
25726 nan
Error translating: 
25727 nan
Error translating: 
25728 nan
Error translating: 
25729 nan
Error translating: 
25730 nan
Error translating: 
25731 nan
Error translating: 
25732 nan
Error translating: 
25733 nan


6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
Error translating: 'translations'
7418 på vej
Error translating: 'translations'
7425 rialto:dokumentation
Error translating: 'translations'
7440 dansk folkehjælps julehjælp
7500
7600
Error translating: 
7661 nan
Error translating: 
7662 nan
Error translating: 
7663 nan
Error translating: 
7664 nan
Error translating: 
7665 nan
Error translating: 
7666 nan
Error translating: 
7667 nan
Error translating: 
7668 nan
Error translating: 
7669 nan
Error translating: 
7670 nan
Error translating: 
7671 nan
Error translating: 
7672 nan
Error translating: 
7673 nan
Error translating: 
7674 nan
Error translating: 
7675 nan
Error translating: 
7676 nan
Error translating: 
7677 nan
Error translating: 
7678 nan
Error translating: 
7679 nan
Error translating: 
7680 nan
Error translating: 
7681 nan
Error translating: 
7682 nan
Error translating: 
7683 nan
Error translating: 
7684 nan
Error translating: 
7685 nan
Error translating: 
7686 nan
Error transla

16400
16500
Error translating: 
16526 nan
Error translating: 
16527 nan
Error translating: 
16528 nan
Error translating: 
16529 nan
Error translating: 
16530 nan
Error translating: 
16531 nan
Error translating: 
16532 nan
Error translating: 
16533 nan
Error translating: 
16534 nan
Error translating: 
16535 nan
Error translating: 
16536 nan
Error translating: 
16537 nan
Error translating: 
16538 nan
Error translating: 
16539 nan
Error translating: 
16540 nan
Error translating: 
16541 nan
Error translating: 
16542 nan
Error translating: 
16543 nan
Error translating: 
16544 nan
Error translating: 
16545 nan
Error translating: 
16546 nan
Error translating: 
16547 nan
Error translating: 
16548 nan
Error translating: 
16549 nan
Error translating: 
16550 nan
Error translating: 
16551 nan
Error translating: 
16552 nan
Error translating: 
16553 nan
Error translating: 
16554 nan
Error translating: 
16555 nan
Error translating: 
16556 nan
Error translating: 
16557 nan
Error translating: 
16558 na

28400
28500
Error translating: 
28557 nan
Error translating: 
28558 nan
Error translating: 
28559 nan
Error translating: 
28560 nan
Error translating: 
28561 nan
Error translating: 
28562 nan
Error translating: 
28563 nan
Error translating: 
28564 nan
Error translating: 
28565 nan
Error translating: 
28566 nan
Error translating: 
28567 nan
Error translating: 
28568 nan
Error translating: 
28569 nan
Error translating: 
28570 nan
Error translating: 
28571 nan
Error translating: 
28572 nan
Error translating: 
28573 nan
Error translating: 
28574 nan
Error translating: 
28575 nan
Error translating: 
28576 nan
Error translating: 
28577 nan
Error translating: 
28578 nan
Error translating: 
28579 nan
Error translating: 
28580 nan
Error translating: 
28581 nan
Error translating: 
28582 nan
Error translating: 
28583 nan
Error translating: 
28584 nan
Error translating: 
28585 nan
Error translating: 
28586 nan
Error translating: 
28587 nan
Error translating: 
28588 nan
Error translating: 
28589 na

Rows Not tranlated with Translator module :  28
Characters Not tranlated with Translator module :  7386
Rows Not tranlated with Translator module :  1
Characters Not tranlated with Translator module :  43
Rows Not tranlated with Translator module :  3
Characters Not tranlated with Translator module :  53
Bonjour à tous !


NameError: name 'mask_nadesc' is not defined