In [None]:
import os
import time
import deepl
import re as re
import numpy as np
import pandas as pd
import seaborn as sns
import translators as ts
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker 
# check 
%matplotlib inline
import numbers
import string
from langdetect import detect
from tabulate import tabulate
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from pandas.errors import ParserError
from langdetect import DetectorFactory
from stop_words import get_stop_words

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder #maybe dont' need
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, classification_report, hamming_loss, accuracy_score




# Load the raw grant dataset, reading every column as object dtype.
dataset_path = "grant_dataset_6nov_newheader.csv"
df = pd.read_csv(dataset_path, dtype=object)
print("shape of loaded dataset : ", df.shape)
# Translation - 2 steps

#1. language detection
#2. if language not english = save and translate
#We want to translate 3 text types = Titles, Descriptions and ("Receiver + Title")

#_ = ts.preaccelerate_and_speedtest() #preacceleration step for translation
#1 Filter out grant entities with insufficient data
#2 Translation



#1 filter out bad grant entities

def fun_remove_lines2(column):
    """
    Build a boolean mask flagging which entries of ``column`` hold usable data.

    An entry is flagged True when it is not NaN, is not one of the placeholder
    values "#NA", "-" or "Årstal", and is not empty once converted to a string
    and stripped of surrounding whitespace. Every other entry is flagged False.
    """
    placeholder_values = ("#NA", "-", "Årstal")

    # Start from "not missing" and narrow the mask one placeholder at a time.
    is_valid = column.notna()
    for placeholder in placeholder_values:
        is_valid = is_valid & (column != placeholder)

    # Whitespace-only cells count as empty.
    stripped_text = column.astype(str).str.strip()
    return is_valid & (stripped_text != "")

def higher_remove_function2(amount, year, receiver_1, receiver_name, receiver_title, receiver_inst, descr, title):
    """
    Split row labels into rows to keep and rows to remove.

    Each argument is a column that is run through fun_remove_lines2 to obtain a
    validity mask. A row is kept only when all three requirements hold:
    1. Both the grant size (amount) and the year are valid.
    2. At least one receiver field is valid (receiver_1, receiver_name,
       receiver_title or receiver_inst).
    3. At least one project text is valid (descr, title or receiver_title).

    Returns a tuple (keep_lines, remove_lines) holding the index labels of the
    kept rows and of the removed rows, each as a list.
    """
    # Requirement 1: grant size and year must both be documented.
    has_grant_info = fun_remove_lines2(amount) & fun_remove_lines2(year)

    # The receiver-title mask is shared by requirements 2 and 3.
    has_receiver_title = fun_remove_lines2(receiver_title)

    # Requirement 2: any receiver field is enough.
    has_receiver = (
        fun_remove_lines2(receiver_1)
        | fun_remove_lines2(receiver_name)
        | has_receiver_title
        | fun_remove_lines2(receiver_inst)
    )

    # Requirement 3: any project text is enough.
    has_project_info = fun_remove_lines2(descr) | fun_remove_lines2(title) | has_receiver_title

    keep_mask = has_grant_info & has_receiver & has_project_info

    kept_rows = keep_mask[keep_mask]
    dropped_rows = keep_mask[~keep_mask]
    return list(kept_rows.index), list(dropped_rows.index)


def detection_fun(masked_list):
    """
    Detect the language of every text in ``masked_list``.

    Parameters:
        masked_list: iterable of text strings; entries may be NaN.

    Returns:
        A list with one entry per input, in input order. The entry is the
        language code returned by langdetect's ``detect``. When no language can
        be determined (NaN input, one of the known undetectable values, or
        ``detect`` raising LangDetectException) the original input value is
        stored instead, so the result always lines up with the input.

    The loop position is printed every 100 items as a progress indicator.
    """
    # Imported here so the function works without a new file-level import.
    from langdetect.lang_detect_exception import LangDetectException

    # Values that were already special-cased as undetectable.
    known_undetectable = ("18+", "34756600")

    detection_list = []
    for num, text in enumerate(masked_list):
        if pd.isna(text) or text in known_undetectable:
            detection_list.append(text)
        else:
            try:
                language = detect(text)
            except LangDetectException:
                # One undetectable string (e.g. digits or symbols only) must
                # not abort the whole run; fall back to the raw value.
                detection_list.append(text)
            else:
                detection_list.append(language)
        if num % 100 == 0:
            print(num)
    return detection_list

def translater_function(column_name, langdetect_list):
    """
    Translate every entry of ``column_name`` whose detected language is not "en".

    ``langdetect_list`` is indexed by position and must hold the detected
    language for the entry at the same position in ``column_name``.

    Returns a tuple of three lists:
    - the texts in input order, translated where translation succeeded and
      left unchanged otherwise,
    - the texts for which translation raised an exception,
    - the positions of those failed texts.

    The loop position is printed every 100 items as a progress indicator.
    """
    texts_out = []
    failed_texts = []
    failed_positions = []

    for position, text in enumerate(column_name):
        if position % 100 == 0:
            print(position)

        # Entries already detected as English are passed through untouched.
        if langdetect_list[position] == "en":
            texts_out.append(text)
            continue

        try:
            translated = ts.translate_text(query_text=text, if_use_preacceleration=True, if_ignore_empty_query=True)
            texts_out.append(translated)
        except Exception as e:
            # Best effort: keep the original text and record the failure.
            print(f"Error translating: {e}")
            texts_out.append(text)
            failed_texts.append(text)
            failed_positions.append(position)
            print(position, text)

    return texts_out, failed_texts, failed_positions


#functions adapted from: https://www.kaggle.com/code/abdmental01/text-preprocessing-nlp-steps-to-process-text

def removing_html_tags(text):
    """Return ``text`` converted to a string with every ``<...>`` tag stripped out."""
    as_text = str(text)
    return re.sub('<.*?>', '', as_text)

def removing_url(text):
    """Return ``text`` with http(s):// links and www. addresses deleted."""
    url_regex = r'https?://\S+|www\.\S+'
    return re.sub(url_regex, '', text)

punc = string.punctuation  # characters treated as punctuation


def removing_string_punc(text):
    """Return ``text`` with every character listed in ``punc`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(punc))
    return text.translate(deletion_table)

stop_words = get_stop_words('en')  # English stop words used by remove_stopwords


def remove_stopwords(text):
    """
    Return ``text`` with all English stop words removed.

    The text is split on whitespace, every word found in ``stop_words`` is
    dropped, and the remaining words are joined with single spaces. Dropped
    words leave no gap behind, so the result has no leading, trailing or
    doubled spaces.
    """
    # A set gives constant-time membership tests; it is rebuilt on each call so
    # later changes to ``stop_words`` are still honoured.
    blocked_words = frozenset(stop_words)
    return " ".join(word for word in text.split() if word not in blocked_words)

def data_processing_function(list_of_interest):
    """
    Apply all text pre-processing steps to a list or Series of texts.

    Parameters:
        list_of_interest: a pandas Series or a plain list of values. Values
            are converted to strings before processing.

    Returns:
        A pandas Series of processed strings. For Series input the index is
        preserved.

    Steps, in order: lowercase, remove HTML tags, remove URLs, remove string
    punctuation, remove English stop words.
    """
    # Wrapping in a Series lets plain lists through; a Series is passed on as is.
    texts = pd.Series(list_of_interest).astype(str).str.lower()

    processing_steps = (
        removing_html_tags,    # strip <...> tags
        removing_url,          # strip http(s):// and www. links
        removing_string_punc,  # strip string.punctuation characters
        remove_stopwords,      # strip English stop words
    )
    for step in processing_steps:
        texts = texts.apply(step)
    return texts


def non_translated_rows(non_translated_list):
    """
    Report and return the entries that the Translator module did not translate.

    input : non_translated_list, the list of failed texts returned by
            translater_function (may contain NaN entries)

    return : pandas Series of the non-NaN entries, keeping their position in
             the input as the index

    Prints the number of such entries and their total character count.
    """
    as_series = pd.Series(non_translated_list)

    # NaN entries are not real texts, so they are left out of the report.
    non_translated = as_series[as_series.notna()]
    character_total = sum(len(text) for text in non_translated)

    print("Rows Not tranlated with Translator module : ", len(non_translated))
    print("Characters Not tranlated with Translator module : ", character_total)
    return non_translated



# Split the row labels into rows with sufficient data (keeper) and the rest (remover).
keeper, remover = higher_remove_function2(
    amount=df['Grant_size_(DKK)'],
    year=df['Year'],
    receiver_1=df['Receiver'],
    receiver_name=df['Receiver_Name'],
    receiver_title=df['Receiver_Title'],
    receiver_inst=df['Institution'],
    descr=df['Description'],
    title=df['Title'],
)

print("Number of lines kept :", len(keeper))
print("Number of lines removed :", len(remover))

# Write the rejected rows to disk with their original index, then keep them in
# memory with that index moved into a column.
na_removed_df = df.loc[remover]
na_removed_df.to_csv('removed_rows6th_nov.csv', index=True)
na_removed_df = na_removed_df.reset_index()

# Continue with the kept rows only; drop=True discards the old index.
df = df.loc[keeper].reset_index(drop=True)




# Language detection runs first, on the text as loaded, so that only
# non-English entries are sent to the translator afterwards.
detected_descriptionlist, de_title, de_receiver_title = (
    detection_fun(df[text_column])
    for text_column in ("Description", "Title", "Receiver_Title")
)

# Lowercase the three text columns before translating them.
for text_column in ("Description", "Title", "Receiver_Title"):
    df[text_column] = df[text_column].str.lower()

# Translate Description, Title and Receiver_Title. Each call returns the
# translated texts, the texts that failed, and the positions of the failures.
translated_description_list, not_tra_desc, ind_desc = translater_function(
    df["Description"], detected_descriptionlist
)
translated_title_list, not_tra_title, ind_title = translater_function(
    df["Title"], de_title
)
translated_rec_n_title_list, not_tra_rec_dec, ind_rec_dec = translater_function(
    df["Receiver_Title"], de_receiver_title
)

# Collect all translations in one table. Rows that failed still hold their
# original text here; they are translated with DeepL later (API key required).
translated_list = pd.DataFrame(dict(
    desc=translated_description_list,
    title=translated_title_list,
    rec_title=translated_rec_n_title_list,
))

# Persist the intermediate result so the translation step need not be repeated.
translated_list.to_csv("translated_lists_xth_month.csv")


# Check how many rows were not translated in each of the three lists.
no_translation_description = non_translated_rows(not_tra_desc)
no_translation_title = non_translated_rows(not_tra_title)
no_translation_rec_title = non_translated_rows(not_tra_rec_dec)

# Per the original author's notes, DeepL offers 500.000 free translated
# characters per month, which is enough for the remaining texts, and only the
# "Description" list has untranslated rows (4816), so only that list is sent.
#
# The API key is read from the environment so that no credential lives in the
# source file. Any key that was ever committed here should be revoked.
deepl_auth_key = os.environ.get("DEEPL_AUTH_KEY")
if not deepl_auth_key:
    raise RuntimeError(
        "DeepL API key missing: set the DEEPL_AUTH_KEY environment variable."
    )
translator = deepl.Translator(deepl_auth_key)

# Smoke test of the DeepL connection; comment these two lines out once the API
# is known to work, since every call counts against the character quota.
result = translator.translate_text("Hello, world!", target_lang="FR")
print(result)

# Only run once: every call below consumes DeepL quota.
# Translate the descriptions that the Translator module could not handle.
deep_l_translations = []
for i in no_translation_description:
    results = translator.translate_text(i, target_lang="EN-US")  # American English output
    deep_l_translations.append(results.text)

# Object dtype keeps full-length strings. A fixed-width numpy string array
# would silently cut off any DeepL translation longer than the longest
# existing entry, and would also turn NaN entries into the text 'nan'.
deepl_arr = np.array(deep_l_translations, dtype=object)
translated_desc = np.array(translated_description_list, dtype=object)  # working copy

# Positions of the descriptions the Translator module failed on. An explicit
# integer dtype keeps an empty list usable as an index array.
index_arr = np.array(ind_desc, dtype=int)

# Failed entries that are NaN were never sent to DeepL, so their positions are
# filtered out. The mask is rebuilt from not_tra_desc here because the mask
# computed inside non_translated_rows is local to that function.
desc_not_na_mask = pd.Series(not_tra_desc, dtype=object).notna().to_numpy()
index_no_na = index_arr[desc_not_na_mask]
print(len(index_no_na))  # number of positions to overwrite
print(len(deepl_arr))  # number of DeepL translations

if len(index_no_na) != len(deepl_arr):
    raise ValueError(
        "Number of DeepL translations does not match the number of "
        "untranslated descriptions."
    )

# Overwrite the failed descriptions with their DeepL translations.
translated_desc[index_no_na] = deepl_arr


# Add the translated texts at fixed column positions. Only run once: the
# positions refer to the column layout before these inserts, and the order of
# the three calls determines where each column ends up.
df.insert(loc=13, column="Translated_descriptions", value=translated_desc, allow_duplicates=True)
df.insert(loc=11, column="Translated_title", value=translated_title_list, allow_duplicates=True)
df.insert(loc=12, column="Translated_receiver_title", value=translated_rec_n_title_list, allow_duplicates=True)

# Data processing step: store a cleaned version of each translated column.
processed_columns = (
    ("Translated_descriptions", "Translated_descriptions_pro"),
    ("Translated_title", "Translated_title_pro"),
    ("Translated_receiver_title", "Translated_receiver_title_pro"),
)
for source_column, processed_column in processed_columns:
    df[processed_column] = data_processing_function(df[source_column])

# Save the translated and processed dataset.
df.to_csv("df_translated_proc_19thnov.csv")