# Imports and Installs

In [60]:
## Emotion Classification 
#https://huggingface.co/datasets/viewer/?dataset=emotion
!pip install datasets
!pip install langdetect
!pip install google_trans_new 

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 8.6 MB/s 
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=976652afc56710a4071fb8f1eea5ba20c724438ad7c961145df1395cedf572b4
  Stored in directory: /root/.cache/pip/wheels/c5/96/8a/f90c59ed25d75e50a8c10a1b1c2d4c402e4dacfa87f3aff36a
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
Collecting google_trans_new
  Downloading google_trans_new-1.1.9-py3-none-any.whl (9.2 kB)
Installing collected packages: google-trans-new
Successfully installed google-trans-new-1.1.9


In [61]:
# shared imports: stdlib, then third-party (incl. the HF dataset loader)
import os

import pandas as pd
import requests
from datasets import load_dataset

In [62]:
from google.colab import drive
from argparse import Namespace
import os
# Notebook-wide settings are kept on a single Namespace object
settings = Namespace(mount_path="/content/drive")
# Mount Google Drive at the configured mount point
drive.mount(settings.mount_path)
# Project folder inside the mounted drive
settings.project_path = os.path.join(
    settings.mount_path,
    "MyDrive/HackathonMaratoTV3",
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Defining Functions 

In [None]:
# Project folder on the mounted drive. Derived from the settings cell so the
# location is defined in one place only (same value as the former literal:
# /content/drive/MyDrive/HackathonMaratoTV3).
PATH = settings.project_path

In [57]:
def get_hf_dataset(NAME, splits=('train', 'test', 'validation')):
  '''
  Function which loads a dataset from HuggingFace and flattens the requested
  splits into a single DataFrame.

  The "text" and "label" columns of every split are concatenated in the order
  given by `splits`. Labels are returned exactly as stored in the dataset;
  no re-mapping happens here.

  input:type: NAME: str, dataset name handed to `load_dataset`
  input:type: splits: iterable of str, split names to concatenate (order is kept)
  output:type: pandas.DataFrame with the columns "text" and "label"
  '''
  dataset = load_dataset(NAME)

  joined_texts, all_labels = [], []
  # The default order (train, test, validation) fixes the row order of the
  # result; code that matches rows by position/index relies on it.
  for split_name in splits:
    split = dataset[split_name]
    # extend() accepts any iterable, so this does not depend on the column
    # being handed back as a plain list
    joined_texts.extend(split['text'])
    all_labels.extend(split['label'])

  # convert to pandas.DataFrame
  return pd.DataFrame({"text": joined_texts, "label": all_labels})


def re_map_labels(DATAFRAME, PATH, save_to_csv=False):
  '''
  Function: remaps the original "emotion" labels to the custom numeric labels.

  The remapping is done on a copy: the DataFrame passed in is left untouched,
  so calling the function again on the same input gives the same result.
  (Remapping an already remapped frame would be wrong, because the new numeric
  labels overlap with the original integer ids.)

  input:type: DATAFRAME: pandas.DataFrame with a "label" column
  input:type: PATH: str, folder the CSV is written to when save_to_csv is set
  input:type: save_to_csv: bool, write the result to disk instead of returning it
  output:type: pandas.DataFrame with remapped labels, or, when save_to_csv is
               set, a str confirming where the CSV was written
  '''
  original_label2int = {
                        "sadness": 0,
                        "joy": 1,
                        "love": 2,
                        "anger": 3,
                        "fear": 4,
                        "surprise": 5
                          }

  # custom target labels, several emotions share one numeric label
  new_label2int = {
                    "fear": 1,
                    "sadness": 1,
                    "anger": 2,
                    "surprise": 4,
                    "joy": 5,
                    "love": 5
                  }

  # invert the original mapping: integer id -> emotion name
  int2label = {label_id: name for name, label_id in original_label2int.items()}

  # work on a copy so the caller's frame is not modified
  remapped = DATAFRAME.copy()
  # two steps on purpose: ids -> names first, then names -> new labels, so the
  # overlapping integer ranges of the two mappings cannot interfere
  remapped["label"] = remapped["label"].replace(int2label).replace(new_label2int)

  if save_to_csv:
    remapped.to_csv(PATH+'/'+'emotions_numeric_labels.csv',sep=':',index=False)
    return f"Saved the dataframe to {PATH} "

  return remapped

# ETL "Emotion" dataset HuggingFace

In [58]:
# Load the HuggingFace "emotion" dataset as one flat DataFrame
df = get_hf_dataset(NAME='emotion')
# Remap the labels to the custom numeric labels; nothing is written to disk here
df_remap = re_map_labels(DATAFRAME=df, PATH=PATH, save_to_csv=False)

Using custom data configuration default
Reusing dataset emotion (/root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

# Translating the text

In [108]:
# Option 1: Google Translate -- currently NOT working: the fix for google_trans_new has not been accepted upstream, and we chose not to patch the library's source code ourselves.
from langdetect import detect
from google_trans_new import google_translator  

#simple function to detect and translate text 
def detect_and_translate(text,target_lang):
    """
    Return `text` in `target_lang`.

    The language of `text` is detected first. If it already equals
    `target_lang` the text is returned unchanged; otherwise it is sent through
    a freshly created `google_translator` and the translation is returned.
    """
    source_lang = detect(text)

    if source_lang != target_lang:
        return google_translator().translate(
            text,
            lang_src=source_lang,
            lang_tgt=target_lang,
        )

    return text

In [174]:
# OPTION 2: DeepL --> WORKS VERY WELL!!!
def get_translation(TEXT, TARGET_LANG='ES', auth_key=None, timeout=30):
  '''
  Function which makes a request to the DeepL API (free endpoint) to translate
  a text into a target language.

  input:type: TEXT: str, text to translate
  input:type: TARGET_LANG: str, target language code (default 'ES')
  input:type: auth_key: str or None, DeepL auth key; when None the key is read
              from the DEEPL_AUTH_KEY environment variable
  input:type: timeout: seconds to wait for the HTTP request
  output:type: str, TEXT translated into the target language

  raises: RuntimeError when no auth key is available;
          requests.HTTPError when the API answers with an error status

  ATTENTION (original author's note):

  Translating the whole dataset needs more than the 500K characters allowed
  for a DeepL free user. The alternative is to cut off at the character limit
  and start another session with a different auth key.
  '''
  # NOTE(review): an auth key used to be hard-coded in this function. Treat
  # that key as leaked and rotate it; keys must come from the caller or the
  # environment, never from the notebook source.
  if auth_key is None:
    auth_key = os.environ.get('DEEPL_AUTH_KEY')
  if not auth_key:
    raise RuntimeError(
        "No DeepL auth key available: pass auth_key=... or set the "
        "DEEPL_AUTH_KEY environment variable"
    )

  r = requests.post(url='https://api-free.deepl.com/v2/translate',
                    headers={'Authorization': f'DeepL-Auth-Key {auth_key}'},
                    data={
                      'target_lang': TARGET_LANG,
                      'text': TEXT
                    },
                    timeout=timeout)
  # Fail with the HTTP error itself rather than with a KeyError on a response
  # body that has no 'translations' entry
  r.raise_for_status()

  return r.json()['translations'][0]['text']

In [144]:
# Smoke test of the DeepL helper on a single sentence. The sentence is the one
# shown in the recorded output; defining it here keeps the cell runnable on a
# fresh kernel.
sample_text = "i think i must have caught a mild version of big as cold as i had the sniffles and was just not feeling inspired"
trans_text = get_translation(sample_text, 'ES')
print(sample_text)
print(trans_text)

i think i must have caught a mild version of big as cold as i had the sniffles and was just not feeling inspired
creo que debo haber cogido una versión leve del gran resfriado, ya que estaba resfriado y no me sentía inspirado


In [139]:
#df_remap['es_text'] = df_remap['text'].apply(lambda x: get_translation(x, 'ES'))

In [157]:
# CSV with the pre-computed translations, located in the project folder
# (same value as the former literal path, built from PATH like the other files)
trans_fn = PATH+'/'+'text_trans.csv'

In [158]:
# Read the pre-computed translations (comma separated, UTF-8)
df_trans = pd.read_csv(
    trans_fn,
    sep=',',
    encoding='utf-8',
)

In [161]:
# The translated column is matched to the rows by index, so both frames must
# have the same number of rows; otherwise the unmatched rows would silently
# end up as NaN.
assert len(df_trans) == len(df_remap), "translation file and dataset differ in length"
df_remap['es_text'] = df_trans['text_es']

In [172]:
# Replace the English text with its Spanish translation. The English column
# has to be dropped BEFORE the rename: renaming 'es_text' to 'text' while the
# original 'text' column still exists leaves two columns called 'text'.
# The guard keeps the cell safe to re-run after the swap has already happened.
if 'es_text' in df_remap.columns:
  df_remap = df_remap.drop(columns='text').rename(columns={'es_text':'text'})
df_remap = df_remap[['text','label']]
df_remap.to_csv(PATH+'/'+'translated_emotions_numeric_labels.csv',sep=':',index=False)

In [173]:
df_remap.head()

Unnamed: 0,text,label
0,no me sentí humillado,1
1,Puedo pasar de sentirme tan desesperado a tan ...,1
2,estoy agarrando un minuto para publicar me sie...,2
3,Siempre me siento nostálgico por la chimenea. ...,5
4,me siento malhumorado,2


In [175]:
# English emotion name -> custom numeric label; the keys are what gets translated
new_label2int = dict(
    fear=1,
    sadness=1,
    anger=2,
    surprise=4,
    joy=5,
    love=5,
)



In [180]:
# Translate every English emotion name to Spanish (one get_translation call
# per key) and keep its numeric label.
# NOTE: if two English names were translated to the same Spanish word, the
# later entry would overwrite the earlier one.
es_label2int = {
    get_translation(eng_key, 'ES'): label
    for eng_key, label in new_label2int.items()
}

# Result of the recorded run, kept for reference:
# es_label2int = {'alegría': 5, 'amor': 5, 'ira': 2, 'miedo': 1, 'sorpresa': 4, 'tristeza': 1}

# Display the mapping as the cell output
es_label2int

{'alegría': 5, 'amor': 5, 'ira': 2, 'miedo': 1, 'sorpresa': 4, 'tristeza': 1}