In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [54]:
import numpy as np
import pandas as pd

from datasets import load_dataset

import matplotlib.pyplot as plt

In [55]:
# Fetch the dair-ai/emotion corpus and keep a separate handle on each of its
# three splits.
dataset_dair_ai = load_dataset('dair-ai/emotion')

train_dair_ai, test_dair_ai, validation_dair_ai = (
    dataset_dair_ai[split_name] for split_name in ('train', 'test', 'validation')
)


In [56]:
# Feature metadata of the `label` column (kept for reference; not used below).
class_labels = train_dair_ai.features['label']

# Integer label id -> emotion name.
dictionary_label = dict(enumerate(['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']))

# Convert every split to a pandas DataFrame.
train_dair_ai_pd, test_dair_ai_pd, validation_dair_ai_pd = (
    split.to_pandas() for split in (train_dair_ai, test_dair_ai, validation_dair_ai)
)

# Replace the integer ids with their emotion names in each frame.
for split_frame in (train_dair_ai_pd, test_dair_ai_pd, validation_dair_ai_pd):
    split_frame['label'] = split_frame['label'].apply(lambda label_id: dictionary_label[label_id])

In [57]:
# Number of examples per emotion in the training split.
train_label_counts = train_dair_ai_pd['label'].value_counts()
train_label_counts

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: label, dtype: int64

In [58]:
# Number of examples per emotion in the test split.
test_label_counts = test_dair_ai_pd['label'].value_counts()
test_label_counts

joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: label, dtype: int64

In [59]:
# Number of examples per emotion in the validation split.
validation_label_counts = validation_dair_ai_pd['label'].value_counts()
validation_label_counts

joy         704
sadness     550
anger       275
fear        212
love        178
surprise     81
Name: label, dtype: int64

In [60]:
# Rename `text` -> `Text_processed` and `label` -> `Emotion` in place on all
# three split frames.
for split_frame in (train_dair_ai_pd, test_dair_ai_pd, validation_dair_ai_pd):
    split_frame.rename(columns={"text": "Text_processed", "label": "Emotion"}, inplace=True)

In [61]:
# Keep only the rows whose emotion is neither 'love' nor 'surprise'.
excluded_emotions = ['love', 'surprise']
filtered_train_dair_ai_pd = train_dair_ai_pd[~train_dair_ai_pd['Emotion'].isin(excluded_emotions)]
filtered_test_dair_ai_pd = test_dair_ai_pd[~test_dair_ai_pd['Emotion'].isin(excluded_emotions)]
filtered_validation_dair_ai_pd = validation_dair_ai_pd[~validation_dair_ai_pd['Emotion'].isin(excluded_emotions)]

In [62]:
# Stack the three filtered splits into one frame; each row keeps the index
# it had in its source split.
filtered_splits = [
    filtered_train_dair_ai_pd,
    filtered_test_dair_ai_pd,
    filtered_validation_dair_ai_pd,
]
all_dair_ai = pd.concat(filtered_splits)

In [63]:
# Build the training prompt for every row: a fixed instruction, the sentence,
# and the gold emotion after the "### Assistant:" marker.
prompt_prefix = "### Human: Now I want you to perform a classification of the following sentence based on the emotion it represents, you can use Anger, Joy, Sadness, Guilt, Shame, Fear, and Disgust. "
all_dair_ai['text'] = all_dair_ai.apply(
    lambda row: f"{prompt_prefix}{row['Text_processed']} ### Assistant: {row['Emotion']}",
    axis=1,
)
# None of these rows come from data augmentation.
all_dair_ai['Augmented'] = False

In [64]:
from datasets import Dataset

# Wrap the combined frame in a Hugging Face Dataset.
# Note: this rebinds `dataset_dair_ai`, which previously held the object
# returned by load_dataset('dair-ai/emotion').
dataset_dair_ai = Dataset.from_pandas(df=all_dair_ai)

In [65]:
# `remove_columns` returns a new Dataset instead of editing in place, so the
# result has to be re-assigned; otherwise the pandas index column is still
# present when the dataset is pushed to the Hub. The membership check keeps
# the cell safe to re-run.
if '__index_level_0__' in dataset_dair_ai.column_names:
    dataset_dair_ai = dataset_dair_ai.remove_columns('__index_level_0__')
dataset_dair_ai

Dataset({
    features: ['Text_processed', 'Emotion', 'text', 'Augmented'],
    num_rows: 17640
})

In [66]:
# Upload the prepared dair-ai prompts to the Hugging Face Hub.
dataset_dair_ai.push_to_hub(repo_id='RikoteMaster/dataset_dair_ai_4_llama2_v3')

Pushing dataset shards to the dataset hub:   0%|                                                                                                                                                                                                                                                        | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 555.35ba/s][A
Pushing dataset shards to the dataset hub: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.88s/it]


In [67]:
# Load the GoEmotions subset from a CSV in the working directory.
goemotions_csv = 'goemotions_selected_emotions.csv'
goemotion_dataset = pd.read_csv(goemotions_csv)

In [68]:
# Label distribution before any renaming.
goemotion_label_counts_raw = goemotion_dataset['label'].value_counts()
goemotion_label_counts_raw

anger            8084
joy              7983
sadness          6758
disgust          5301
fear             3197
remorse          2525
embarrassment    2476
Name: label, dtype: int64

In [69]:
# Rename two labels so they match the emotion names used in the prompts:
# remorse -> guilt, embarrassment -> shame.
label_renames = {'remorse': 'guilt', 'embarrassment': 'shame'}
goemotion_dataset['label'] = goemotion_dataset['label'].replace(label_renames)


In [70]:
# Label distribution after the rename.
goemotion_label_counts = goemotion_dataset['label'].value_counts()
goemotion_label_counts

anger      8084
joy        7983
sadness    6758
disgust    5301
fear       3197
guilt      2525
shame      2476
Name: label, dtype: int64

In [71]:
# Rename `text` -> `Text_processed` and `label` -> `Emotion` in place.
column_renames = {"text": "Text_processed", "label": "Emotion"}
goemotion_dataset.rename(columns=column_renames, inplace=True)

# Build the training prompt for every row: a fixed instruction, the sentence,
# and the gold emotion after the "### Assistant:" marker.
prompt_prefix = "### Human: Now I want you to perform a classification of the following sentence based on the emotion it represents, you can use Anger, Joy, Sadness, Guilt, Shame, Fear, and Disgust. "
goemotion_dataset['text'] = goemotion_dataset.apply(
    lambda row: f"{prompt_prefix}{row['Text_processed']} ### Assistant: {row['Emotion']}",
    axis=1,
)
# None of these rows come from data augmentation.
goemotion_dataset['Augmented'] = False

In [72]:
# Shuffle all rows. A fixed `random_state` makes the row order (and therefore
# the dataset that gets pushed) reproducible across kernel restarts.
goemotion_dataset = goemotion_dataset.sample(frac=1, random_state=42)

In [73]:
# Wrap the shuffled frame in a Hugging Face Dataset.
hf_goemotion_dataset = Dataset.from_pandas(df=goemotion_dataset)

In [74]:
# Drop the index column that came along from the pandas frame.
hf_goemotion_dataset = hf_goemotion_dataset.remove_columns(['__index_level_0__'])

In [75]:
# Upload the GoEmotions prompts to the Hugging Face Hub.
# NOTE(review): unlike the other pushes, this repo id has no explicit
# namespace prefix — confirm it lands in the intended account.
hf_goemotion_dataset.push_to_hub(repo_id='goemotion_4_llama2_v3')

Pushing dataset shards to the dataset hub:   0%|                                                                                                                                                                                                                                                        | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37/37 [00:00<00:00, 724.77ba/s][A
Pushing dataset shards to the dataset hub: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.68s/it]


In [88]:
from datasets import load_dataset

# Pull the three prepared corpora back from the Hub.
# Note: `goemotion_dataset` is rebound here (it held a pandas DataFrame above).
# NOTE(review): the `_v2` repos are loaded although the cells above pushed
# `_v3` — confirm this is the intended version.
isear_dataset, dair_ai_dataset, goemotion_dataset = (
    load_dataset(repo_id)
    for repo_id in (
        'RikoteMaster/isear_for_llama2',
        'RikoteMaster/dataset_dair_ai_4_llama2_v2',
        'RikoteMaster/goemotion_4_llama2_v2',
    )
)

Downloading readme: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 616/616 [00:00<00:00, 990kB/s]


In [89]:
# The published dair-ai dataset still carries the pandas index column; drop it
# so it is not carried into the concatenation below.
dair_ai_dataset = dair_ai_dataset.remove_columns(['__index_level_0__'])

In [90]:
from datasets import concatenate_datasets

# Combine the `train` split of each corpus into a single dataset.
train_parts = [
    isear_dataset['train'],
    dair_ai_dataset['train'],
    goemotion_dataset['train'],
]
macro_ds = concatenate_datasets(train_parts)

In [92]:
# Display the combined dataset's summary (features and row count).
macro_ds

Dataset({
    features: ['Text_processed', 'Emotion', 'Augmented', 'text'],
    num_rows: 61463
})

In [94]:
# Upload the combined dataset to the Hugging Face Hub.
# NOTE(review): this repo id has no explicit namespace prefix — confirm it
# lands in the intended account.
macro_ds.push_to_hub(repo_id='Emotion_Recognition_4_llama2')

Pushing dataset shards to the dataset hub:   0%|                                                                                                                                                                                                                                                        | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62/62 [00:00<00:00, 623.97ba/s][A
Pushing dataset shards to the dataset hub: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.50s/it]


In [96]:
# Convert the combined dataset to a pandas DataFrame for inspection.
macro_ds_pd = macro_ds.to_pandas()

In [103]:
# Emotion label column of the combined data.
macro_emotions = macro_ds_pd['Emotion']

In [108]:
# Number of examples per emotion in the combined dataset.
macro_emotion_counts = macro_emotions.value_counts()
macro_emotion_counts

joy        15859
sadness    13627
anger      11872
fear        6653
disgust     6406
guilt       3543
shame       3503
Name: Emotion, dtype: int64

In [109]:
!python trl/examples/scripts/sft_trainer.py \
    --model_name meta-llama/Llama-2-7b-hf \
    --dataset_name RikoteMaster/isear_for_llama2 \
    --output_dir ./model
    --load_in_4bit \
    --use_peft \
    --batch_size 8 \
    --gradient_accumulation_steps 2
    


IndentationError: unexpected indent (2899810926.py, line 2)

In [1]:
from datasets import load_dataset




  from .autonotebook import tqdm as notebook_tqdm


In [22]:

# Load the combined emotion dataset from the Hub.
ds = load_dataset(path='RikoteMaster/Emotion_Recognition_4_llama2_v2')

def bigger_formatting(ds):
    """Rewrite one example's ``text`` field with the long-form instruction prompt.

    Args:
        ds: A single example (mapping) with at least the keys
            ``'Text_processed'`` (the sentence) and ``'Emotion'`` (the label).

    Returns:
        The same mapping, with ``ds['text']`` set to the full prompt followed
        by the gold label. Returning the example is required: without a
        return value ``map`` has nothing to write back and the dataset keeps
        its old ``text`` column.

    NOTE(review): the prompt spells the label as "Sadnes"; it is left as-is
    here because changing it alters the published training text.
    """
    ds['text'] = f"""###Human:\nIn this task, you will be performing a classification exercise aimed at identifying the underlying emotion conveyed by a given sentence. The emotions to consider are as follows:

    Joy: Joy is a positive and uplifting emotion characterized by happiness, elation, and a sense of contentment. It arises from pleasant experiences, achievements, or connections with others.

    Sadness: Sadness is a feeling of sorrow, unhappiness, or despondency. It is often triggered by loss, disappointment, or a sense of longing.

    Guilt: Guilt is a self-directed emotion that arises from a sense of wrongdoing or moral transgression. It involves feeling responsible for a negative outcome or harm done to others.

    Shame: Shame is a powerful emotion associated with feeling embarrassed, humiliated, or unworthy. It typically arises from a perception of public exposure of one's flaws or mistakes.

    Fear: Fear is an emotion triggered by a perceived threat or danger. It can lead to a heightened state of alertness, anxiety, and a desire to avoid the source of fear.

    Disgust: Disgust is an aversive emotion linked to feelings of revulsion, repulsion, or strong distaste. It arises in response to things that are offensive or unpleasant.

    Anger: Anger is a strong feeling of displeasure, hostility, or frustration. It often arises when one's boundaries, values, or rights are violated, leading to a desire for confrontation or retaliation.
    
    Your task is to analyze each sentence provided and categorize it into one of these emotions based on the dominant feeling conveyed by the text. This classification will require an understanding of the nuances of human emotions and the context in which the sentences are presented.
        
    Remember, you have to classify the sentences using only Anger, Joy, Sadnes, Guilt, Shame, fear or disgust
    
    Sentence: {ds['Text_processed']}\n\n###Assistant:\n{ds['Emotion']}"""

    return ds

# Apply the prompt formatter to every example.
ds = ds.map(function=bigger_formatting)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 61463/61463 [00:02<00:00, 29729.98 examples/s]


In [16]:
# Upload the re-formatted dataset.
# NOTE(review): this is the same repo id the data was loaded from, so the
# push overwrites the source — confirm that is intended.
ds.push_to_hub(repo_id='RikoteMaster/Emotion_Recognition_4_llama2_v2')

Pushing dataset shards to the dataset hub:   0%|                                                                                                                                                                                                                                         | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|                                                                                                                                                                                                                                               | 0/62 [00:00<?, ?ba/s][A
Creating parquet from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62/62 [00:00<00:00, 408.01ba/s][A
Pushing dataset shards to the dataset hub: 100%|██████████████████████████████████

In [19]:
# Load the ISEAR prompt dataset from the Hub.
ds = load_dataset(path="RikoteMaster/isear_for_llama2")
def bigger_formatting(ds):
    """Rewrite one example's ``text`` field with the long-form instruction prompt.

    Args:
        ds: A single example (mapping) with at least the keys
            ``'Text_processed'`` (the sentence) and ``'Emotion'`` (the label).

    Returns:
        The same mapping, with ``ds['text']`` replaced by the prompt: the
        instruction, one definition per emotion, the sentence, and the gold
        label after the ``###Assistant:`` marker.

    NOTE(review): the prompt spells the label as "Sadnes"; reproduced as-is.
    """
    # One list entry per output line; whitespace-only entries are part of the
    # template and are spelled out explicitly so they cannot be stripped.
    prompt_lines = [
        "###Human:",
        "In this task, you will be performing a classification exercise aimed at identifying the underlying emotion conveyed by a given sentence. The emotions to consider are as follows:",
        "",
        "    Joy: Joy is a positive and uplifting emotion characterized by happiness, elation, and a sense of contentment. It arises from pleasant experiences, achievements, or connections with others.",
        "",
        "    Sadness: Sadness is a feeling of sorrow, unhappiness, or despondency. It is often triggered by loss, disappointment, or a sense of longing.",
        "",
        "    Guilt: Guilt is a self-directed emotion that arises from a sense of wrongdoing or moral transgression. It involves feeling responsible for a negative outcome or harm done to others.",
        "",
        "    Shame: Shame is a powerful emotion associated with feeling embarrassed, humiliated, or unworthy. It typically arises from a perception of public exposure of one's flaws or mistakes.",
        "",
        "    Fear: Fear is an emotion triggered by a perceived threat or danger. It can lead to a heightened state of alertness, anxiety, and a desire to avoid the source of fear.",
        "",
        "    Disgust: Disgust is an aversive emotion linked to feelings of revulsion, repulsion, or strong distaste. It arises in response to things that are offensive or unpleasant.",
        "",
        "    Anger: Anger is a strong feeling of displeasure, hostility, or frustration. It often arises when one's boundaries, values, or rights are violated, leading to a desire for confrontation or retaliation.",
        "    ",
        "    Your task is to analyze each sentence provided and categorize it into one of these emotions based on the dominant feeling conveyed by the text. This classification will require an understanding of the nuances of human emotions and the context in which the sentences are presented.",
        "        ",
        "    Remember, you have to classify the sentences using only Anger, Joy, Sadnes, Guilt, Shame, fear or disgust",
        "    ",
        f"    Sentence: {ds['Text_processed']}",
        "",
        "###Assistant:",
        f"{ds['Emotion']}",
    ]
    ds['text'] = "\n".join(prompt_lines)
    return ds

# Apply the prompt formatter to every example.
ds = ds.map(function=bigger_formatting)

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7499/7499 [00:00<00:00, 15362.43 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1324/1324 [00:00<00:00, 17803.70 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1879/1879 [00:00<00:00, 17932.21 examples/s]


In [21]:
# Publish the re-formatted ISEAR data under a new repo id.
ds.push_to_hub(repo_id="RikoteMaster/isear_for_llama2_v3")

Pushing dataset shards to the dataset hub:   0%|                                                                                                                                                                                                                                                 | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 315.48ba/s][A
Pushing dataset shards to the dataset hub: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.99s/it]
Pushing dataset shards to the dataset hub:   0%|             

In [3]:
from datasets import load_dataset

# Load the combined emotion dataset from the Hub.
ds = load_dataset(path="RikoteMaster/Emotion_Recognition_4_llama2_v2")
def bigger_formatting(ds):
    """Rewrite one example's ``text`` field with the short instruction prompt.

    Args:
        ds: A single example (mapping) with at least the keys
            ``'Text_processed'`` (the sentence) and ``'Emotion'`` (the label).

    Returns:
        The same mapping, with ``ds['text']`` replaced by the instruction, the
        list of allowed emotions, the sentence, and the gold label after the
        ``###Assistant:`` marker.

    NOTE(review): the prompt spells the label as "Sadnes"; reproduced as-is.
    """
    # Fixed part of the prompt; the whitespace-only line belongs to the template.
    instruction = (
        "###Human:\n"
        "In this task, you will be performing a classification exercise aimed at identifying the underlying emotion conveyed by a given sentence. The emotions to consider are as follows:\n"
        "\n"
        "    Anger, Joy, Sadnes, Guilt, Shame, fear or disgust\n"
        "    \n"
    )
    ds['text'] = f"{instruction}    Sentence: {ds['Text_processed']}\n\n###Assistant:\n{ds['Emotion']}"
    return ds

# Apply the short-prompt formatter to every example, then publish the result
# under a new repo id.
ds = ds.map(function=bigger_formatting)

ds.push_to_hub(repo_id="RikoteMaster/Emotion_Recognition_4_llama2_v3")

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 61463/61463 [00:03<00:00, 19853.20 examples/s]
Pushing dataset shards to the dataset hub:   0%|                                                                                                                                                                                                                                                 | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62/62 [00:00<00:00, 620.48ba/s][A
Pushing dataset shards to the dataset hub: 100%|█████████████

"<s>[INST] Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo? [/INST] Esto vale tanto para médicos como para cualquier otra profesión tras finalizar los estudios aniversarios y mi consejo sería preguntar a cuántas personas haya conocido mejor. En este caso, mi primera opción sería hablar con otros profesionales médicos, echar currículos en hospitales y cualquier centro de salud. En paralelo, trabajaría por mejorar mi marca personal como médico mediante un blog o formas digitales de comunicación como los vídeos. Y, para mejorar las posibilidades de encontrar trabajo, también participaría en congresos y encuentros para conseguir más contactos. Y, además de todo lo anterior, seguiría estudiando para presentarme a las oposiciones y ejercer la medicina en el sector público de mi país. </s>"

In [1]:
from datasets import load_dataset

# Load the combined emotion dataset from the Hub.
ds = load_dataset(path="RikoteMaster/Emotion_Recognition_4_llama2_v2")
def bigger_formatting(ds):
    """Rewrite one example's ``text`` field in the ``<s>[INST] ... [/INST] ... </s>`` chat layout.

    Args:
        ds: A single example (mapping) with at least the keys
            ``'Text_processed'`` (the sentence) and ``'Emotion'`` (the label).

    Returns:
        The same mapping, with ``ds['text']`` set to the instruction and
        sentence wrapped in ``[INST] ... [/INST]``, followed by the gold label.

    NOTE(review): the prompt spells the label as "Sadnes"; it is left as-is
    here because changing it alters the published training text.
    """
    # The example must be closed with the end-of-sequence marker `</s>`; the
    # previous version ended with a second `<s>` (beginning-of-sequence), so
    # the training text never marked where the answer stops.
    ds['text'] = f"""<s>[INST] In this task, you will be performing a classification exercise aimed at identifying the underlying emotion conveyed by a given sentence. The emotions to consider are as follows:

    Anger, Joy, Sadnes, Guilt, Shame, fear or disgust
    
    Sentence: {ds['Text_processed']} [/INST] {ds['Emotion']} </s>"""

    return ds

# Apply the chat-format formatter to every example, then publish the result
# as the chat variant of the dataset.
ds = ds.map(function=bigger_formatting)

ds.push_to_hub(repo_id="RikoteMaster/Emotion_Recognition_4_llama2_chat")

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 61463/61463 [00:03<00:00, 19205.84 examples/s]
Pushing dataset shards to the dataset hub:   0%|                                                                                                                                                                                                                                                 | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|                                                                                                                                                                                                                                                       | 0/62 [00:00<?, ?ba/s][A
Creating pa