In [None]:
# Install the Hugging Face `datasets` and `huggingface_hub` packages (no versions are pinned).
!pip install datasets huggingface_hub



In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
# 1. Load the dataset
df = pd.read_csv("datasetV2.csv")

# Share of each emotion class that goes to every split.
train_ratio = 0.8
test_ratio = 0.1
eval_ratio = 0.1

# Fractions passed to train_test_split: the first cut separates train from the
# held-out rows, the second cut divides the held-out rows into test and eval.
holdout_fraction = 1 - train_ratio
eval_fraction = eval_ratio / (eval_ratio + test_ratio)

# Split one emotion at a time so each class is divided with the same fractions.
train_parts, test_parts, eval_parts = [], [], []
for label in df['emotion'].unique():
    class_rows = df.loc[df['emotion'] == label]
    train_rows, holdout_rows = train_test_split(
        class_rows, test_size=holdout_fraction, random_state=42
    )
    test_rows, eval_rows = train_test_split(
        holdout_rows, test_size=eval_fraction, random_state=42
    )
    train_parts.append(train_rows)
    test_parts.append(test_rows)
    eval_parts.append(eval_rows)

# Combine the per-class pieces and shuffle each split with a fixed seed
X_train, X_test, X_eval = (
    pd.concat(parts).sample(frac=1, random_state=10).reset_index(drop=True)
    for parts in (train_parts, test_parts, eval_parts)
)



In [None]:
# Confirm the per-emotion row counts of each split
split_reports = (
    ("Distribución en X_train:", X_train),
    ("\nDistribución en X_test:", X_test),
    ("\nDistribución en X_eval:", X_eval),
)
for heading, frame in split_reports:
    print(heading)
    print(frame['emotion'].value_counts())

Distribución en X_train:
emotion
ira         4489
miedo       4489
alegría     4489
neutral     4489
tristeza    4489
disgusto    4489
Name: count, dtype: int64

Distribución en X_test:
emotion
neutral     561
tristeza    561
ira         561
alegría     561
miedo       561
disgusto    561
Name: count, dtype: int64

Distribución en X_eval:
emotion
ira         562
tristeza    562
alegría     562
neutral     562
disgusto    562
miedo       562
Name: count, dtype: int64


In [None]:
# Write each split to its own CSV file, leaving the DataFrame index out.
# The evaluation split is saved as 'validation', the name used for it on Hugging Face.
csv_targets = {
    "train.csv": X_train,
    "test.csv": X_test,
    "validation.csv": X_eval,
}
for csv_name, frame in csv_targets.items():
    frame.to_csv(csv_name, index=False)

In [None]:
from datasets import DatasetDict, Dataset

# Convert each split to a Hugging Face Dataset, starting from a freshly numbered index
train_dataset, test_dataset, eval_dataset = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (X_train, X_test, X_eval)
)

# Group the three datasets under their split names
split_names = ("train", "test", "validation")
dataset_dict = DatasetDict(
    dict(zip(split_names, (train_dataset, test_dataset, eval_dataset)))
)

# Drop the __index_level_0__ column from any split that has it
index_column = "__index_level_0__"
for split, split_dataset in list(dataset_dict.items()):
    if index_column in split_dataset.column_names:
        dataset_dict[split] = split_dataset.remove_columns(index_column)

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'emotion'],
        num_rows: 26934
    })
    test: Dataset({
        features: ['text', 'emotion'],
        num_rows: 3366
    })
    validation: Dataset({
        features: ['text', 'emotion'],
        num_rows: 3372
    })
})

In [None]:
# Log in to the Hugging Face Hub through the CLI, which prompts for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
The token `Datset_Promt_Llama2` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `Dats

In [None]:
from huggingface_hub import HfApi

# Name of the dataset repository inside the personal Hub namespace
hub_repo_id = "Joseph7D/emotion-dataset-v2"
dataset_dict.push_to_hub(hub_repo_id)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:


def formatear_para_llama(df):
    """
    Turn a DataFrame with 'text' and 'emotion' columns into Llama 2 chat-style training prompts.

    Each row becomes a single string that wraps the classification instruction and
    the row's text in ``<s>[INST] ... [/INST]`` and ends with the row's emotion
    followed by ``</s>``.

    Args:
        df: DataFrame providing a 'text' and an 'emotion' value for every row.
            It is not modified.

    Returns:
        A new DataFrame whose only column, 'text', holds the formatted prompts.
    """
    instruction = (
        "Clasifica el siguiente texto en una de estas emociones: "
        "ira, disgusto, tristeza, alegría, miedo o neutral. "
        "Responde únicamente con la emoción correspondiente."
    )
    # Full prompt layout; the two placeholders are filled in per row.
    plantilla = "<s>[INST] " + instruction + "\n\n" + "Texto: \"{text}\" [/INST] {emotion} </s>"

    def render(row):
        return plantilla.format(text=row['text'], emotion=row['emotion'])

    prompts = df.apply(render, axis=1)
    # assign() works on a copy, so the caller's frame keeps its original columns;
    # only the column needed for training is returned.
    return df.assign(text=prompts)[["text"]]

# Apply the prompt formatter to every split
X_train_formatted, X_test_formatted, X_eval_formatted = map(
    formatear_para_llama, (X_train, X_test, X_eval)
)

# Preview the first two rows of each formatted split
for formatted in (X_train_formatted, X_test_formatted, X_eval_formatted):
    print(formatted.head(2))


                                                text
0  <s>[INST] Clasifica el siguiente texto en una ...
1  <s>[INST] Clasifica el siguiente texto en una ...
                                                text
0  <s>[INST] Clasifica el siguiente texto en una ...
1  <s>[INST] Clasifica el siguiente texto en una ...
                                                text
0  <s>[INST] Clasifica el siguiente texto en una ...
1  <s>[INST] Clasifica el siguiente texto en una ...


In [None]:
from datasets import DatasetDict, Dataset

# 6. Convert each formatted split to a Hugging Face Dataset, starting from a freshly numbered index
train_dataset, test_dataset, eval_dataset = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (X_train_formatted, X_test_formatted, X_eval_formatted)
)

# 7. Group the three datasets under their split names
split_names = ("train", "test", "validation")
dataset_dict = DatasetDict(
    dict(zip(split_names, (train_dataset, test_dataset, eval_dataset)))
)

# 8. Drop the __index_level_0__ column from any split that has it
index_column = "__index_level_0__"
for split, split_dataset in list(dataset_dict.items()):
    if index_column in split_dataset.column_names:
        dataset_dict[split] = split_dataset.remove_columns(index_column)

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 26934
    })
    test: Dataset({
        features: ['text'],
        num_rows: 3366
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3372
    })
})

In [None]:
from huggingface_hub import HfApi

# Name of the prompt-formatted dataset repository inside the personal Hub namespace
hub_repo_id = "Joseph7D/prompt-emotion-dataset-v2"
dataset_dict.push_to_hub(hub_repo_id)


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]