# Load, view, and save the dair-ai/emotion dataset as CSV files

In [1]:
from pathlib import Path

import pandas as pd
from datasets import load_dataset

In [2]:
# Load the "split" configuration of dair-ai/emotion (train / validation / test splits)
dataset = load_dataset(path="dair-ai/emotion", name="split")

In [3]:
# Print the DatasetDict summary: each split with its features and row count
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


In [4]:
# Label mapping: class names for the integer labels (list position == label id)
label_feature = dataset["train"].features["label"]
label_names = label_feature.names
print(label_names)

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


In [5]:
# Convert the 'train' split to a DataFrame (columns: 'text', 'label')
df_train = dataset["train"].to_pandas()

# Add a readable label name next to each integer label.
# The names list is resolved once up front rather than once per row.
train_label_names = dataset["train"].features["label"].names
df_train["label_name"] = df_train["label"].map(
    lambda label_id: train_label_names[label_id]
)

# Show full text in all columns. This is a global pandas display option and
# stays in effect for later cells until reset_option is called.
pd.set_option('display.max_colwidth', None)

# View the first few rows
display(df_train.head())

Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,2,love
4,i am feeling grouchy,3,anger


In [6]:
# Convert the 'validation' split to a DataFrame (columns: 'text', 'label')
df_validation = dataset["validation"].to_pandas()

# Add a readable label name next to each integer label.
# The names list is resolved once up front rather than once per row.
validation_label_names = dataset["validation"].features["label"].names
df_validation["label_name"] = df_validation["label"].map(
    lambda label_id: validation_label_names[label_id]
)

# View the first few rows
display(df_validation.head())

Unnamed: 0,text,label,label_name
0,im feeling quite sad and sorry for myself but ill snap out of it soon,0,sadness
1,i feel like i am still looking at a blank canvas blank pieces of paper,0,sadness
2,i feel like a faithful servant,2,love
3,i am just feeling cranky and blue,3,anger
4,i can have for a treat or if i am feeling festive,1,joy


In [7]:
# Convert the 'test' split to a DataFrame (columns: 'text', 'label')
df_test = dataset["test"].to_pandas()

# Add a readable label name next to each integer label.
# The names list is resolved once up front rather than once per row.
test_label_names = dataset["test"].features["label"].names
df_test["label_name"] = df_test["label"].map(
    lambda label_id: test_label_names[label_id]
)

# View the first few rows
display(df_test.head())

Unnamed: 0,text,label,label_name
0,im feeling rather rotten so im not very ambitious right now,0,sadness
1,im updating my blog because i feel shitty,0,sadness
2,i never make her separate from me because i don t ever want her to feel like i m ashamed with her,0,sadness
3,i left with my bouquet of red and yellow tulips under my arm feeling slightly more optimistic than when i arrived,1,joy
4,i was feeling a little vain when i did this one,0,sadness


In [8]:
# Put the column-width display option back to pandas' default so that
# long text is truncated again in subsequent outputs
pd.reset_option(pat='display.max_colwidth')

In [9]:
# Number of records in each split, and across all splits
sum_of_train_records = len(df_train)
sum_of_validation_records = len(df_validation)
sum_of_test_records = len(df_test)
sum_of_records = sum_of_train_records + sum_of_validation_records + sum_of_test_records

# Share of each split as a whole-number percentage of all records
train_percent = round(sum_of_train_records / sum_of_records * 100)
validation_percent = round(sum_of_validation_records / sum_of_records * 100)
test_percent = round(sum_of_test_records / sum_of_records * 100)

# The "{name = }" form prints the variable name together with its value
print(f"{train_percent = }%, number of records = {sum_of_train_records}")
print(f"{validation_percent = }%, number of records = {sum_of_validation_records}")
print(f"{test_percent = }%, number of records = {sum_of_test_records}")

train_percent = 80%, number of records = 16000
validation_percent = 10%, number of records = 2000
test_percent = 10%, number of records = 2000


In [10]:
# Save each split as a .csv file under the local 'dataset/' directory.
# to_csv does not create missing directories, so make sure the target exists
# first; exist_ok=True keeps the cell safe to re-run.
output_dir = Path('dataset')
output_dir.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame row index out of the files
df_train.to_csv(output_dir / 'train.csv', index=False)
df_validation.to_csv(output_dir / 'validation.csv', index=False)
df_test.to_csv(output_dir / 'test.csv', index=False)