In [1]:
import pandas as pd
import os
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Change this path to your actual dataset folder
folder_path = '/content/drive/My Drive/nlp'

# Set working directory
os.chdir(folder_path)

# Check files in directory
os.listdir()


['glove.6B.100d.txt',
 'training.csv',
 'test.csv',
 'validation.csv',
 'dataset.csv',
 'glove.6B.100d.word2vec.txt',
 'fine-tune embeddings.ipynb',
 'torchmoji.ipynb',
 'DeepMoji.ipynb',
 'train.csv',
 'train1.csv',
 'test1.csv',
 'Untitled0.ipynb',
 '1_dataset_preparation.ipynb',
 'torchmoji_model',
 '2_task.ipynb',
 'roberta-finetuned',
 'task_1_to_7.ipynb',
 'task_8_ml_model_implementations.ipynb']

In [4]:
train_df = pd.read_csv('training.csv')
test_df = pd.read_csv('test.csv')
val_df = pd.read_csv('validation.csv')


In [5]:
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Validation shape:", val_df.shape)

Train shape: (16000, 2)
Test shape: (2000, 2)
Validation shape: (2000, 2)


In [6]:
# Merge all three DataFrames
combined_df = pd.concat([train_df, test_df, val_df], axis=0, ignore_index=True)

print("Combined shape:", combined_df.shape)

# Optional: Check a few rows
combined_df.head()

Combined shape: (20000, 2)


Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [7]:
# Define the label mapping
label_map = {
    0: 'sadness',
    1: 'joy',
    2: 'anger',
    3: 'fear',
    4: 'love',
    5: 'surprise'
}

# Replace numeric labels directly
combined_df['label'] = combined_df['label'].map(label_map)

# Check result
combined_df.head()


Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,fear
3,i am ever feeling nostalgic about the fireplac...,anger
4,i am feeling grouchy,fear


In [8]:
combined_df.to_csv('dataset.csv', index=False)
print("Merged dataset saved as dataset.csv in your Drive folder.")


Merged dataset saved as dataset.csv in your Drive folder.


In [9]:
print(combined_df['label'].unique())

['sadness' 'fear' 'anger' 'surprise' 'love' 'joy']


In [10]:
print(combined_df['label'].value_counts())


label
joy         6761
sadness     5797
fear        2709
love        2373
anger       1641
surprise     719
Name: count, dtype: int64


In [11]:
combined_df.shape

(20000, 2)

In [12]:
# Find unique categories in the 'label' column
unique_labels = combined_df['label'].unique()
num_categories = len(unique_labels)

print(f"Number of unique categories: {num_categories}")
print("Categories:", unique_labels)

Number of unique categories: 6
Categories: ['sadness' 'fear' 'anger' 'surprise' 'love' 'joy']
