In [64]:
import requests
import shutil
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle
import string
import re

def load_and_preprocess_data(file_path):
  df = pd.read_csv(file_path, names=['Emotion', 'Text', 'DNTKNOW']).drop(columns=['DNTKNOW']).dropna()
  df['Text_processed'] = df.Text.apply(clean_text)
  return df

def clean_text(text):
  # to lower case
  text = text.lower()
  # remove links
  text = re.sub('https:\/\/\S+', '', text)
  # remove punctuation
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  # remove next line
  text = re.sub(r'[^ \w\.]', '', text)
  # remove words containing numbers
  text = re.sub('\w*\d\w*', '', text)

  return text



# Descarga y procesa el csv
url = "https://raw.githubusercontent.com/PoorvaRane/Emotion-Detector/master/ISEAR.csv"
output_file = "ISEAR.csv"
destination_folder = "data"

response = requests.get(url)
with open(output_file, 'wb') as f:
    f.write(response.content)
#if dir created do not create again
if not os.path.exists('data'):
   os.mkdir('data')

shutil.move(output_file, f"{destination_folder}/{output_file}")

# Carga y preprocesa el dataset
df = load_and_preprocess_data(f'{destination_folder}/{output_file}')
df['Emotion'] = df['Emotion'].replace('guit', 'guilt')
df

Unnamed: 0,Emotion,Text,Text_processed
0,joy,On days when I feel close to my partner and ot...,on days when i feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...,every time i imagine that someone i love or i ...
2,anger,When I had been obviously unjustly treated and...,when i had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...,when i think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...,at a gathering i found myself involuntarily si...
...,...,...,...
7511,shame,Two years back someone invited me to be the tu...,two years back someone invited me to be the tu...
7512,shame,I had taken the responsibility to do something...,i had taken the responsibility to do something...
7513,fear,I was at home and I heard a loud sound of spit...,i was at home and i heard a loud sound of spit...
7514,guilt,I did not do the homework that the teacher had...,i did not do the homework that the teacher had...


In [65]:
df.Emotion.unique()

array(['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt'],
      dtype=object)

In [66]:
df_25 = pd.read_csv('data_isear/output_25.csv')
df_50 = pd.read_csv('data_isear/output_50.csv')
df_75 = pd.read_csv('data_isear/output_75.csv')
df_25.drop(columns=['Sentence_Number'], inplace=True)
df_50.drop(columns=['Sentence_Number'], inplace=True)
df_75.drop(columns=['Sentence_Number'], inplace=True)


In [67]:
print(df_50.iloc[0].Augmented_text, df_50.iloc[0].Sentiment)
print(df[(df['Text'].str.contains('corner')) & (df['Emotion'] == 'anger')])
start_50 = df[(df['Text'].str.contains('corner')) & (df['Emotion'] == 'anger')].index

While I was at the corner shop, a place I visit regularly, I had only a $50 note with me. I purchased essential goods worth about $3, but the shopkeeper made cynical remarks to others about people relying on him for cashing larger notes. anger
     Emotion                                               Text  \
1880   anger  I was at the corner shop, which I patronise re...   

                                         Text_processed  
1880  i was at the corner shop which i patronise reg...  


In [68]:
print(df_75.iloc[2].Augmented_text, df_75.iloc[2].Sentiment)
start_75 = df[(df['Text'].str.contains('Every year during carnival')) & (df['Emotion'] == 'joy')].index - 2

Every year during carnival, a profound sense of delight envelops me, accompanied by an overwhelming surge of exuberance. joy


In [69]:
print(df_25.iloc[0].Augmented_text, df_25.iloc[0].Sentiment)
start_25 = 0

During moments of deep connection with my partner and dear friends, accompanied by a profound inner serenity and an intimate bond with those I hold dear, a sense of warmth and contentment envelops me. joy


In [70]:

#Rename Sentiment for Emotion in df_50 and df_75

df_25.rename(columns={'Sentiment': 'Emotion'}, inplace=True)
df_50.rename(columns={'Sentiment': 'Emotion'}, inplace=True)
df_75.rename(columns={'Sentiment': 'Emotion'}, inplace=True)
df_25_original = df.iloc[start_25:start_50[0]].drop(columns=['Text'])
df_50_original = df.iloc[start_50[0]:start_75[0]].drop(columns=['Text'])
df_75_original = df.iloc[start_75[0]:start_75[0]+len(df_75)].drop(columns=['Text'])


In [71]:
#Crea una máscara para conocer cuales son las etiquetas que coinciden, llamdas Emotion
df_25_original.reset_index(drop=True, inplace=True)
mask_25 = df_25_original['Emotion']== df_25['Emotion']
df_50_original.reset_index(drop=True, inplace=True)
mask_50 = df_50_original['Emotion']== df_50['Emotion']
df_75_original.reset_index(drop=True, inplace=True)
mask_75 = df_75_original['Emotion']== df_75['Emotion']


In [72]:
print(df_25_original[mask_25].Emotion.value_counts()==df_25.Emotion.value_counts())
print(df_50_original[mask_50].Emotion.value_counts()==df_50.Emotion.value_counts())
print(df_75_original[mask_75].Emotion.value_counts()==df_75.Emotion.value_counts())

Emotion
fear       True
joy        True
anger      True
sadness    True
guilt      True
disgust    True
shame      True
Name: count, dtype: bool
Emotion
anger      True
guilt      True
joy        True
fear       True
disgust    True
shame      True
sadness    True
Name: count, dtype: bool
Emotion
joy        True
shame      True
anger      True
fear       True
sadness    True
disgust    True
guilt      True
Name: count, dtype: bool


In [73]:
#Create a list with the tags
tags = df_50_original[mask_50].Emotion.unique()
tags

array(['anger', 'sadness', 'disgust', 'shame', 'guilt', 'joy', 'fear'],
      dtype=object)

In [74]:
#Get a mask with the rows that contains a tag in the Augmented_text
mask_25 = df_25.Augmented_text.str.contains('|'.join(tags))
mask_50 = df_50.Augmented_text.str.contains('|'.join(tags))
mask_75 = df_75.Augmented_text.str.contains('|'.join(tags))


In [75]:
print(df_25[mask_25].Emotion.value_counts())
print(df_50[mask_50].Emotion.value_counts())
print(df_75[mask_75].Emotion.value_counts())

Emotion
guilt      60
fear       59
shame      48
anger      39
sadness    29
disgust    23
joy        16
Name: count, dtype: int64
Emotion
fear       66
guilt      64
anger      41
shame      39
sadness    36
joy        27
disgust    25
Name: count, dtype: int64
Emotion
guilt      69
fear       51
anger      48
shame      47
sadness    46
disgust    36
joy        28
Name: count, dtype: int64


In [76]:
#Now drop the rows that contains a tag in the Augmented_text
df_25 = df_25[~mask_25]
df_50 = df_50[~mask_50]
df_75 = df_75[~mask_75]


In [77]:
#Create a new column called augmented in df_50 df_75 and set default to true, and set to false in df_50_original and df_75_original
df_25['Augmented'] = True
df_50['Augmented'] = True
df_75['Augmented'] = True
df_25_original['Augmented'] = False
df_50_original['Augmented'] = False
df_75_original['Augmented'] = False

#Now rename the columns in df_50 and df_75 
df_25.rename(columns={'Augmented_text': 'Text_processed'}, inplace=True)
df_50.rename(columns={'Augmented_text': 'Text_processed'}, inplace=True)
df_75.rename(columns={'Augmented_text': 'Text_processed'}, inplace=True)

#Now apply clean_text to df_50 and df_75
df_25['Text_processed'] = df_25.Text_processed.apply(clean_text)
df_50['Text_processed'] = df_50.Text_processed.apply(clean_text)
df_75['Text_processed'] = df_75.Text_processed.apply(clean_text)

In [78]:
df_test = df[int(df.shape[0]*0.75) : df.shape[0]]

In [79]:

df.drop(columns=['Text'], inplace=True)
df['Augmented'] = False
#Concat df_50 df_75 with df

df_train = pd.concat([df_25, df_50, df_75, df[0: int(df.shape[0]*0.75)]], ignore_index=True)
df_train

Unnamed: 0,Text_processed,Emotion,Augmented
0,during moments of deep connection with my part...,joy,True
1,whenever the thought crosses my mind that some...,fear,True
2,when confronted with a situation where i have ...,anger,True
3,reflecting upon the brevity of our existence a...,sadness,True
4,during a gathering i unintentionally found mys...,disgust,True
...,...,...,...
10375,i dreamed that this girl and i were cuddling i...,guilt,False
10376,being involved in a car accident and having my...,fear,False
10377,a coworker and i had to rush a project i was o...,anger,False
10378,losing a pet chick our family had just put the...,sadness,False


In [85]:
df_test['Augmented'] = False
df_test.drop(columns=['Text'], inplace=True)
df_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['Augmented'] = False
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.drop(columns=['Text'], inplace=True)


Unnamed: 0,Emotion,Text_processed,Augmented
5637,shame,no response,False
5638,guilt,having told a certain lie,False
5639,joy,i experienced joy at a friends birthday party...,False
5640,fear,two summers ago my best friend and i drove int...,False
5641,anger,i had just seen my pseudogirlfriend and our co...,False
...,...,...,...
7511,shame,two years back someone invited me to be the tu...,False
7512,shame,i had taken the responsibility to do something...,False
7513,fear,i was at home and i heard a loud sound of spit...,False
7514,guilt,i did not do the homework that the teacher had...,False


In [86]:
#Drop the rows that contains no response in Text_processed, but 
df = df[df['Text_processed'] != 'no response provided']
df 

Unnamed: 0,Emotion,Text_processed,Augmented
0,joy,on days when i feel close to my partner and ot...,False
1,fear,every time i imagine that someone i love or i ...,False
2,anger,when i had been obviously unjustly treated and...,False
3,sadness,when i think about the short time that we live...,False
4,disgust,at a gathering i found myself involuntarily si...,False
...,...,...,...
7511,shame,two years back someone invited me to be the tu...,False
7512,shame,i had taken the responsibility to do something...,False
7513,fear,i was at home and i heard a loud sound of spit...,False
7514,guilt,i did not do the homework that the teacher had...,False


In [87]:
df_train.to_csv('data_isear/ISEAR_augmented_train.csv', index=False)
df_test.to_csv('data_isear/ISEAR_augmented_test.csv', index=False)

In [88]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
#Separate between augmented and original 
df_augmented = df[df['Augmented'] == True]
df_original = df[df['Augmented'] == False]
#Create df_train,df_val,df_test
df_train,df_val = train_test_split(df_train, test_size=0.15, random_state=42)

#now Create a DatasetDict with the three datasets
dataset_dict = DatasetDict({'train': Dataset.from_pandas(df_train),
                            'validation': Dataset.from_pandas(df_val),
                            'test': Dataset.from_pandas(df_test)})




dataset_dict['validation'] = dataset_dict['validation'].remove_columns(['__index_level_0__'])
dataset_dict['train'] = dataset_dict['train'].remove_columns(['__index_level_0__'])


#push to huggingface datasets
dataset_dict



DatasetDict({
    train: Dataset({
        features: ['Text_processed', 'Emotion', 'Augmented'],
        num_rows: 7499
    })
    validation: Dataset({
        features: ['Text_processed', 'Emotion', 'Augmented'],
        num_rows: 1324
    })
    test: Dataset({
        features: ['Emotion', 'Text_processed', 'Augmented'],
        num_rows: 1879
    })
})

In [89]:
dataset_dict.push_to_hub('isear_augmented')

Pushing split train to the Hub.
Creating parquet from Arrow format: 100%|██████████| 8/8 [00:00<00:00, 759.87ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.19s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  2.79it/s]
Pushing split validation to the Hub.
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 864.54ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.16s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  5.07it/s]
Pushing split test to the Hub.
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 628.08ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  4.88it/s]
Downloading metadata: 100%|██████████| 3.06k/3.06k [00:00<?, ?B/s]


Overwhelming, remorse, Note