In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.express as px

from wordcloud import WordCloud
from IPython.display import Image

# Lift pandas' display truncation: None means no limit on rendered rows/columns.
# NOTE(review): with max_rows unlimited, displaying a whole DataFrame renders
# every row; prefer .head()/.sample() when inspecting large frames.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Attach Google Drive to the Colab filesystem at /content/drive.
# force_remount=True asks for the mount to be redone even if one already exists.
from google.colab import drive

drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Location of the GoEmotions TSV splits on the mounted drive, and the column
# names to assign (the files are read with header=None, i.e. no header row).
GOEMOTIONS_DATA_DIR = "/content/drive/MyDrive/GoEmotions/data"
GOEMOTIONS_COLUMNS = ['Text', 'Class', 'ID']


def _read_goemotions_split(split_name):
    """Load one GoEmotions split into a DataFrame.

    Parameters
    ----------
    split_name : str
        File stem of the split, e.g. "train" for `train.tsv`.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns 'Text', 'Class' and 'ID'.
    """
    return pd.read_csv(
        f"{GOEMOTIONS_DATA_DIR}/{split_name}.tsv",
        sep='\t',
        header=None,
        names=list(GOEMOTIONS_COLUMNS),
        # Keep every column as text so that label strings such as "27" are
        # never inferred as integers; later cells call str methods on 'Class'.
        dtype=str,
    )


df_train = _read_goemotions_split("train")
df_dev = _read_goemotions_split("dev")
df_test = _read_goemotions_split("test")

In [4]:
# Peek at the first five rows of the training split.
df_train.head(n=5)

Unnamed: 0,Text,Class,ID
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj


In [5]:
# Summary statistics for the training split; the rendered output reports
# count / unique / top / freq for each of the three columns.
df_train.describe()

Unnamed: 0,Text,Class,ID
count,43410,43410,43410
unique,43227,711,43410
top,Thank you.,27,eebbqej
freq,13,12823,1


In [6]:
# Total number of missing cells in the training frame: null counts per
# column first, then summed across the columns.
nan_total = df_train.isnull().sum().sum()
print(" \nCount total NaN in a DataFrame : \n\n", nan_total)

 
Count total NaN in a DataFrame : 

 0


In [7]:
# Split each comma-separated label string in 'Class' into a list of label-id
# strings. The vectorised .str accessor replaces the per-row lambda and
# propagates missing values as NaN instead of raising on them.
df_train['List of classes'] = df_train['Class'].str.split(',')

In [8]:
# Number of labels attached to each example (length of its label list).
df_train['Len of classes'] = df_train['List of classes'].map(len)

In [9]:
# Show the examples that carry exactly five labels.
has_five_labels = df_train['Len of classes'].eq(5)
df_train.loc[has_five_labels]

Unnamed: 0,Text,Class,ID,List of classes,Len of classes
7873,Yeah I probably would've started crying on the...,912141925,ee6lqiq,"[9, 12, 14, 19, 25]",5


In [10]:
# Distribution of how many labels each training example carries, ordered by
# the label count.
temp_df = df_train["Len of classes"].value_counts().sort_index()

# Take the bar labels from the counted values themselves so every bar is
# paired with its own count, whichever label counts occur in the data
# (a hard-coded '1'..'5' axis misaligns as soon as one count is absent).
label_count_ticks = temp_df.index.astype(str).tolist()
examples_per_count = temp_df.tolist()

trace1 = go.Bar(
    x=label_count_ticks,
    y=examples_per_count,
    marker=dict(color='rgb(250,13,92)',
                line=dict(color='rgb(0,0,0)', width=1.5)),
    text=examples_per_count, textposition='outside',
    # A scalar width applies to every bar, however many there are.
    width=0.5)
layout = go.Layout(template="plotly_dark", title='Number of classes',
                   xaxis=dict(title='Class Numbers'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [11]:
# Empty mapping from label id to emotion name; it is populated from
# emotions.txt in a later cell.
emotions_map = dict()

In [12]:
# Read one emotion name per line; the zero-based line number, rendered as a
# string, becomes the key and the whitespace-stripped line becomes the value.
with open('/content/drive/MyDrive/GoEmotions/data/emotions.txt', 'r') as emotions_file:
    emotions_map.update(
        (str(line_no), raw_name.strip())
        for line_no, raw_name in enumerate(emotions_file)
    )

In [13]:
# Inspect the id -> emotion mapping. The keys are strings, and the rendering
# below lists them in lexicographic order ('0', '1', '10', ...), not numeric order.
emotions_map

{'0': 'admiration',
 '1': 'amusement',
 '10': 'disapproval',
 '11': 'disgust',
 '12': 'embarrassment',
 '13': 'excitement',
 '14': 'fear',
 '15': 'gratitude',
 '16': 'grief',
 '17': 'joy',
 '18': 'love',
 '19': 'nervousness',
 '2': 'anger',
 '20': 'optimism',
 '21': 'pride',
 '22': 'realization',
 '23': 'relief',
 '24': 'remorse',
 '25': 'sadness',
 '26': 'surprise',
 '27': 'neutral',
 '3': 'annoyance',
 '4': 'approval',
 '5': 'caring',
 '6': 'confusion',
 '7': 'curiosity',
 '8': 'desire',
 '9': 'disappointment'}

In [14]:
# Emotion names, in the order in which they are stored in emotions_map.
emotion_list = [emotion_name for emotion_name in emotions_map.values()]

In [15]:
# Count how often each emotion label occurs in the training split.
# Each 'Class' cell holds one or more comma-separated label ids, so split it,
# explode to one row per (example, label id) and tally the ids. Summing the
# non-text columns of df_train does not do this: those columns hold label
# strings, ids, lists and label counts rather than one indicator per emotion.
emotion_id_counts = df_train['Class'].str.split(',').explode().value_counts()

# Order the tallies by the keys of emotions_map so they line up one-to-one
# with emotion_list (its values); ids that never occur get a count of 0.
temp_list = emotion_id_counts.reindex(list(emotions_map), fill_value=0).tolist()

trace1 = go.Bar(
    x=emotion_list,
    y=temp_list,
    marker=dict(color='rgb(127, 16, 238)',
                line=dict(color='rgb(0,0,0)', width=1.5)),
    text=temp_list, textposition='outside')
layout = go.Layout(template="plotly_dark", title='NUMBER OF EMOTIONS',
                   xaxis=dict(title='Emotion'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features from the raw text column. The token pattern \S+
# treats every maximal run of non-whitespace characters as one token.
feature_column = "Text"
df = df_train  # plain assignment: df refers to the same frame as df_train

experimental_feature = df[feature_column].tolist()
label = df['Class'].tolist()

vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\S+')
vectors = vectorizer.fit_transform(experimental_feature)

# Echo the first few texts, then the frame's shape on the following line.
print(df[feature_column].head())
print(df.shape)

0    My favourite food is anything I didn't have to...
1    Now if he does off himself, everyone will thin...
2                       WHY THE FUCK IS BAYLESS ISOING
3                          To make her feel threatened
4                               Dirty Southern Wankers
Name: Text, dtype: object
(43410, 5)


In [24]:
# First five rows of the working frame, including the derived label columns.
df.head(n=5)

Unnamed: 0,Text,Class,ID,List of classes,Len of classes
0,My favourite food is anything I didn't have to...,27,eebbqej,[27],1
1,"Now if he does off himself, everyone will thin...",27,ed00q6i,[27],1
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj,[2],1
3,To make her feel threatened,14,ed7ypvh,[14],1
4,Dirty Southern Wankers,3,ed0bdzj,[3],1
