# Importing Libraries

In [None]:
import os
import re
import string 
import emoji
import json
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import pdist
from wordcloud import WordCloud
from IPython.display import Image

import warnings
warnings.filterwarnings("ignore")

# Importing Data

In [None]:
# All three splits are read the same way: tab-separated, no header row,
# columns named Text / Class / ID.
_split_format = dict(sep='\t', header=None, names=['Text', 'Class', 'ID'])
train_data = pd.read_csv("../input/goemotions/data/train.tsv", **_split_format)
valid_data = pd.read_csv("../input/goemotions/data/dev.tsv", **_split_format)
test_data = pd.read_csv("../input/goemotions/data/test.tsv", **_split_format)

In [None]:
# Preview the first rows of the training split.
train_data.head()

In [None]:
# Print a per-column summary of the training split.
train_data.info()

# Basic EDA

In [None]:
# 'Class' holds comma-separated label ids; split it into a list per row and
# record how many labels each row carries.
train_data['Class List'] = train_data['Class'].map(lambda label_ids: label_ids.split(','))
train_data['Class Length'] = train_data['Class List'].map(len)

In [None]:
# Largest number of labels attached to a single text.
train_data['Class Length'].max()

In [None]:
# How many texts carry each number of labels.
train_data['Class Length'].value_counts()

In [None]:
# Checking distribution of the number of labels for each piece of text
temp_df = train_data['Class Length'].value_counts().sort_index()
# Take the x positions from the label counts that actually occur. Building
# the axis from range(1, max + 1) produces more x values than bars whenever
# some intermediate label count never appears, which makes the plot fail.
sns.barplot(
    x=temp_df.index.tolist(),
    y=temp_df.tolist(),
    palette=sns.color_palette("dark", 10),
)
plt.title("Number of Classes")
plt.xlabel("Class Numbers")
plt.ylabel("Count")
plt.show()

In [None]:
# Map each label id (the line number in emotions.txt, as a string) to the
# emotion name on that line.
with open('../input/goemotions/data/emotions.txt', 'r') as emotions_file:
    emotions_map = {
        str(line_no): line.strip()
        for line_no, line in enumerate(emotions_file)
    }

In [None]:
# Display the label-id -> emotion-name mapping.
emotions_map

In [None]:
# Mapping the classes into one hot vectors with multi-label:
# one 0/1 column per emotion, set to 1 when the row's label list has that id.
for label_id, emotion_name in emotions_map.items():
    train_data[emotion_name] = train_data["Class List"].apply(
        lambda row_ids: int(label_id in row_ids)
    )

In [None]:
# The raw label string and its helper columns are no longer needed once the
# one-hot columns exist.
train_data.drop(columns=['Class', 'Class List', 'Class Length'], inplace=True)

In [None]:
# Preview the frame after the column changes made above.
train_data.head()

In [None]:
# Checking Distribution of Each Class (Multi-Label included)
emotion_list = [*emotions_map.values()]
# Column-wise sum over everything except Text/ID gives the number of texts
# tagged with each emotion.
label_totals = train_data.drop(columns=['Text', 'ID']).sum(axis=0)
temp_list = label_totals.tolist()

plt.figure(figsize=(20, 10))
sns.barplot(x=emotion_list, y=temp_list, palette=sns.color_palette("dark"))
plt.xticks(rotation="vertical")
plt.title("Distribution of Every Class")
plt.ylabel("Number of Texts")
plt.show()

In [None]:
# Row-wise sum over everything except Text/ID = number of labels per example.
labels_per_example = train_data.drop(columns=['Text', 'ID']).sum(axis=1)
n_examples = len(train_data)

print("Distribution of number of labels per example:")
print(labels_per_example.value_counts() / n_examples)

# Fraction of examples that carry more than three labels.
share_over_three = (labels_per_example > 3).sum() / n_examples
print("%.2f with more than 3 labels" % share_over_three)

In [None]:
# List the column names of the training frame.
train_data.columns

In [None]:
column = train_data.columns
# Everything after the first two columns is treated as a label column.
emotions_with_nuetral = column[2:]
# Same labels minus the final column (presumably 'neutral' — verify against
# the order of emotions.txt).
emotions = emotions_with_nuetral[:-1]

# Sentiment grouping of the 27 non-neutral emotions, following the GoEmotions
# taxonomy (12 positive, 11 negative, 4 ambiguous). The groups are disjoint.
positive_col = [
    'admiration', 'amusement', 'approval', 'caring', 'desire', 'excitement',
    'gratitude', 'joy', 'love', 'optimism', 'pride', 'relief',
]
negative_col = [
    'anger', 'annoyance', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'fear', 'grief', 'nervousness', 'remorse', 'sadness',
]
ambiguous_col = ['confusion', 'curiosity', 'realization', 'surprise']
# 'neutral' belongs to no sentiment group and is kept on its own.
neutral_col = ['neutral']

In [None]:
print("Label distributions:")
# Percentage of examples carrying each label, most frequent first.
label_counts = train_data[emotions_with_nuetral].sum(axis=0).sort_values(ascending=False)
label_percent = label_counts / len(train_data) * 100
print(label_percent.round(2))

# EDA from GoEmotions Paper

In [None]:
# Lookup table: one row per label with the sentiment group it belongs to.
df_emotion = pd.DataFrame({'emotion': list(emotions_with_nuetral)})
# Labels that are in none of the three sentiment lists keep 'standalone'.
df_emotion['group'] = 'standalone'
# Assign through a single .loc[rows, column] call. The chained form
# df['group'].loc[rows] = ... may write to a temporary object and silently
# leave the frame unchanged (always so under pandas Copy-on-Write).
for group_name, members in (
    ('positive', positive_col),
    ('negative', negative_col),
    ('ambiguous', ambiguous_col),
):
    df_emotion.loc[df_emotion['emotion'].isin(members), 'group'] = group_name
df_emotion.head()

In [None]:
# Total number of examples per emotion, as a two-column frame (emotion, n).
emotion_totals = train_data[emotions].sum(axis=0).reset_index()
emotion_totals = emotion_totals.rename(columns={'index': 'emotion', 0: 'n'})

# Attach each emotion's sentiment group and order from most to least frequent.
temp = (
    emotion_totals
    .merge(df_emotion, how='left', on='emotion')
    .sort_values('n', ascending=False)
)
temp = temp.loc[temp['emotion'] != 'neutral']

fig, ax = plt.subplots(figsize=(7, 7))
ax.tick_params(axis='x', rotation=90)
# Bar colour per sentiment group.
palette = {"positive": "skyblue", "negative": "red", "ambiguous": 'gray'}
sns.barplot(
    data=temp,
    x='n',
    y='emotion',
    hue='group',
    dodge=False,
    palette=palette,
    ax=ax,
)
plt.show()

In [None]:
# Total label count per sentiment group. Only the numeric 'n' column is
# aggregated: a blanket .agg('sum') would also try to "sum" the string-valued
# 'emotion' column, which is meaningless here and version-dependent in pandas.
temp = temp.groupby('group', as_index=False)['n'].sum()

sns.barplot(data=temp, x='group', y='n')
# Render explicitly, like the other plotting cells in this notebook.
plt.show()

In [None]:
print("Plotting label correlations...")
# Mean label vector per ID, then pairwise correlation between label columns.
ratings = train_data.groupby("ID")[emotions_with_nuetral].mean()
corr = ratings.corr()

# Boolean mask that hides the upper triangle, diagonal included, so each
# pair of labels is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, _ = plt.subplots(figsize=(11, 9))

# Diverging colormap centred on zero correlation.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(corr, **heatmap_options)
plt.show()

In [None]:
print("Plotting hierarchical relations...")
# Hierarchical clustering of the label columns: pairwise correlation distance
# between columns of `ratings`, merged with Ward linkage.
z = linkage(
      pdist(ratings.T, metric="correlation"),
      method="ward",
      optimal_ordering=True)
fig = plt.figure(figsize=(11, 4), dpi=400)
plt.xlabel("")
plt.ylabel("")
dendrogram(
      z,
      # Pass a plain list rather than the pandas Index: some SciPy releases
      # evaluate the truthiness of `labels`, which raises for array-likes.
      labels=list(ratings.columns),
      leaf_rotation=90.,  # rotates the x axis labels
      leaf_font_size=12,  # font size for the x axis labels
      color_threshold=1.05,
  )
plt.show()

In [None]:
# One colour per sentiment group for the annotation strip.
sent_color_map = {"positive": "#BEECAF", "negative": "#94bff5", "ambiguous": "#FFFC9E"}

# Colour for every entry of `emotions`: anything that is neither in the
# positive nor in the negative list is coloured as ambiguous.
sent_colors = {}
for emotion in emotions:
    if emotion in positive_col:
        sentiment = "positive"
    elif emotion in negative_col:
        sentiment = "negative"
    else:
        sentiment = "ambiguous"
    sent_colors[emotion] = sent_color_map[sentiment]

# Hide only the diagonal (each label's correlation with itself).
mask = np.eye(len(corr), dtype=bool)

# Diverging colormap centred on zero correlation.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Colour per correlation-matrix column; labels absent from sent_colors map
# to NaN.
row_colors = pd.Series(corr.columns, index=corr.columns, name="sentiment")
row_colors = row_colors.map(sent_colors)

colorbar_options = {
    "ticks": [-.3, -.15, 0, .15, .3],
    "use_gridspec": False,
    "orientation": "horizontal",
}

# Clustered heatmap that reuses the precomputed linkage `z` on both axes.
g = sns.clustermap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    vmin=-0.3,
    center=0,
    row_linkage=z,
    col_linkage=z,
    col_colors=row_colors,
    linewidths=.1,
    cbar_kws=colorbar_options,
    figsize=(10, 10),
)

# Keep only the column dendrogram and reposition the colour bar.
g.ax_row_dendrogram.set_visible(False)
g.cax.set_position([.34, -0.05, .5, .03])

# WordCloud with Ekman Taxonomy

In [None]:
# Load the emotion/word table that the word clouds further down are built from.
emotions_data = pd.read_csv("../input/goemotions/tables/emotion_words.csv")

In [None]:
# Preview the first rows of the table.
emotions_data.head()

In [None]:
# Number of rows per emotion label.
emotions_data["emotion"].value_counts()

In [None]:
# Read the Ekman mapping (category name -> list of emotion labels, judging by
# how it is indexed later in this notebook).
with open('../input/goemotions/data/ekman_mapping.json') as mapping_file:
    ekman_mapping = json.load(mapping_file)

In [None]:
# Display the loaded Ekman mapping.
ekman_mapping

In [None]:
# Replace the label in the first column of `emotions_data` with its Ekman
# category. Labels that belong to none of the six categories become "neutral".
ekman_order = ("anger", "disgust", "fear", "joy", "sadness", "surprise")

# Reverse lookup: fine-grained label -> Ekman category.
fine_to_ekman = {}
for ekman_label in ekman_order:
    for fine_label in ekman_mapping[ekman_label]:
        # setdefault keeps the first category that claims a label, so
        # precedence follows ekman_order.
        fine_to_ekman.setdefault(fine_label, ekman_label)

# One vectorised lookup instead of a per-row .iloc read/write loop; labels
# missing from the lookup come back as NaN and are filled with "neutral".
first_col = emotions_data.columns[0]
emotions_data[first_col] = emotions_data[first_col].map(fine_to_ekman).fillna("neutral")

In [None]:
# Preview the table again after the first column was rewritten above.
emotions_data.head()

In [None]:
# Number of rows per emotion label after the rewrite.
emotions_data["emotion"].value_counts()

In [None]:
# For every emotion label build {word: odds}, the frequency mapping fed to
# the word cloud. A single groupby replaces filtering the frame twice per
# label; sort=False keeps labels in order of first appearance.
wc_dict = {
    label: dict(zip(rows['word'].tolist(), rows['odds'].tolist()))
    for label, rows in emotions_data.groupby('emotion', sort=False)
}

In [None]:
# List the emotion labels that have a word/odds mapping.
wc_dict.keys()

In [None]:
def plot_cloud(wordcloud, figsize=(30, 20)):
    """Draw a word cloud on a new matplotlib figure with the axes hidden.

    Args:
        wordcloud: Image-like object accepted by ``plt.imshow``; in this
            notebook, the object returned by
            ``WordCloud.generate_from_frequencies``.
        figsize: ``(width, height)`` passed to ``plt.figure``. Defaults to
            the size this notebook has always used.
    """
    plt.figure(figsize=figsize)
    plt.imshow(wordcloud)
    plt.axis("off")

In [None]:
# Single WordCloud renderer reused by every word-cloud cell below.
wordcloud = WordCloud(
    width=1000,
    height=500,
    random_state=42,
    background_color='black',
    colormap='Set2',
    collocations=False,
)

In [None]:
# Plotting WordCloud for Class ['joy', 'anger', 'surprise', 'sadness', 'disgust', 'fear', 'neutral'] respectively
plot_cloud(wordcloud.generate_from_frequencies(wc_dict['joy']))

In [None]:
plot_cloud(wordcloud.generate_from_frequencies(wc_dict['anger']))

In [None]:
plot_cloud(wordcloud.generate_from_frequencies(wc_dict['surprise']))

In [None]:
plot_cloud(wordcloud.generate_from_frequencies(wc_dict['sadness']))

In [None]:
plot_cloud(wordcloud.generate_from_frequencies(wc_dict['disgust']))

In [None]:
plot_cloud(wordcloud.generate_from_frequencies(wc_dict['fear']))

In [None]:
plot_cloud(wordcloud.generate_from_frequencies(wc_dict['neutral']))