In [99]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.io as pio

In [None]:
# Feature checklist (column name -> analysis status)
# 'text'
# 'upos'                  -> done (barplot)
# 'pos_fw_emo'
# 'count'                 -> done (boxplot)
# 'emotion_associations'  -> done (barplot)
# 'sentiment_score'       -> done
#     hateful:
#         count    4706.000000
#         mean        0.338978
#     non_hateful:
#         count    7834.000000
#         mean        0.437697
# 'intent'                -> done (barplot)

In [91]:
# Load every split of the feature-augmented dataset and stack them into one frame.
datafolder = '../../data/'

# One CSV path per split (names kept so later cells can refer to them).
train, test, dev, test_unseen, dev_unseen = (
    datafolder + split + '_with_features.csv'
    for split in ('train', 'test', 'dev', 'test_unseen', 'dev_unseen')
)

# Read the splits; blank lines are kept (skip_blank_lines=False) so they show up as rows.
df_train, df_dev, df_dev_unseen, df_test, df_test_unseen = (
    pd.read_csv(path, skip_blank_lines=False)
    for path in (train, dev, dev_unseen, test, test_unseen)
)

# Concatenate row-wise with a fresh 0..n-1 index, then replace missing values by ''.
df = pd.concat(
    [df_train, df_dev, df_test, df_dev_unseen, df_test_unseen],
    ignore_index=True,
    axis=0,
).fillna('')

In [115]:
# Preview the pos_fw_emo column (pandas prints a truncated head/tail of the Series)
df.pos_fw_emo

0                  its their NOUN not their NOUN that VERB
1        do not be afraid to love ADV everyone be not l...
2                                    VERB NOUN on your pet
3        I love everything and everybody PUNCT except f...
4        everybody love chocolate NOUN NOUN PUNCT ADV NOUN
                               ...                        
12535                                   fight for ADJ NOUN
12536    that feeling when you VERB your homework in NO...
12537                  the NOUN that VERB PROPN PROPN NOUN
12538                 one of the ADJ NOUN of the atom bomb
12539    NOUN VERB like a ADJ NOUN where I ADV VERB NOU...
Name: pos_fw_emo, Length: 12540, dtype: object

In [113]:
def count_features(df, feature):
    """Count feature tokens separately for hateful and non-hateful rows.

    Each non-empty string in ``df[feature]`` is split on single spaces and every
    resulting token is tallied in the dictionary matching the row's label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column (0 = non-hateful, 1 = hateful) and the
        column named by ``feature``.
    feature : str
        Name of the column holding space-separated tokens.

    Returns
    -------
    tuple of (dict, dict)
        ``(hateful_feat, nonhateful_feat)``, each mapping token -> number of
        occurrences, with keys in order of first appearance.

    Notes
    -----
    Rows whose feature value is empty or not a string (e.g. NaN) are skipped.
    Rows whose label is neither 0 nor 1 (e.g. unlabeled rows that became ``''``
    after ``fillna('')``) are skipped as well instead of being counted as hateful.
    """
    hateful_feat = Counter()
    nonhateful_feat = Counter()
    # Walk only the two columns we need; iterrows() would build a Series per row.
    for label, value in zip(df['label'], df[feature]):
        if not isinstance(value, str) or not value:
            continue
        if label == 0:
            nonhateful_feat.update(value.split(' '))
        elif label == 1:
            hateful_feat.update(value.split(' '))
    # Return plain dicts, as before.
    return dict(hateful_feat), dict(nonhateful_feat)

######################################################################################
def plot_difference(hateful, nonhateful, feature_name):
    """Plot a grouped bar chart of feature counts for hateful vs. non-hateful memes.

    The figure is written to ``plots/<feature_name>_comparison.html`` (opened in
    the browser because of ``auto_open=True``) and also rendered with ``fig.show()``.

    Parameters
    ----------
    hateful : dict
        Feature value -> count for hateful memes (red bars).
    nonhateful : dict
        Feature value -> count for non-hateful memes (blue bars).
    feature_name : str
        Display name of the feature; used in the title, the legend entries,
        the x-axis label and the output file name.
    """
    fig = go.Figure()

    fig.add_trace(
        go.Bar(
            x=list(hateful.keys()),
            y=list(hateful.values()),
            name='Hateful ' + feature_name,
            marker_color='red'
        )
    )

    fig.add_trace(
        go.Bar(
            x=list(nonhateful.keys()),
            y=list(nonhateful.values()),
            name='Non-Hateful ' + feature_name,
            marker_color='blue'
        )
    )

    fig.update_layout(
        title={
            'text': 'Comparison of Hateful vs Non-Hateful ' + feature_name,
            'font': {'size': 24}
        },
        barmode='group',
        bargap=0.15,
        bargroupgap=0.1,
        yaxis_title={
            'text': 'Count',
            'font': {'size': 18}
        },
        xaxis_title={
            'text': feature_name,
            'font': {'size': 18}
        },
        legend={
            'title': {'text': 'Meme Type', 'font': {'size': 18}},
            'font': {'size': 16}
        },
        xaxis={
            'tickfont': {'size': 20}
        },
        yaxis={
            'tickfont': {'size': 20},
        },
    )

    # write_html does not create missing directories, so make sure plots/ exists.
    os.makedirs('plots', exist_ok=True)
    pio.write_html(fig, file='plots/'+feature_name+'_comparison.html', auto_open=True)
    fig.show()
    
######################################################################################

def plot_num_difference(df, feat_name, title):
    """Plot side-by-side box plots of a numeric feature for hateful vs. non-hateful memes.

    The figure is written to ``plots/<feat_name>_comparison.html`` (opened in the
    browser because of ``auto_open=True``) and also rendered with ``fig.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column (1 = hateful, 0 = non-hateful) and the
        column named by ``feat_name``.
    feat_name : str
        Column whose values are plotted; also used in the output file name.
    title : str
        Display name of the feature; used in the figure title and as y-axis label.
    """
    hateful_counts = df[df['label'] == 1][feat_name].values
    non_hateful_counts = df[df['label'] == 0][feat_name].values

    fig = go.Figure()

    fig.add_trace(
        go.Box(
            y=hateful_counts,
            name='Hateful Memes',
            marker_color='red'
        )
    )

    fig.add_trace(
        go.Box(
            y=non_hateful_counts,
            name='Non-Hateful Memes',
            marker_color='blue'
        )
    )

    fig.update_layout(
        title={
            'text': 'Comparison of Hateful vs Non-Hateful Memes by ' + title,
            'font': {'size': 24}
        },
        yaxis_title={
            'text': title,
            'font': {'size': 18}
        },
        xaxis_title={
            'text': '',
            'font': {'size': 18}
        },
        legend={
            'title': {'text': 'Meme Type', 'font': {'size': 18}},
            'font': {'size': 16}
        },
        xaxis={
            'tickfont': {'size': 20}
        },
        yaxis={
            'tickfont': {'size': 20},
        },
        boxmode='group',
        boxgap=0.3,
        boxgroupgap=0.2
    )

    # write_html does not create missing directories, so make sure plots/ exists.
    os.makedirs('plots', exist_ok=True)
    pio.write_html(fig, file='plots/'+feat_name+'_comparison.html', auto_open=True)
    fig.show()

In [95]:
# hateful_ea, nonhateful_ea = count_features(df, 'emotion_associations')
# plot_difference(hateful_ea, nonhateful_ea, 'Emotion Association')
#####################################################################
# hateful_upos, nonhateful_upos = count_features(df, 'upos')
# plot_difference(hateful_upos, nonhateful_upos, 'UPOS')


In [109]:
# Sentiment scores split by label (1 -> hateful, 0 -> non-hateful)
hateful_counts = df.loc[df['label'] == 1, 'sentiment_score']
non_hateful_counts = df.loc[df['label'] == 0, 'sentiment_score']

In [111]:
# Summary statistics of sentiment_score for the label == 1 rows
hateful_counts.describe()

count    4706.000000
mean        0.338978
std         0.462537
min         0.000493
25%         0.002336
50%         0.007852
75%         0.988692
max         0.998919
Name: sentiment_score, dtype: float64

In [112]:
# Summary statistics of sentiment_score for the label == 0 rows
non_hateful_counts.describe()

count    7834.000000
mean        0.437697
std         0.487217
min         0.000489
25%         0.002393
50%         0.012977
75%         0.995584
max         0.998919
Name: sentiment_score, dtype: float64

In [114]:
# plot_num_difference(df, 'count', 'Emotion Words Count')
# plot_num_difference(df, 'sentiment_score', 'Sentiment Score')

In [98]:
# Remove stopwords from the intent column.
# NOTE: this overwrites df['intent'] in place; re-running the cell is harmless
# because filtering already-filtered text changes nothing.
INTENT_STOPWORDS = {'to', 'get', 'with', 'be', 'a', 'it', 'the', 'of', 'part', 'have'}
df['intent'] = df['intent'].apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in INTENT_STOPWORDS)
)

# Per label: join all intents, count the words and keep the 10 most frequent
not_hateful = Counter(df[df['label'] == 0]['intent'].str.cat(sep=' ').split()).most_common(10)
hateful = Counter(df[df['label'] == 1]['intent'].str.cat(sep=' ').split()).most_common(10)

# One bar trace per group (word on x, frequency on y)
not_hateful_trace = go.Bar(x=[i[0] for i in not_hateful], y=[i[1] for i in not_hateful], name='Not Hateful', marker=dict(color='blue', opacity=0.5))
hateful_trace = go.Bar(x=[i[0] for i in hateful], y=[i[1] for i in hateful], name='Hateful', marker=dict(color='red', opacity=0.5))

# Single layout definition (previously the titles were set twice and overwritten)
layout = go.Layout(
    title={
        'text': 'Top 10 Most Common Intent Words in Memes',
        'font': {'size': 24}
    },
    xaxis={
        'title': {'text': '', 'font': {'size': 18}},
        'tickfont': {'size': 20}
    },
    yaxis={
        'title': {'text': 'count', 'font': {'size': 20}},
        'tickfont': {'size': 20}
    },
    legend={
        'title': {'text': 'Meme Type', 'font': {'size': 18}},
        'font': {'size': 16}
    },
    # Bar traces are grouped via barmode; boxmode only affects box plots.
    barmode='group',
)
fig = go.Figure(data=[not_hateful_trace, hateful_trace], layout=layout)

# write_html does not create missing directories, so make sure plots/ exists.
os.makedirs('plots', exist_ok=True)
pio.write_html(fig, file='plots/intent_comparison.html', auto_open=True)



In [118]:
# Comparing the length of the text:
# - number of letters
# - number of words

In [119]:
# Character-length distribution of the meme text, one histogram per label
not_hateful_trace = go.Histogram(
    x=df.loc[df['label'] == 0, 'text'].str.len(),
    name='Not Hateful',
    marker={'color': 'blue'},
)
hateful_trace = go.Histogram(
    x=df.loc[df['label'] == 1, 'text'].str.len(),
    name='Hateful',
    marker={'color': 'red'},
)

# Titles for the figure and both axes
layout = go.Layout(
    title='Comparison of Text Length (letter-wise) between Hateful and Not Hateful Memes',
    xaxis={'title': 'Text Length'},
    yaxis={'title': 'Frequency'},
)
fig = go.Figure(data=[not_hateful_trace, hateful_trace], layout=layout)

# Save as standalone HTML (opened in the browser) and render inline
pio.write_html(fig, file='plots/text_len_comparison.html', auto_open=True)
fig.show()

In [126]:
# Word-count distribution of the meme text, one histogram per label.
# split() without arguments splits on any whitespace run, so repeated spaces do
# not create empty "words" and empty text counts as 0 words (same as the box plot).
not_hateful_trace = go.Histogram(x=df[df['label'] == 0]['text'].str.split().str.len(), name='Not Hateful', marker=dict(color='blue'))
hateful_trace = go.Histogram(x=df[df['label'] == 1]['text'].str.split().str.len(), name='Hateful', marker=dict(color='red'))

# Single layout definition for title, axes and legend
layout = go.Layout(
    title={
        'text': 'Comparison of Text Length (word-wise) between Hateful and Not Hateful Memes',
        'font': {'size': 24}
    },
    xaxis={
        'title': {'text': '', 'font': {'size': 18}},
        'tickfont': {'size': 20}
    },
    yaxis={
        'title': {'text': 'count', 'font': {'size': 20}},
        'tickfont': {'size': 20}
    },
    legend={
        'title': {'text': 'Meme Type', 'font': {'size': 18}},
        'font': {'size': 16}
    },
    # Histogram bars are grouped via barmode; boxmode only affects box plots.
    barmode='group',
)
fig = go.Figure(data=[not_hateful_trace, hateful_trace], layout=layout)

# Save under its own file name so the letter-wise histogram is not overwritten.
os.makedirs('plots', exist_ok=True)
pio.write_html(fig, file='plots/text_len_words_comparison.html', auto_open=True)
fig.show()

In [127]:
# Box plot of the number of whitespace-separated words per meme text, one box per label
not_hateful_trace = go.Box(y=df[df['label'] == 0]['text'].str.split().apply(len), name='Not Hateful', marker=dict(color='blue'), boxpoints='outliers', boxmean=True)
hateful_trace = go.Box(y=df[df['label'] == 1]['text'].str.split().apply(len), name='Hateful', marker=dict(color='red'), boxpoints='outliers', boxmean=True)

# Single layout definition. The word counts are on the y-axis, so that axis is
# labelled 'Number of Words'; the x-axis only carries the two box names.
layout = go.Layout(
    title={
        'text': 'Comparison of Text Length (word-wise) between Hateful and Not Hateful Memes',
        'font': {'size': 24}
    },
    xaxis={
        'title': {'text': '', 'font': {'size': 18}},
        'tickfont': {'size': 20}
    },
    yaxis={
        'title': {'text': 'Number of Words', 'font': {'size': 20}},
        'tickfont': {'size': 20}
    },
    legend={
        'title': {'text': 'Meme Type', 'font': {'size': 18}},
        'font': {'size': 16}
    },
    boxmode='group',
)
fig = go.Figure(data=[not_hateful_trace, hateful_trace], layout=layout)

# Save under its own file name so the text-length histograms are not overwritten.
os.makedirs('plots', exist_ok=True)
pio.write_html(fig, file='plots/text_len_words_boxplot.html', auto_open=True)
fig.show()