# EDA
Includes code from Purvi, Eric, André, Michael. Merge done by Michael, reviewed by group.

@all:  
I use a max code line length of 79, as suggested by PEP-8, as this is useful when you want to compare code side by side (https://github.com/python/peps/blob/main/peps/pep-0008.rst). You can set up VS Code to show a ruler/marker (or several of them) to help keep line length in check: https://levelup.gitconnected.com/do-you-know-about-rulers-in-visual-studio-code-f754b221a135. Nice to have, not a must. I'm open to any discussion about this.

## Setup

In [None]:
# import the usual suspects
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn
from sklearn.feature_extraction.text import CountVectorizer

# nltk (Natural Language Toolkit, https://www.nltk.org)
import nltk
from nltk import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.probability import FreqDist

# fetch the nltk resources used further down: tokenizer models ('punkt'),
# the WordNet database and the stopword lists
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# other imports
from wordcloud import WordCloud # https://pypi.org/project/wordcloud/
from collections import Counter
import string
import re
import missingno as msno

# @André and all:
# I leave warnings switched on. They can be annoying, yes, but I think
# they are usually thrown for a good reason and it's probably always
# better to write code that doesn't produce them in order to avoid
# problems further down the road.
#import warnings
#warnings.filterwarnings("ignore")

# the data has quite a lot of columns, so raise the number of displayed
# df columns (pandas default is 20)
pd.set_option('display.max_columns', 100)

## Load data

In [None]:
# merged_data.csv holds only the complete rows (~ 450,000 of 2M in total).
# The target is the 'toxic' column (1 if 'toxicity' >= 0.5).
df = pd.read_csv(filepath_or_buffer='data/merged_data.csv')

## Define column groups

In [None]:

# core columns: identifier, raw text, dataset split and toxicity score
main_cols = ['id', 'comment_text', 'split', 'toxicity']

# toxicity subtype scores
subtype_cols = [
    'severe_toxicity', 'obscene', 'insult',
    'threat', 'identity_attack', 'sexual_explicit',
]

# metadata delivered with each comment
metadata_cols = [
    'created_date', 'publication_id', 'parent_id', 'article_id',
    'rating', 'funny', 'wow', 'sad', 'likes', 'disagree',
    'identity_annotator_count', 'toxicity_annotator_count',
]

# all identity columns
identity_cols = [
    'male', 'female', 'transgender', 'other_gender',
    'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
    'other_sexual_orientation',
    'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist',
    'other_religion',
    'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity',
    'physical_disability', 'intellectual_or_learning_disability',
    'psychiatric_or_mental_illness', 'other_disability',
]

# identity columns split into thematic groups
# (gender_cols also holds the sexual orientation columns)
gender_cols = [
    'male', 'female', 'bisexual', 'transgender', 'heterosexual',
    'other_gender', 'homosexual_gay_or_lesbian',
    'other_sexual_orientation',
]

race_cols = [
    'asian', 'black', 'white', 'latino', 'other_race_or_ethnicity',
]

religion_cols = [
    'hindu', 'buddhist', 'christian', 'muslim', 'jewish', 'atheist',
    'other_religion',
]

## First overview

In [None]:
# print a concise summary of the DataFrame
df.info()

In [None]:
# show the first rows of the DataFrame
df.head()

In [None]:
# count the missing values over all columns and rows
nan_total = df.isna().sum().sum()
print('Total # of NaNs in dataset:', nan_total)

In [None]:
# check distribution of NaNs using Missingno: first a bar chart, then a
# matrix plot of the same DataFrame (trailing semicolon suppresses the
# text output of the last call)
msno.bar(df)
msno.matrix(df);

In [None]:
# Check the string columns other than comment_text: list the distinct
# values found in each of them.
for text_col in ('rating', 'split'):
    print(f'Column "{text_col}":', df[text_col].unique())

## Visualisations (Michael)

### Toxic vs non-toxic comments (pie plot)

In [None]:
# value_counts() orders by frequency, not by class. Sorting by the class
# label (0, then 1) guarantees that the fixed wedge labels below are
# attached to the right class even if the class balance changes.
toxic_count = df['toxic'].value_counts().sort_index()
toxic_count.plot.pie(labels=['Non-toxic', 'Toxic'], autopct='%.1f%%')
plt.title('Comments')
plt.ylabel(None);  # remove the default y label

--> __Dataset is strongly imbalanced!__

### Distribution of toxicity (histogram)

In [None]:
# histogram of the toxicity score with 30 bins
ax = df['toxicity'].hist(bins=30)
ax.set_title('Distribution of toxicity')
ax.set_xlabel('Toxicity')
ax.set_ylabel('Frequency');

### Histograms for all numerical columns

In [None]:
# one 30-bin histogram per column, drawn on a large 30 x 30 inch figure
df.hist(bins=30, figsize=(30, 30));

### Calculate percentage for each identity

In [None]:
# Derive one binary flag column per identity: 'is_<identity>' is 1 where
# the identity score is >= 0.5 and 0 otherwise.
bin_identity_cols = [f'is_{identity}' for identity in identity_cols]
for score_col, flag_col in zip(identity_cols, bin_identity_cols):
    df[flag_col] = df[score_col].ge(0.5).astype(int)

print('# of identity columns:', len(identity_cols))
print('# of new binary identity columns:', len(bin_identity_cols))

In [None]:
# Calculate the percentage of comments flagged for each identity.
# The flag columns only hold 0/1, so their mean is the share of flagged
# rows. Unlike a groupby followed by a lookup of group 1, this also works
# for an identity without any flagged row (result: 0.0, no KeyError).
id_percentages = {
    identity: round(df[identity].mean() * 100, 3)
    for identity in bin_identity_cols
}

print(id_percentages)

In [None]:
# turn the dict into a Series, smallest share first
id_perc_ser = pd.Series(id_percentages).sort_values()

# strip 'is_' from the index labels
id_perc_ser.index = [label.replace('is_', '')
                     for label in id_perc_ser.index]

# horizontal bar plot
ax = id_perc_ser.plot.barh()
ax.set_title('Identity percentages')
ax.set_xlabel('Share of total observations (percent)')
ax.set_ylabel('Identity');

## Visualisations (Eric)

### WordCloud (toxic comments)

In [None]:
# dataframe with only text and target
df_only_text = df[['comment_text', 'toxic']]

toxic_comments = df_only_text[df_only_text['toxic'] == 1]['comment_text']
nontoxic_comments = df_only_text[df_only_text['toxic'] == 0]['comment_text']

# Join the comments with a space: an empty separator would glue the last
# word of one comment to the first word of the next and distort the
# word counts behind the cloud.
wordcloud = WordCloud(width=800, height=400, background_color='white')\
    .generate(' '.join(toxic_comments))

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off'); # semicolon suppresses ugly line of code over plot

### WordCloud (non-toxic comments)

In [None]:
# Join the comments with a space: an empty separator would glue the last
# word of one comment to the first word of the next and distort the
# word counts behind the cloud.
wordcloud_non_toxic = WordCloud(width=800, height=400,
    background_color='white')\
    .generate(' '.join(nontoxic_comments))

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_non_toxic, interpolation='bilinear')
plt.axis('off');

### Word frequency analysis

#### Tokenize comments

In [None]:
# split every comment into a list of tokens
toxic_tokens = toxic_comments.apply(word_tokenize)
nontoxic_tokens = nontoxic_comments.apply(word_tokenize)

# Show the first toxic comment. Positional access is required here: the
# series is a filtered subset, so index label 0 only exists if the very
# first row of the dataset happens to be toxic.
toxic_tokens.iloc[0]

#### Remove punctuation

Filter out non-alphabetical tokens.

In [None]:
# keep only purely alphabetical tokens (drops punctuation and numbers)
toxic_tokens = toxic_tokens.apply(
    lambda x: [word for word in x if word.isalpha()])
nontoxic_tokens = nontoxic_tokens.apply(
    lambda x: [word for word in x if word.isalpha()])

# first toxic comment by position (index label 0 is not guaranteed to
# exist in this filtered series)
toxic_tokens.iloc[0]

#### Convert to lower case
To ensure uniformity we have to convert all words to lower case.

In [None]:
# lower-case every token
toxic_tokens = toxic_tokens.apply(lambda x: [word.lower() for word in x])
nontoxic_tokens = nontoxic_tokens.apply(lambda x: [word.lower() for word in x])

# first toxic comment by position (index label 0 is not guaranteed to
# exist in this filtered series)
toxic_tokens.iloc[0]

#### Remove stopwords
This will remove common words like: this, is, and, the, etc.

In [None]:
# set of English stopwords that we want to remove from our comments
# (a set makes the membership test in the filter below fast)
stop_words = set(stopwords.words('english'))

toxic_tokens = toxic_tokens.apply(
    lambda x: [word for word in x if word not in stop_words])
nontoxic_tokens = nontoxic_tokens.apply(
    lambda x: [word for word in x if word not in stop_words])

# first toxic comment by position (index label 0 is not guaranteed to
# exist in this filtered series)
toxic_tokens.iloc[0]

#### Lemmatization
Lemmatization reduces words to their base or dictionary form. It's usually more sophisticated than stemming.

@Eric:  
Does it make sense to do both lemmatization and stemming as you did below? Aren't these mutually exclusive? Just a thought, not sure ...

In [None]:
lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Return a new list with every token lemmatized as a verb.

    The order and the number of tokens are unchanged.
    """
    return [lemmatizer.lemmatize(token, pos=wordnet.VERB)
            for token in tokens]

# lemmatize the tokens of every comment
toxic_tokens = toxic_tokens.apply(lemmatize_tokens)
nontoxic_tokens = nontoxic_tokens.apply(lemmatize_tokens)

# first toxic comment by position (index label 0 is not guaranteed to
# exist in this filtered series)
toxic_tokens.iloc[0]

#### Stemming
Stemming reduces words to their word stem or root form.

In [None]:
stemmer = PorterStemmer()

# reduce every token to its stem
toxic_tokens = toxic_tokens.apply(
    lambda x: [stemmer.stem(word) for word in x])
nontoxic_tokens = nontoxic_tokens.apply(
    lambda x: [stemmer.stem(word) for word in x])

# first toxic comment by position (index label 0 is not guaranteed to
# exist in this filtered series)
toxic_tokens.iloc[0]

#### Create flat token list

In [None]:
# concatenate the per-comment token lists of the toxic comments into one
# flat list, keeping the original order
flat_token_list = [token
                   for comment_tokens in toxic_tokens
                   for token in comment_tokens]

len(flat_token_list)

### Token frequency (line plot)

In [None]:
# frequency distribution over all tokens of the toxic comments
fdist = FreqDist(flat_token_list)

# non-cumulative frequency plot; 50 is the number of samples passed to
# FreqDist.plot
plt.figure(figsize=(15, 10))
fdist.plot(50, cumulative=False);

### Token frequency top 50 (bar plot with highlighted bars)

In [None]:
# number of tokens to show; the plot title is derived from it so the two
# cannot drift apart (the title used to say 20 while 50 were plotted)
top_n = 50

word_counts = pd.DataFrame(flat_token_list, columns=['word'])['word']\
    .value_counts()\
    .head(top_n)

# words to highlight (a set, for fast membership tests)
highlight = {'black', 'women', 'man', 'trump', 'white', 'muslim',
             'men', 'gay', 'christian'}

# one color per bar: red for highlighted words, blue for the rest
colors = ['red' if word in highlight else 'blue'
          for word in word_counts.index]

# plot bar chart
plt.figure(figsize=(15, 10))
word_counts.plot(kind='bar', color=colors)
plt.title(f'Top {top_n} Most Frequent Tokens with Highlights')
plt.ylabel('Frequency')
plt.xlabel('Tokens')
plt.xticks(rotation=90);

### N-gram analysis

#### Create n-grams

In [None]:
# Count bi-grams (2-grams) and tri-grams (3-grams) comment by comment.
# Building them from the flat token list would also create n-grams that
# span the end of one comment and the start of the next, i.e. word
# sequences that nobody actually wrote.
bi_gram_counts = Counter()
tri_gram_counts = Counter()
for comment_tokens in toxic_tokens:
    bi_gram_counts.update(ngrams(comment_tokens, 2))
    tri_gram_counts.update(ngrams(comment_tokens, 3))

#### Print top 10 n-grams

In [None]:
# Print the 10 most common bi-grams, then the 10 most common tri-grams
ngram_reports = (
    ("Most Common Bi-grams:", bi_gram_counts),
    ("\nMost Common Tri-grams:", tri_gram_counts),
)
for heading, ngram_counter in ngram_reports:
    print(heading)
    for gram, count in ngram_counter.most_common(10):
        print(f"{gram}: {count}")

#### Plot top 10 n-grams

In [None]:
# Put the 10 most common bi-grams and tri-grams into DataFrames
df_bi_grams = pd.DataFrame(bi_gram_counts.most_common(10),
                           columns=['bi_gram', 'count'])
df_tri_grams = pd.DataFrame(tri_gram_counts.most_common(10),
                            columns=['tri_gram', 'count'])

# One bar chart per n-gram size, each on its own figure
ngram_charts = (
    (df_bi_grams, 'bi_gram', 'Top 10 Most Common Bi-grams'),
    (df_tri_grams, 'tri_gram', 'Top 10 Most Common Tri-grams'),
)
for ngram_frame, gram_col, chart_title in ngram_charts:
    plt.figure(figsize=(10, 5))
    plt.bar(ngram_frame[gram_col].astype(str), ngram_frame['count'])
    plt.title(chart_title)
    plt.xticks(rotation=90)


## Visualisations (Purvi)

### Correlation heatmaps

In [None]:
# toxic_data holds the rows where toxic = 1
toxic_data = df.loc[df['toxic'] == 1]

In [None]:
# Input for the correlation heatmaps: toxic rows (toxic = 1) in which at
# least one column of the respective group is >= 0.5

# boolean row masks, one per group
t_gend = (toxic_data[gender_cols].to_numpy() >= 0.5).any(axis=1)
t_race = (toxic_data[race_cols].to_numpy() >= 0.5).any(axis=1)
t_rel = (toxic_data[religion_cols].to_numpy() >= 0.5).any(axis=1)

# selected rows, restricted to the group's columns plus 'toxicity'
corr_gender = toxic_data.loc[t_gend, [*gender_cols, 'toxicity']]
corr_race = toxic_data.loc[t_race, [*race_cols, 'toxicity']]
corr_rel = toxic_data.loc[t_rel, [*religion_cols, 'toxicity']]

@Purvi:  
Just a suggestion: Should we hide the "upper triangle" above the main diagonal? I feel that the chart becomes more readable this way. Maybe you want/can look into that if you have time.

In [None]:
# Correlation matrix per group
corr_gender_matrix = corr_gender.corr()
corr_race_matrix = corr_race.corr()
corr_rel_matrix = corr_rel.corr()

# One row of three subplots, one heatmap per group
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

heatmap_panels = zip(
    axes,
    (corr_gender_matrix, corr_race_matrix, corr_rel_matrix),
    ('Gender', 'Race', 'Religion'),
)
for panel_ax, corr_matrix, group_name in heatmap_panels:
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm',
                fmt='.2f', ax=panel_ax)
    panel_ax.set_title(f'Correlation Heatmap - {group_name}')

plt.tight_layout()

In [None]:
# Correlation over all three groups together. 'toxicity' is taken along
# only once, with the gender frame.
corr_gender_all = toxic_data.loc[t_gend, [*gender_cols, 'toxicity']]
corr_race_all = toxic_data.loc[t_race, [*race_cols]]
corr_rel_all = toxic_data.loc[t_rel, [*religion_cols]]

# The three frames are put side by side (column-wise concatenation)
group_frames = [corr_gender_all, corr_race_all, corr_rel_all]
all_data = pd.concat(group_frames, axis=1)

# Overall correlation matrix
corr_all_data = all_data.corr()

# Combined correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_all_data, annot=True, cmap='coolwarm', fmt='.2f',
            annot_kws={"size": 8})
plt.title('Combined Correlation Heatmap - All Data')

plt.tight_layout()
#plt.savefig('corr')

### Mean identity scores for identity subgroups (all values taken into account)

In [None]:
# column means per group
mean_gender = df[gender_cols].mean()
mean_race = df[race_cols].mean()
mean_religion = df[religion_cols].mean()

# print the means, one section per group
mean_sections = (
    ("Mean of Gender Columns:", mean_gender),
    ("\nMean of Race Columns:", mean_race),
    ("\nMean of Religion Columns:", mean_religion),
)
for heading, group_means in mean_sections:
    print(heading)
    print(group_means)

In [None]:
# Mean of each identity column over all comments. The values are computed
# from the data instead of being pasted in from printed output, so the
# chart cannot go stale when the dataset changes.
mean_gender = df[gender_cols].mean().to_dict()
mean_race = df[race_cols].mean().to_dict()
mean_religion = df[religion_cols].mean().to_dict()

# sort every group in ascending order
mean_gender_sorted1 = dict(sorted(mean_gender.items(),
                                  key=lambda item: item[1]))
mean_race_sorted1 = dict(sorted(mean_race.items(),
                                key=lambda item: item[1]))
mean_religion_sorted1 = dict(sorted(mean_religion.items(),
                                    key=lambda item: item[1]))

plt.figure(figsize=(12, 6))

plt.bar(mean_gender_sorted1.keys(), mean_gender_sorted1.values(),
        label='Gender', color='#9CC8ED', hatch='/')
plt.bar(mean_race_sorted1.keys(), mean_race_sorted1.values(),
        label='Race', color='#F4E8AB', hatch='o')
plt.bar(mean_religion_sorted1.keys(), mean_religion_sorted1.values(),
        label='Religion', color='#AAE2C3', hatch='.')

plt.xlabel('Subgroups')
plt.ylabel('Mean Values')
# The bars show means of the identity columns, not of 'toxicity', so the
# title names them accordingly.
plt.title('Mean Identity Scores for Different Subgroups')
plt.xticks(rotation=90)
plt.legend()

# Adding data labels to each bar
for data in [mean_gender_sorted1, mean_race_sorted1, mean_religion_sorted1]:
    for subgroup, value in data.items():
        plt.text(subgroup, value + 0.001, round(value, 3), ha='center',
                 va='bottom', fontsize=8)

plt.tight_layout()

### Mean identity scores for identity subgroups (toxic comments, just values >= 0.5)

@Purvi:  
This plot IMO makes more sense than the previous one. Maybe drop previous?

In [None]:
def _mean_at_least_half(frame):
    """Return, per column, the mean of the values that are >= 0.5."""
    return frame.where(frame >= 0.5).mean()


mean_toxic_gender = _mean_at_least_half(toxic_data[gender_cols])
mean_toxic_race = _mean_at_least_half(toxic_data[race_cols])
mean_toxic_religion = _mean_at_least_half(toxic_data[religion_cols])

# print the means, one section per group
thresholded_sections = (
    ("Mean of Gender Columns (>= 0.5 where toxic is 1):",
     mean_toxic_gender),
    ("\nMean of Race Columns (>= 0.5 where toxic is 1):",
     mean_toxic_race),
    ("\nMean of Religion Columns (>= 0.5 where toxic is 1):",
     mean_toxic_religion),
)
for heading, group_means in thresholded_sections:
    print(heading)
    print(group_means)

In [None]:
# Mean of each identity column over the toxic comments, using only the
# values >= 0.5. The values are computed from the data instead of being
# pasted in from printed output, so the chart cannot go stale. Columns
# without any value >= 0.5 have no mean and are left out.
mean_gender = toxic_data[gender_cols]\
    .where(toxic_data[gender_cols] >= 0.5).mean().dropna().to_dict()
mean_race = toxic_data[race_cols]\
    .where(toxic_data[race_cols] >= 0.5).mean().dropna().to_dict()
mean_religion = toxic_data[religion_cols]\
    .where(toxic_data[religion_cols] >= 0.5).mean().dropna().to_dict()

# sort in ascending order

mean_gender_sorted = dict(sorted(mean_gender.items(),
                                 key=lambda item: item[1]))
mean_race_sorted = dict(sorted(mean_race.items(),
                               key=lambda item: item[1]))
mean_religion_sorted = dict(sorted(mean_religion.items(),
                                   key=lambda item: item[1]))

# plot

plt.figure(figsize=(12, 6))

plt.bar(mean_gender_sorted.keys(), mean_gender_sorted.values(),
        label='Gender', color='#9CC8ED', hatch='/')
plt.bar(mean_race_sorted.keys(), mean_race_sorted.values(),
        label='Race', color='#F4E8AB', hatch='o')
plt.bar(mean_religion_sorted.keys(), mean_religion_sorted.values(),
        label='Religion', color='#AAE2C3', hatch='.')

plt.xlabel('Subgroups')
plt.ylabel('Mean Values')
# The bars show means of the identity columns, not of 'toxicity', so the
# title names them accordingly.
plt.title('Mean Identity Scores (>= 0.5) in Toxic Comments')
plt.xticks(rotation=90)
plt.legend()

# Add data labels to each bar
for data in [mean_gender_sorted, mean_race_sorted, mean_religion_sorted]:
    for subgroup, value in data.items():
        plt.text(subgroup, value + 0.001, round(value, 3), ha='center',
                 va='bottom', fontsize=8)

plt.tight_layout()

#plt.savefig('mean_toxicity.png')

### Word frequency for identity subgroups

In [None]:
# toxic comments per identity group: the rows selected by the group's
# mask, with the group's columns plus the comment text
toxic_gender = toxic_data.loc[t_gend, [*gender_cols, 'comment_text']]
toxic_race = toxic_data.loc[t_race, [*race_cols, 'comment_text']]
toxic_religion = toxic_data.loc[t_rel, [*religion_cols, 'comment_text']]

In [None]:
# Initialize the Porter Stemmer
stemmer = PorterStemmer()


def most_common_words(data, top_n=10):
    """Return the most frequent stemmed words in ``data['comment_text']``.

    All comments are joined, lower-cased and tokenized. Stopwords,
    punctuation, tokens that start with a non-word character and a few
    noise words are dropped; the remaining tokens are stemmed and
    counted.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide 'comment_text', an iterable of strings.
    top_n : int, default 10
        Number of (stem, count) pairs to return.

    Returns
    -------
    list of tuple
        ``(stem, count)`` pairs, most frequent first.
    """
    stop_words = set(stopwords.words('english'))

    # noise tokens to drop (a set, for fast membership tests)
    excluded = {'’', '...', 's', "n't", 'get', 'one', 'would', 'like'}

    # matches tokens whose first character is neither a word character
    # nor whitespace, i.e. punctuation-like tokens
    pattern = re.compile(r'[^\w\s]')

    all_words = ' '.join(data['comment_text']).lower()

    words_count = Counter()
    for word in word_tokenize(all_words):
        if (word in stop_words
                or word in string.punctuation
                or word in excluded
                or pattern.match(word)):
            continue
        stem = stemmer.stem(word)
        # Inflected forms ('likes', 'gets', ...) only collapse onto an
        # excluded word after stemming, so the stem is checked as well;
        # otherwise the excluded words reappear in the result.
        if stem in excluded:
            continue
        words_count[stem] += 1
    return words_count.most_common(top_n)

# Get the most common stemmed words for each subgroup
most_common_gender = most_common_words(toxic_gender)
most_common_race = most_common_words(toxic_race)
most_common_religion = most_common_words(toxic_religion)

# Prepare data for plotting
common_words = {
    'Gender': most_common_gender,
    'Race': most_common_race,
    'Religion': most_common_religion
}

# Plotting
plt.figure(figsize=(12, 6))
colors = {'Gender': '#9CC8ED', 'Race': '#F4E8AB', 'Religion': '#AAE2C3'}

# The loop variable has its own name so that it does not overwrite the
# common_words dict while that dict is being iterated.
for subgroup, top_words in common_words.items():
    if not top_words:
        # nothing to plot for an empty subgroup (zip(*[]) cannot be
        # unpacked into two names)
        continue
    words, counts = zip(*top_words)
    plt.barh([f'{subgroup}: {word}' for word in words], counts,
             label=subgroup, color=colors[subgroup])

plt.xlabel('Frequency')
# Adjacent string literals instead of a backslash inside the literal: the
# backslash form put the next line's indentation into the title.
plt.title('Top 10 Most Common Stemmed Words in Toxic Comments for '
          'Different Subgroups')
plt.legend()

plt.tight_layout()

#plt.savefig('most_common.png')

## Visualisations (André)

### Ratings: approved vs. rejected (bar plot)

In [None]:
# count the number of occurrences of each rating
counts = df['rating'].value_counts()

# value_counts() orders by frequency, so labels and colors are derived
# from the rating value itself instead of from its position; a fixed
# ['Approved', 'Rejected'] list would mislabel the bars if rejected
# comments were the majority.
rating_colors = {'approved': 'green', 'rejected': 'red'}
bar_labels = [str(rating).capitalize() for rating in counts.index]
bar_colors = [rating_colors.get(str(rating).lower(), 'grey')
              for rating in counts.index]

# Plotting the distribution of the ratings
plt.figure(figsize=(8, 5))
plt.bar(bar_labels, counts.values, color=bar_colors)
plt.title('Distribution of Approved vs Rejected Ratings')
plt.ylabel('Number of Comments');

### Analysis of data over time

In [None]:
# Work on a real copy of the original df. A plain assignment would only
# create a second name for the same object, so the new 'date' column and
# the in-place sort below would silently change df as well.
df_time = df.copy()

# .str.split(' ') yields a list per row; the second .str accessor then
# indexes into each of these lists, so .str[0] picks the date part in
# front of the first space.
df_time['date'] = df_time['created_date'].str.split(' ').str[0]
df_time['date'] = pd.to_datetime(df_time['date'])

# sort df by date
df_time.sort_values(by='date', inplace=True)

#### Toxic comments over time

In [None]:
# line plot of the 'toxic' column over the date
plt.figure(figsize=(10, 4))
sns.lineplot(x='date', y='toxic', data=df_time)
plt.title('Toxic comments over time')
plt.xlabel('Date')  # was misspelled as 'Data'
plt.ylabel('Toxic comments');

#### Daily percentage of toxic comments

In [None]:
# distance between two labelled x-axis ticks, in days
tick_step = 20

# get total comments per day
daily_counts = df_time.groupby('date').size()

# Get only toxic comments per day. Days without any toxic comment are
# missing after the filter; they are added back with a count of 0 so
# that their percentage is 0 instead of NaN and both series share one
# index.
daily_toxic_counts = (
    df_time[df_time['toxic'] == 1]
    .groupby('date')
    .size()
    .reindex(daily_counts.index, fill_value=0)
)

# % of toxic comments per day
daily_percentage = (daily_toxic_counts / daily_counts) * 100

# Plot the distribution of toxic comments over time
plt.figure(figsize=(10, 4))
daily_percentage.plot(kind='bar', color='red', alpha=0.7)

# Manually set x-axis labels to display every tick_step-th day; positions
# and labels both come from the plotted series, so they always match
plt.xticks(range(0, len(daily_percentage), tick_step),
           [str(date.date())
            for date in daily_percentage.index[::tick_step]],
           rotation=45)
plt.title('Daily % of Toxic Comments')
plt.xlabel('Date')
plt.ylabel('% Toxic Comments')
plt.tight_layout()

#### Daily total of toxic comments

In [None]:
# bar chart of the number of toxic comments per day
plt.figure(figsize=(10, 4))
daily_toxic_counts.plot(kind='bar', color='red', alpha=0.7)

# Label only every 20th bar on the x-axis
tick_positions = range(0, len(daily_toxic_counts), 20)
tick_labels = [str(day.date()) for day in daily_toxic_counts.index[::20]]
plt.xticks(tick_positions, tick_labels, rotation=45)

plt.title('Daily Total of Toxic Comments')
plt.xlabel('Date')
plt.ylabel('Number of Toxic Comments')
plt.tight_layout()

### Analysis of reactions (as contained in Civil Comments metadata)
Analyze how users react ('likes' or 'disagree') to toxic vs non-toxic comments. Reactions: 'funny', 'wow', 'sad', 'likes', 'disagree'.

In [None]:
# New df containing just the reaction columns and the target. The columns
# are selected with a list, and .copy() makes df_react independent of df,
# so adding columns to it later neither touches df nor triggers a
# SettingWithCopyWarning.
df_react = df[['funny', 'wow', 'sad', 'likes', 'disagree', 'toxic']].copy()

In [None]:
# Flag the comments that received at least one reaction and count them
reaction_cols = ['funny', 'wow', 'sad', 'likes', 'disagree']
df_react['any_reaction'] = df_react[reaction_cols].gt(0).any(axis=1)
total_comments_with_reaction = df_react['any_reaction'].sum()

# Share of comments with any reaction, in percent
percentage_comments_with_reaction =\
    (total_comments_with_reaction / len(df_react)) * 100

# Print the percentage
share_text = f"{percentage_comments_with_reaction:.2f}%"
print("Percentage of comments with any reaction: " + share_text)

# Plot the percentage next to its complement
bar_names = ['Comments with Reaction', 'Comments without Reaction']
bar_heights = [percentage_comments_with_reaction,
               100 - percentage_comments_with_reaction]
plt.figure(figsize=(8, 6))
sns.barplot(x=bar_names, y=bar_heights,
            palette=['lightblue', 'lightgrey'])
plt.title('Percentage of Comments with Any Reaction')
plt.ylabel('Percentage');

In [None]:
# Flag the comments that received at least one reaction
df_react['any_reaction'] = (df_react[['funny', 'wow', 'sad',
                                      'likes', 'disagree']] > 0).any(axis=1)

# Create a contingency table to count occurrences
# (rows: toxic 0 / 1 / Total, columns: any_reaction False / True / Total)
contingency_table = pd.crosstab(df_react['toxic'], df_react['any_reaction'],
                                margins=True, margins_name="Total")

# Calculate percentages per row of the table
percentage_comments_with_reaction_and_toxic = \
    (contingency_table[True] / contingency_table['Total']) * 100
percentage_comments_without_reaction_and_toxic = \
    (contingency_table[False] / contingency_table['Total']) * 100

# Print the share of comments WITH a reaction for both classes. Both
# numbers come from the "with reaction" series: row 1 is the toxic class,
# row 0 the non-toxic class. (The second line used to print the share of
# toxic comments without a reaction.)
print("Percentage of comments with reactions by toxicity:")
print(f"With toxicity: {percentage_comments_with_reaction_and_toxic.loc[1]:.2f}%")
print(f"Without toxicity: {percentage_comments_with_reaction_and_toxic.loc[0]:.2f}%")

# Plot the percentages as stacked bars
plt.figure(figsize=(10, 6))
sns.barplot(x=contingency_table.index,
            y=percentage_comments_with_reaction_and_toxic, color='lightgray',
            label='With Reaction')
sns.barplot(x=contingency_table.index,
            y=percentage_comments_without_reaction_and_toxic, color='orange',
            label='Without Reaction',
            bottom=percentage_comments_with_reaction_and_toxic)

plt.title('Percentage of Comments with and without Reactions by Toxicity')
plt.xlabel('Toxicity')
plt.ylabel('Percentage')
plt.legend();

In [None]:
reactions = ['funny', 'wow', 'sad', 'likes', 'disagree']

# keep only the comments whose reaction counts add up to more than zero
reaction_totals = df_react[reactions].sum(axis=1)
df_react_with_reaction = df_react[reaction_totals > 0]

# one bar chart per reaction type, split by the 'toxic' column
for reaction in reactions:
    reaction_name = reaction.capitalize()
    plt.figure(figsize=(8, 4))
    sns.barplot(data=df_react_with_reaction, x='toxic',
                y=reaction, errorbar=None)
    plt.title(f'Grouped Bar Chart of {reaction_name} Reactions '
              'for Comments with Reactions')
    plt.xlabel('Toxicity')
    plt.ylabel(f'{reaction_name} Reactions')