# EDA and Visualization

## Import, install and read

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose

import warnings
warnings.filterwarnings("ignore")

In [None]:
# NOTE: the triple-quoted block below is a bare string literal, not executed code.
# It keeps the read_csv calls for the other CSV files around without running them.
'''df = pd.read_csv('data/all_data.csv')
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
# df_sample = pd.read_csv('data/sample_submission.csv')
df_test_private = pd.read_csv('data/test_private_expanded.csv')
df_test_public = pd.read_csv('data/test_public_expanded.csv')'''
# Individual annotation files; both are joined on 'id' in the Annotator Insights section.
df_tox_annot = pd.read_csv('data/toxicity_individual_annotations.csv')
df_ident_annot = pd.read_csv('data/identity_individual_annotations.csv')
# Main frame used by every analysis cell in this notebook.
df_merged = pd.read_csv('data/merged_data.csv')

## EDA

In [None]:
# Preview the first three rows of the merged dataset
df_merged.head(3)

In [None]:
# Tally how many comments carry each value of the 'rating' column
# (value_counts returns the values ordered from most to least frequent).
counts = df_merged['rating'].value_counts()

# Bar chart of the rating distribution.
# NOTE(review): colours and tick labels are assigned by position, so this
# assumes the most frequent rating is "approved" - confirm against the data.
plt.figure(figsize=(8, 5))
plt.bar(x=counts.index, height=counts.values, color=['green', 'red'])
plt.xticks(ticks=counts.index, labels=['Approved', 'Rejected'])
plt.ylabel('Number of Comments')
plt.title('Distribution of Approved vs Rejected Ratings')
plt.show()

## Missing values

In [None]:
# overview of missing values
# missingno bar chart for the merged frame
msno.bar(df_merged)
# missingno matrix view of the same frame
msno.matrix(df_merged)
# Number of NaN entries per column; as the cell's last expression it is displayed by the notebook
df_merged.isna().sum()

- 1 = toxic
- 0 = non-toxic

## **Time-Based Analysis**

- **Trends Over Time**: Track how the share of toxic comments changes over time. This can help identify any temporal patterns.

In [None]:
# Take the text before the first space of 'created_date' and parse it into a
# datetime column named 'date'.
df_merged['date'] = pd.to_datetime(df_merged['created_date'].str.split(' ').str[0])

In [None]:
# Order the rows chronologically by 'date'; inplace=True reorders df_merged itself
df_merged.sort_values(by='date', inplace=True)

In [None]:
# Line plot of the 'toxic' label against 'date'. With several rows per date,
# seaborn's default is to plot the mean per date with a confidence band.
plt.figure(figsize=(10, 4))
sns.lineplot(x='date', y='toxic', data=df_merged)

# Set the title and axis labels
# NOTE(review): the x label reads 'Data' while the other date plots use 'Date' - confirm intent.
plt.title('Toxic comments over time')
plt.xlabel('Data')
plt.ylabel('Toxic comments')
# Fix: plt.show must be called; the bare name only referenced the function
# (the notebook echoed its repr) and never rendered the figure explicitly.
plt.show()

In [None]:
# Total number of comments per date
daily_counts = df_merged.groupby('date').size()
# Number of toxic comments (toxic == 1) per date; dates without any toxic
# comment do not appear in this series at all
daily_toxic_counts = df_merged[df_merged['toxic'] == 1].groupby('date').size()
# Percentage of toxic comments per date. Align the toxic counts to the full
# date index with 0 first, otherwise dates without toxic comments become NaN
# after the division instead of 0 %.
daily_percentage = daily_toxic_counts.reindex(daily_counts.index, fill_value=0) / daily_counts * 100

# Plot the percentage of toxic comments per date
plt.figure(figsize=(10, 4))
daily_percentage.plot(kind='bar', color='red', alpha=0.7)

# A bar plot draws one bar per index entry at positions 0..n-1, so tick
# positions and labels must come from the plotted series (daily_percentage),
# not from daily_toxic_counts, which can be shorter. Label every 20th bar.
tick_step = 20
plt.xticks(
    range(0, len(daily_percentage), tick_step),
    [str(date.date()) for date in daily_percentage.index[::tick_step]],
    rotation=45,
)

plt.title('Distribution of % Toxic Comments Over Time')
plt.xlabel('Date')
plt.ylabel('% of Toxic Comments')
plt.tight_layout()
plt.show()

In [None]:
# Number of comments labelled toxic (toxic == 1) on each date
is_toxic = df_merged['toxic'] == 1
daily_toxic_counts = df_merged[is_toxic].groupby('date').size()

# Bar chart with one bar per date that has at least one toxic comment
plt.figure(figsize=(10, 4))
daily_toxic_counts.plot(kind='bar', color='red', alpha=0.7)

# Label only every 20th bar so the date axis stays readable
tick_positions = range(0, len(daily_toxic_counts), 20)
tick_labels = [str(date.date()) for date in daily_toxic_counts.index[::20]]
plt.xticks(tick_positions, tick_labels, rotation=45)

plt.title('Distribution of Toxic Comments Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Toxic Comments')
plt.tight_layout()
plt.show()

## **Reaction Metrics**

- **Reactions to Toxic Comments**: Analyze how users react (likes or disagree) to toxic vs non-toxic comments. Reactions: 'funny', 'wow', 'sad', 'likes', 'disagree'

In [None]:
# Reaction counters plus the toxicity label. A list (not a tuple) is the
# unambiguous way to select several columns, and the explicit copy makes
# df_react independent of df_merged, so the columns added to it in later
# cells are plain assignments on its own data.
reaction_columns = ['funny', 'wow', 'sad', 'likes', 'disagree']
df_react = df_merged[reaction_columns + ['toxic']].copy()

In [None]:
# Flag each comment that has at least one reaction counter above zero,
# then count the flagged comments
reaction_counts = df_react[['funny', 'wow', 'sad', 'likes', 'disagree']]
df_react['any_reaction'] = (reaction_counts > 0).any(axis=1)
total_comments_with_reaction = df_react['any_reaction'].sum()

# Share of all comments that received a reaction, in percent
percentage_comments_with_reaction = (total_comments_with_reaction / len(df_react)) * 100

print(f"Percentage of comments with any reaction: {percentage_comments_with_reaction:.2f}%")

# Two bars: the share with a reaction and its complement
plt.figure(figsize=(8, 6))
sns.barplot(
    x=['Comments with Reaction', 'Comments without Reaction'],
    y=[percentage_comments_with_reaction, 100 - percentage_comments_with_reaction],
    palette=['lightblue', 'lightgrey'],
)
plt.ylabel('Percentage')
plt.title('Percentage of Comments with Any Reaction')
plt.show()


In [None]:
# Flag each comment that has at least one reaction counter above zero
# (recomputed here so the cell does not depend on the previous one)
df_react['any_reaction'] = (df_react[['funny', 'wow', 'sad', 'likes', 'disagree']] > 0).any(axis=1)

# Contingency table: rows = toxic label, columns = any_reaction (False/True),
# plus a "Total" margin row and column
contingency_table = pd.crosstab(df_react['toxic'], df_react['any_reaction'], margins=True, margins_name="Total")

# Row-wise percentages: share of each toxicity group with / without a reaction
percentage_comments_with_reaction_and_toxic = (contingency_table[True] / contingency_table['Total']) * 100
percentage_comments_without_reaction_and_toxic = (contingency_table[False] / contingency_table['Total']) * 100

# Print the share of comments WITH a reaction for each toxicity group.
# Fix: the "Without toxicity" line previously printed the share of toxic
# comments without a reaction; it now reads the non-toxic row (label 0) of
# the with-reaction series. Rows are addressed by their toxic label (1 / 0)
# instead of the boolean True.
print("Percentage of comments with reactions by toxicity:")
print(f"With toxicity: {percentage_comments_with_reaction_and_toxic.loc[1]:.2f}%")
print(f"Without toxicity: {percentage_comments_with_reaction_and_toxic.loc[0]:.2f}%")

# Stacked bars per row of the table (including the "Total" row): the
# with-reaction share first, the without-reaction share on top of it.
# NOTE(review): stacking relies on seaborn forwarding `bottom` to matplotlib's
# bar call - confirm it renders as intended with the installed seaborn version.
plt.figure(figsize=(10, 6))
sns.barplot(x=contingency_table.index, y=percentage_comments_with_reaction_and_toxic, color='lightgray', label='With Reaction')
sns.barplot(x=contingency_table.index, y=percentage_comments_without_reaction_and_toxic, color='orange', label='Without Reaction', bottom=percentage_comments_with_reaction_and_toxic)

plt.title('Percentage of Comments with and without Reactions by Toxicity')
plt.xlabel('Toxicity')
plt.ylabel('Percentage')
plt.legend()
plt.show()

In [None]:
reactions = ['funny', 'wow', 'sad', 'likes', 'disagree']

# Keep only the comments whose reaction counters add up to more than zero
has_reaction = df_react[reactions].sum(axis=1) > 0
df_react_with_reaction = df_react[has_reaction]

# One small figure per reaction type, comparing the toxic groups;
# ci=None disables the error bars
for reaction in reactions:
    plt.figure(figsize=(6, 3))
    sns.barplot(data=df_react_with_reaction, x='toxic', y=reaction, ci=None)
    plt.xlabel('Toxicity')
    plt.ylabel(f'{reaction.capitalize()} Reactions')
    plt.title(f'Grouped Bar Chart of {reaction.capitalize()} Reactions for Comments with Reactions')
    plt.show()

-------------


<font color="red">

### I think the Annotator Insights section is not useful for our main file because we decided not to go deep into it, but we could keep it here for the record.

</font>


## **Annotator Insights**

- **Annotator Count Analysis**: Examine `identity_annotator_count` to see if there’s any pattern in the number of annotators for different levels of toxicity or subgroups.

In [None]:
# annotator = ['identity_annotator_count', 'toxicity_annotator_count']

In [None]:
# pd.options.display.max_rows = 999

# Annotator-count values, in the same order as the first column of the
# table pasted in the string below.
annotators = [4,10,6,5,7,9,11,8,1545,12,1814]

# df_merged['identity_annotator_count'].value_counts().head(6)

# Pasted table kept for reference (value, then frequency); presumably the output
# of a value_counts() call like the one commented out above - TODO confirm.
# As the cell's last expression, this string is echoed by the notebook.
'''4       266136
10       90051
6        49506
5        19748
7          468
9          279
11         223
8           54
1545         7
12           5
1814         5'''

In [None]:
# Preview the first ten rows of the merged dataset
df_merged.head(10)
# identity_annotator_count - number of human annotators who checked the comment for identity
# toxicity_annotator_count - number of human annotators who checked the comment for toxicity

In [None]:
# Inner-join the toxicity and identity annotation tables on 'id'; only ids
# present in both tables are kept.
# NOTE(review): if 'id' repeats in both tables (e.g. one row per annotator),
# this is a many-to-many join and the row count multiplies - confirm.
df_annotator_merged = pd.merge(df_tox_annot, df_ident_annot, on='id', how='inner')

In [None]:
# (rows, columns) of the joined annotation table
df_annotator_merged.shape

In [33]:
# Reference only: the dataset's column names, grouped by role.

# Identifier, raw text, data split and the main toxicity label
main_columns = ['id', 'comment_text', 'split', 'toxicity']

# Toxicity sub-type columns
subtype_columns = [
    'severe_toxicity',
    'obscene',
    'insult',
    'threat',
    'identity_attack',
    'sexual_explicit',
]

# Identity columns
identity_columns = [
    # gender
    'male', 'female', 'transgender', 'other_gender',
    # sexual orientation
    'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation',
    # religion
    'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist', 'other_religion',
    # race / ethnicity
    'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity',
    # disability
    'physical_disability', 'intellectual_or_learning_disability',
    'psychiatric_or_mental_illness', 'other_disability',
]

# Comment metadata, reaction counters and annotator counts
metadata_columns = [
    'created_date', 'publication_id', 'parent_id', 'article_id', 'rating',
    'funny', 'wow', 'sad', 'likes', 'disagree',
    'identity_annotator_count', 'toxicity_annotator_count',
]