# EDA and Visualization

## Import, install and read

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose

import warnings
warnings.filterwarnings("ignore")

In [None]:
# The loads of the individual source files below are wrapped in a bare string
# literal, so none of them execute; only the merged dataset is actually read.
'''df = pd.read_csv('data/all_data.csv')
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
# df_sample = pd.read_csv('data/sample_submission.csv')
df_test_private = pd.read_csv('data/test_private_expanded.csv')
df_test_public = pd.read_csv('data/test_public_expanded.csv')
df_tox_annot = pd.read_csv('data/toxicity_individual_annotations.csv')
df_ident_annot = pd.read_csv('data/identity_individual_annotations.csv')'''
# Load the merged dataset that every later cell works on.
df_merged = pd.read_csv('data/merged_data.csv')

## EDA

In [None]:
# Preview the first three rows of the merged dataset.
df_merged.head(3)

In [None]:
# Count comments per rating value. value_counts() orders its result by
# frequency, so it is re-sorted by label here: the colors and tick labels
# below are assigned by position and must not depend on class balance.
# NOTE(review): assumes exactly two rating values whose sorted order is
# (approved, rejected) -- confirm against the data.
counts = df_merged['rating'].value_counts().sort_index()

# Bar chart of the number of comments in each rating class.
plt.figure(figsize=(8, 5))
plt.bar(counts.index, counts.values, color=['green', 'red'])
plt.xticks(counts.index, ['Approved', 'Rejected'])
plt.title('Distribution of Approved vs Rejected Ratings')
plt.ylabel('Number of Comments')
plt.show()

In [None]:
# Print the summary that DataFrame.info() produces for the merged dataset.
df_merged.info()

## Missing values

In [None]:
# Overview of missing values, in three views of the same DataFrame.
# missingno bar chart (presumably non-missing counts per column -- confirm).
msno.bar(df_merged)
# missingno matrix (presumably row-by-row missingness pattern -- confirm).
msno.matrix(df_merged)
# Number of missing (NaN) entries per column; shown as the cell output.
df_merged.isna().sum()

- 1 = toxic
- 0 = non-toxic

## **Time-Based Analysis**

- **Trends Over Time**: Plot the trend of toxic comments over time to help identify any temporal patterns.

In [None]:
# Keep only the part of 'created_date' before the first space (the date
# portion), then parse it and store the result as the 'date' column.
date_text = df_merged['created_date'].str.split(' ').str[0]
df_merged['date'] = pd.to_datetime(date_text)

In [None]:
# Sort the rows by the parsed 'date' column, modifying df_merged in place.
df_merged.sort_values(by='date', inplace=True)

In [None]:
# Line plot of the 'toxic' column against the parsed 'date' column.
# NOTE(review): where a date has several rows, seaborn's lineplot aggregates
# them (mean with an error band by default) -- confirm that is the intent.
plt.figure(figsize=(12, 6))
sns.lineplot(x='date', y='toxic', data=df_merged)

# Set the title and axis labels.
# NOTE(review): the x label 'Data' is presumably Portuguese for 'Date'; it is
# left unchanged here.
plt.title('Toxic comments over time')
plt.xlabel('Data')
plt.ylabel('Toxic comments')
# Call show(); the bare name `plt.show` never rendered the figure explicitly
# and only echoed the function object as the cell output.
plt.show()

In [None]:
# Number of rows with toxic == 1 for each value of the 'date' column.
daily_toxic_counts = df_merged.loc[df_merged['toxic'] == 1].groupby('date').size()

# Bar chart with one bar per date present in daily_toxic_counts.
plt.figure(figsize=(10, 6))
daily_toxic_counts.plot.bar(color='red', alpha=0.7)

# Label only every 20th bar so the date labels stay readable.
tick_step = 20
tick_positions = range(0, len(daily_toxic_counts), tick_step)
tick_labels = [str(day.date()) for day in daily_toxic_counts.index[::tick_step]]
plt.xticks(tick_positions, tick_labels, rotation=45)

plt.title('Distribution of Toxic Comments Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Toxic Comments')
plt.tight_layout()
plt.show()

In [None]:
# Daily toxic-comment counts drawn with matplotlib directly, overlaid with a
# straight-line (degree-1) least-squares trendline.

# Integer bar positions 0..n-1, one per entry of daily_toxic_counts; they also
# serve as the x values of the linear fit.
x = np.arange(len(daily_toxic_counts))
coefficients = np.polyfit(x, daily_toxic_counts, 1)
trendline = np.polyval(coefficients, x)

plt.figure(figsize=(10, 6))
plt.bar(x, daily_toxic_counts, color='red', alpha=0.7)

# Label only every 20th bar so the date labels stay readable.
tick_step = 20
tick_positions = range(0, len(daily_toxic_counts), tick_step)
tick_labels = [str(day.date()) for day in daily_toxic_counts.index[::tick_step]]
plt.xticks(tick_positions, tick_labels, rotation=45)

# Draw the fitted line on top of the bars.
plt.plot(x, trendline, color='blue', label='Trendline')

plt.title('Distribution of Toxic Comments Over Time with Trendline')
plt.xlabel('Date')
plt.ylabel('Number of Toxic Comments')
plt.legend()
plt.tight_layout()
plt.show()

## **Reaction Metrics**

- **Reactions to Toxic Comments**: Analyze how users react (likes or disagree) to toxic vs non-toxic comments. Scatter plots or grouped bar charts could be useful here.

- Reaction types: 'funny', 'wow', 'sad', 'likes', 'disagree'

In [None]:
# Names of the reaction types to analyze.
react = ['funny', 'wow', 'sad', 'likes', 'disagree']
# NOTE(review): disabled draft -- `react` is a list of names, and nothing in
# this file shows a 'react' column on df_merged, so the groupby below
# presumably needs reworking before it is enabled.
# toxic_react = df_merged[df_merged['toxic'] == 1].groupby('react').size()

## **Annotator Insights**

- **Annotator Count Analysis**: Examine `identity_annotator_count` to see if there's any pattern in the number of annotators for different levels of toxicity or subgroups.