# EDA Subgroups

**Analyzing Subgroups**

- Use heatmaps to explore correlations between different identity mentions and toxicity scores.
- Compute the mean toxicity score for each subgroup and use bar charts to compare them.

**Comparative Analysis**

- **Compare Subgroups**: Use side-by-side comparisons to see how different subgroups stack up against each other in terms of toxicity.

## <span style="color: yellow;">Imports</span>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import nltk
from nltk.corpus import stopwords
import string 


## <span style="color: yellow;">Using merged_data.csv</span>

In [None]:
# Load the merged dataset that all of the analysis below works on
merged_csv_path = 'data/merged_data.csv'
eda = pd.read_csv(merged_csv_path)

## <span style="color: Orange;"><b>Heatmaps</b></span>

### <span style="color: Orange;"><b>Define Subgroups</b></span>

In [None]:
# Identity columns grouped into subgroup families.
# Note: the gender family also contains the sexual-orientation columns.
gender_cols = [
    'male',
    'female',
    'bisexual',
    'transgender',
    'heterosexual',
    'other_gender',
    'homosexual_gay_or_lesbian',
    'other_sexual_orientation',
]
race_cols = [
    'asian',
    'black',
    'white',
    'latino',
    'other_race_or_ethnicity',
]
religion_cols = [
    'hindu',
    'buddhist',
    'christian',
    'muslim',
    'jewish',
    'atheist',
    'other_religion',
]


### <span style="color: Orange;"><b>Correlation</b></span>

In [None]:
# Toxic subset: only the rows whose 'toxic' column equals 1
is_toxic = eda['toxic'] == 1
toxic_data = eda[is_toxic]

In [None]:
# Input for the correlation heatmaps. For every subgroup family, keep the toxic
# comments in which at least one identity column of that family is >= 0.5, and
# select those identity columns together with the 'toxicity' score.

gender_scores = toxic_data[gender_cols].values
t_gend = (gender_scores >= 0.5).any(axis=1)
corr_gender = toxic_data.loc[t_gend, [*gender_cols, 'toxicity']]

race_scores = toxic_data[race_cols].values
t_race = (race_scores >= 0.5).any(axis=1)
corr_race = toxic_data.loc[t_race, [*race_cols, 'toxicity']]

religion_scores = toxic_data[religion_cols].values
t_rel = (religion_scores >= 0.5).any(axis=1)
corr_rel = toxic_data.loc[t_rel, [*religion_cols, 'toxicity']]

In [None]:

# Pairwise correlations within each subgroup family (identity columns + 'toxicity')
corr_gender_matrix = corr_gender.corr()
corr_race_matrix = corr_race.corr()
corr_rel_matrix = corr_rel.corr()

# One row with three panels, one heatmap per family
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

heatmap_panels = [
    (corr_gender_matrix, 'Correlation Heatmap - Gender'),
    (corr_race_matrix, 'Correlation Heatmap - Race'),
    (corr_rel_matrix, 'Correlation Heatmap - Religion'),
]
for ax, (matrix, panel_title) in zip(axes, heatmap_panels):
    sns.heatmap(matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()

plt.show()

In [None]:
# Correlation for all identity columns together

# Per-family selections, kept so that any cell still referring to them works
corr_gender_all = toxic_data.loc[t_gend, gender_cols + ['toxicity']]
corr_race_all = toxic_data.loc[t_race, race_cols]
corr_rel_all = toxic_data.loc[t_rel, religion_cols]

# Build the combined frame from ONE set of rows: every toxic comment that
# mentions at least one identity (>= 0.5 in any family), with all columns read
# from those same rows.
# Concatenating the three per-family frames side by side aligns them on the
# index and leaves NaN wherever a comment belongs to one family but not to
# another. Each heatmap cell would then be computed on a different subset of
# rows, and 'toxicity' would only be available for the gender rows.
t_any = t_gend | t_race | t_rel
all_data = toxic_data.loc[t_any, gender_cols + ['toxicity'] + race_cols + religion_cols]

# Calculate overall correlation matrix
corr_all_data = all_data.corr()

# Plot combined correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_all_data, annot=True, cmap='coolwarm', fmt='.2f', annot_kws={"size": 8})
plt.title('Combined Correlation Heatmap - All Data')

plt.tight_layout()
# Explicit extension, consistent with the other saved figures
plt.savefig('corr.png')
plt.show()


## <span style="color: Orange;"><b>Mean</b></span>

In [None]:
# Mean of every identity column over the full dataset, one Series per family

mean_gender = eda[gender_cols].mean()
mean_race = eda[race_cols].mean()
mean_religion = eda[religion_cols].mean()

# Keep the three results together in one list
means_list = [mean_gender, mean_race, mean_religion]

# Print each family under its own header; the headers after the first one
# start with a newline so the sections are separated by a blank line
section_headers = [
    "Mean of Gender Columns:",
    "\nMean of Race Columns:",
    "\nMean of Religion Columns:",
]
for section_header, family_means in zip(section_headers, means_list):
    print(section_header)
    print(family_means)


#### Mean identity score of every subgroup column over all rows

In [None]:

# Mean identity score of every subgroup column over the whole dataset.
# The values are computed from `eda` rather than pasted in by hand, so the
# chart always matches the data that was loaded.
mean_gender = eda[gender_cols].mean().to_dict()
mean_race = eda[race_cols].mean().to_dict()
mean_religion = eda[religion_cols].mean().to_dict()

# Order each family from the smallest to the largest mean
mean_gender_sorted1 = dict(sorted(mean_gender.items(), key=lambda item: item[1]))
mean_race_sorted1 = dict(sorted(mean_race.items(), key=lambda item: item[1]))
mean_religion_sorted1 = dict(sorted(mean_religion.items(), key=lambda item: item[1]))

plt.figure(figsize=(12, 6))

# One bar series per family, drawn on the same categorical x-axis
plt.bar(mean_gender_sorted1.keys(), mean_gender_sorted1.values(), label='Gender', color='#9CC8ED',hatch='/')
plt.bar(mean_race_sorted1.keys(), mean_race_sorted1.values(), label='Race', color='#F4E8AB',hatch='o')
plt.bar(mean_religion_sorted1.keys(), mean_religion_sorted1.values(), label='Religion', color='#AAE2C3',hatch='.')

plt.xlabel('Subgroups')
plt.ylabel('Mean Values')
# The bars are means of the identity columns, not of the 'toxicity' column,
# so the title names what is actually plotted.
# NOTE(review): if the mean toxicity per subgroup was intended, the averaged
# column has to be 'toxicity' -- confirm with the author.
plt.title('Mean Identity Scores for Different Subgroups')
plt.xticks(rotation=90)
plt.legend()


# Adding data labels to each bar (slightly above the bar top)
for data in [mean_gender_sorted1, mean_race_sorted1, mean_religion_sorted1]:
    for subgroup, value in data.items():
        plt.text(subgroup, value + 0.001, round(value, 3), ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()


#### Mean identity score of every subgroup column in toxic comments, using only values >= 0.5

In [None]:
# Per identity column: mean of the scores that are >= 0.5, computed on the
# toxic comments only

def _mean_of_scores_at_least_half(scores):
    """Average the entries of ``scores`` that are >= 0.5."""
    return scores[scores >= 0.5].mean()


mean_toxic_gender = toxic_data[gender_cols].apply(_mean_of_scores_at_least_half)
mean_toxic_race = toxic_data[race_cols].apply(_mean_of_scores_at_least_half)
mean_toxic_religion = toxic_data[religion_cols].apply(_mean_of_scores_at_least_half)

In [None]:
# Print the thresholded means family by family; the headers after the first
# one start with a newline so the sections are separated by a blank line
toxic_mean_sections = [
    ("Mean of Gender Columns (>= 0.5 where toxic is 1):", mean_toxic_gender),
    ("\nMean of Race Columns (>= 0.5 where toxic is 1):", mean_toxic_race),
    ("\nMean of Religion Columns (>= 0.5 where toxic is 1):", mean_toxic_religion),
]
for section_header, family_means in toxic_mean_sections:
    print(section_header)
    print(family_means)

In [None]:
# Use the thresholded means computed from the data (mean_toxic_gender,
# mean_toxic_race, mean_toxic_religion) instead of hand-copied numbers, so the
# chart below cannot go stale when the dataset changes.
# A column without any score >= 0.5 has a NaN mean; it is dropped because NaN
# can neither be sorted meaningfully nor drawn as a bar.
mean_gender = mean_toxic_gender.dropna().to_dict()
mean_race = mean_toxic_race.dropna().to_dict()
mean_religion = mean_toxic_religion.dropna().to_dict()

In [None]:
# Re-order every family from the smallest to the largest mean

def _ascending_by_value(means):
    """Return a new dict with the entries of ``means`` ordered by increasing value."""
    return {name: means[name] for name in sorted(means, key=means.get)}


mean_gender_sorted = _ascending_by_value(mean_gender)
mean_race_sorted = _ascending_by_value(mean_race)
mean_religion_sorted = _ascending_by_value(mean_religion)

In [None]:
# Bar chart of the thresholded means (identity scores >= 0.5 in toxic comments)

plt.figure(figsize=(12, 6))

# One bar series per family, drawn on the same categorical x-axis
plt.bar(mean_gender_sorted.keys(), mean_gender_sorted.values(), label='Gender', color='#9CC8ED',hatch='/')
plt.bar(mean_race_sorted.keys(), mean_race_sorted.values(), label='Race', color='#F4E8AB',hatch='o')
plt.bar(mean_religion_sorted.keys(), mean_religion_sorted.values(), label='Religion', color='#AAE2C3',hatch='.')


plt.xlabel('Subgroups')
plt.ylabel('Mean Values')
# The bars are means of the identity columns restricted to toxic comments, not
# means of the 'toxicity' column. The title names what is plotted and differs
# from the title of the all-rows chart.
# NOTE(review): if the mean toxicity per subgroup was intended, the averaged
# column has to be 'toxicity' -- confirm with the author.
plt.title('Mean Identity Scores in Toxic Comments for Different Subgroups')
plt.xticks(rotation=90)
plt.legend()

# Adding data labels to each bar, read from the same sorted dicts that were plotted
for data in [mean_gender_sorted, mean_race_sorted, mean_religion_sorted]:
    for subgroup, value in data.items():
        plt.text(subgroup, value + 0.001, round(value, 3), ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.savefig('mean_toxicity.png')

plt.show()

## <span style="color: Orange;"><b>Most common words </b></span>

In [None]:
# Toxic comments of each subgroup family (rows selected by the t_gend / t_race /
# t_rel masks), keeping the family's identity columns plus the comment text
toxic_gender = toxic_data.loc[t_gend, [*gender_cols, 'comment_text']]
toxic_race = toxic_data.loc[t_race, [*race_cols, 'comment_text']]
toxic_religion = toxic_data.loc[t_rel, [*religion_cols, 'comment_text']]

In [None]:
import string 


# Initialize the Porter Stemmer once; most_common_words() reuses it on every call
stemmer = PorterStemmer()


def most_common_words(data, top_n=10):
    """Return the ``top_n`` most frequent stemmed words of ``data['comment_text']``.

    The comments are joined into one lower-cased text and tokenized with
    ``word_tokenize``. English stop words, punctuation tokens and a short list
    of filler words are dropped; the remaining tokens are stemmed with the
    module-level ``stemmer`` and counted.

    Args:
        data: Object whose ``'comment_text'`` entry is an iterable of comment
            strings (e.g. a DataFrame). Entries that are not strings, such as
            NaN for a missing comment, are skipped.
        top_n: Number of (stem, count) pairs to return. Defaults to 10.

    Returns:
        A list of ``(stem, count)`` tuples, most frequent first.
    """
    stop_words = set(stopwords.words('english'))

    # Tokenizer artefacts and filler words that should never be reported
    excluded = {'’', '...', 's', "n't", 'get', 'one', 'would', 'like'}

    # Matches tokens that begin with a non-word, non-space character
    starts_with_punct = re.compile(r'[^\w\s]')

    # Skip non-string entries so a missing comment cannot break the join
    comments = [text for text in data['comment_text'] if isinstance(text, str)]
    tokens = word_tokenize(' '.join(comments).lower())

    words_count = Counter()
    for token in tokens:
        if (
            token in stop_words
            or token in string.punctuation
            or token in excluded
            or starts_with_punct.match(token)
        ):
            continue
        stem = stemmer.stem(token)
        # Check the exclusions again after stemming: inflected forms such as
        # "likes" or "gets" pass the raw-token filter but reduce to an
        # excluded word.
        if stem in excluded:
            continue
        words_count[stem] += 1
    return words_count.most_common(top_n)

# Get the most common stemmed words for each subgroup
most_common_gender = most_common_words(toxic_gender)
most_common_race = most_common_words(toxic_race)
most_common_religion = most_common_words(toxic_religion)

# Prepare data for plotting: subgroup name -> list of (word, count) pairs
common_words = {
    'Gender': most_common_gender,
    'Race': most_common_race,
    'Religion': most_common_religion
}

# Plotting
plt.figure(figsize=(12, 6))
colors = {'Gender': '#9CC8ED', 'Race': '#F4E8AB', 'Religion': '#AAE2C3'}

# The loop variable has its own name: reusing `common_words` here would
# overwrite the dict above with the last subgroup's list.
for subgroup, top_words in common_words.items():
    if not top_words:
        # Nothing to unpack or draw for a subgroup without any counted word
        continue
    words, counts = zip(*top_words)
    # Prefix each word with its subgroup so equal words get separate bars
    plt.barh([f'{subgroup}: {word}' for word in words], counts, label=subgroup,color=colors[subgroup])

plt.xlabel('Frequency')
plt.title('Top 10 Most Common Stemmed Words in Toxic Comments for Different Subgroups')
plt.legend()
plt.tight_layout()
plt.savefig('most_common.png')
plt.show()


Note: words such as People, Trump, White and Right appear in all three subgroups.