REDDIT SENTIMENT ANALYSIS - DS I PROJECT

Research Question: How do men and women differ in their evaluations of workplace culture?
Hypothesis: Women prioritize culture; Men prioritize pay/promotion



In [2]:
pip install gender-guesser

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import gender_guesser.detector as gender
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("REDDIT WORKPLACE SENTIMENT ANALYSIS")
print("DS I Project - Sentiment Analysis Component")
print("="*80)


REDDIT WORKPLACE SENTIMENT ANALYSIS
DS I Project - Sentiment Analysis Component


STEP 1: Load the data

In [6]:
print("\n STEP 1: Loading Reddit Data...")
print("-" * 80)

import pandas as pd

df = pd.read_csv('reddit_data.csv')

print(f" Loaded {len(df):,} comments")
print(f" From subreddits: {df['subreddit'].unique().tolist()}")
print(f" Date range: {df['comment_created_utc'].min()} to {df['comment_created_utc'].max()}")

# Using the full dataset
SAMPLE_SIZE = None   

if SAMPLE_SIZE and SAMPLE_SIZE < len(df):
    print(f"\n Using sample of {SAMPLE_SIZE:,} comments")
    df = df.sample(n=SAMPLE_SIZE, random_state=42)
else:
    print(f"\n Using full dataset of {len(df):,} comments")

print(f" Final dataset: {len(df):,} comments")



 STEP 1: Loading Reddit Data...
--------------------------------------------------------------------------------
 Loaded 25,148 comments
 From subreddits: ['AskWomen', 'AskMen']
 Date range: 2018-03-16T22:49:40 to 2025-11-25T05:11:11

 Using full dataset of 25,148 comments
 Final dataset: 25,148 comments


STEP 2: Data Cleaning 

In [8]:

print("\n STEP 2: Data Cleaning...")
print("-" * 80)

initial_count = len(df)

# Remove duplicates
df = df.drop_duplicates(subset=['comment_id'])
print(f" Removed {initial_count - len(df)} duplicates")

# Handle missing values
df = df.dropna(subset=['comment_body', 'author_username'])
print(f" Removed comments with missing text or author")

# Remove deleted/removed comments
df = df[~df['comment_body'].isin(['[deleted]', '[removed]', ''])]
print(f" Removed deleted/removed comments")

# Basic text cleaning
df['comment_body'] = df['comment_body'].str.strip()
df = df[df['comment_body'].str.len() > 10]  # At least 10 characters
print(f" Removed very short comments")

print(f"\n Clean dataset: {len(df):,} comments")


 STEP 2: Data Cleaning...
--------------------------------------------------------------------------------
 Removed 0 duplicates
 Removed comments with missing text or author
 Removed deleted/removed comments
 Removed very short comments

 Clean dataset: 24,301 comments


#STEP 3: Gender Inference

In [9]:

print("\n STEP 3: Gender Inference...")
print("-" * 80)
print("Method: Using gender-guesser (alternative to Genderize API)")
print("Combining: Username analysis + Subreddit heuristics")

# Initialize gender detector
detector = gender.Detector()

def infer_gender_from_username(username):
    """Extract potential name from username and infer gender"""
    if pd.isna(username):
        return 'unknown', 'none'
    
    # Try to extract name-like patterns from username
    username = str(username).lower()
    
    # Common name patterns in usernames
    import re
    # Remove numbers and special characters
    clean_name = re.sub(r'[^a-z]', '', username)
    
    # Try first part of username (often contains name)
    if len(clean_name) >= 3:
        # Try different lengths
        for length in [len(clean_name), 8, 6, 4]:
            name_candidate = clean_name[:length]
            gender_guess = detector.get_gender(name_candidate)
            if gender_guess in ['male', 'female']:
                return gender_guess, 'username_high'
            elif gender_guess in ['mostly_male', 'mostly_female']:
                return gender_guess.replace('mostly_', ''), 'username_medium'
    
    return 'unknown', 'none'

print("Inferring gender from usernames...")
gender_results = []

for idx, row in df.iterrows():
    username = row['author_username']
    subreddit = row['subreddit']
    
    # First try username
    username_gender, username_confidence = infer_gender_from_username(username)
    
    # If unknown, use subreddit as fallback (per proposal heuristic)
    if username_gender == 'unknown':
        if subreddit == 'AskWomen':
            final_gender = 'female'
            confidence = 'subreddit'
        elif subreddit == 'AskMen':
            final_gender = 'male'
            confidence = 'subreddit'
        else:
            final_gender = 'unknown'
            confidence = 'none'
    else:
        final_gender = username_gender
        confidence = username_confidence
    
    gender_results.append({
        'inferred_gender': final_gender,
        'gender_confidence': confidence
    })
    
    if (idx + 1) % 5000 == 0:
        print(f"  Processed {idx + 1:,}/{len(df):,}...")

# Add to dataframe
gender_df = pd.DataFrame(gender_results)
df = pd.concat([df.reset_index(drop=True), gender_df], axis=1)

print(f"\n✓ Gender inference complete!")
print(f"\nGender Distribution:")
gender_counts = df['inferred_gender'].value_counts()
for gender, count in gender_counts.items():
    pct = count / len(df) * 100
    print(f"  {gender.capitalize()}: {count:,} ({pct:.1f}%)")

print(f"\nConfidence Levels:")
conf_counts = df['gender_confidence'].value_counts()
for conf, count in conf_counts.items():
    pct = count / len(df) * 100
    print(f"  {conf}: {count:,} ({pct:.1f}%)")



 STEP 3: Gender Inference...
--------------------------------------------------------------------------------
Method: Using gender-guesser (alternative to Genderize API)
Combining: Username analysis + Subreddit heuristics
Inferring gender from usernames...
  Processed 5,000/24,301...
  Processed 10,000/24,301...
  Processed 15,000/24,301...
  Processed 20,000/24,301...
  Processed 25,000/24,301...

✓ Gender inference complete!

Gender Distribution:
  Male: 16,156 (66.5%)
  Female: 8,145 (33.5%)

Confidence Levels:
  subreddit: 24,301 (100.0%)



## STEP 4: Sentiment Analysis


In [10]:
 
print("\n STEP 4: Sentiment Analysis (VADER)...")
print("-" * 80)
print("Per proposal: 'sentiment analysis using VADER and TextBlob libraries'")
print("Using: VADER (Valence Aware Dictionary and sEntiment Reasoner)")

analyzer = SentimentIntensityAnalyzer()

print(f"\nAnalyzing sentiment for {len(df):,} comments...")

sentiment_results = []

for idx, row in df.iterrows():
    text = row['comment_body']
    
    # Get VADER scores
    scores = analyzer.polarity_scores(str(text))
    
    # Classify sentiment
    if scores['compound'] >= 0.05:
        label = 'Positive'
    elif scores['compound'] <= -0.05:
        label = 'Negative'
    else:
        label = 'Neutral'
    
    sentiment_results.append({
        'sentiment_score': scores['compound'],
        'sentiment_label': label,
        'sentiment_pos': scores['pos'],
        'sentiment_neu': scores['neu'],
        'sentiment_neg': scores['neg']
    })
    
    if (idx + 1) % 5000 == 0:
        print(f"  Analyzed {idx + 1:,}/{len(df):,}...")

# Add to dataframe
sentiment_df = pd.DataFrame(sentiment_results)
df = pd.concat([df.reset_index(drop=True), sentiment_df], axis=1)

print(f"\n✓ Sentiment analysis complete!")
print(f"\nOverall Sentiment Distribution:")
sent_counts = df['sentiment_label'].value_counts()
for label, count in sent_counts.items():
    pct = count / len(df) * 100
    print(f"  {label}: {count:,} ({pct:.1f}%)")

print(f"\nAverage Sentiment Score: {df['sentiment_score'].mean():.3f}")



 STEP 4: Sentiment Analysis (VADER)...
--------------------------------------------------------------------------------
Per proposal: 'sentiment analysis using VADER and TextBlob libraries'
Using: VADER (Valence Aware Dictionary and sEntiment Reasoner)

Analyzing sentiment for 24,301 comments...
  Analyzed 5,000/24,301...
  Analyzed 10,000/24,301...
  Analyzed 15,000/24,301...
  Analyzed 20,000/24,301...

✓ Sentiment analysis complete!

Overall Sentiment Distribution:
  Positive: 12,162 (50.0%)
  Negative: 7,207 (29.7%)
  Neutral: 4,932 (20.3%)

Average Sentiment Score: 0.142



## STEP 5: Gender-based Comparison 

In [11]:


print("\n" + "="*80)
print("STEP 5: GENDER-BASED ANALYSIS")
print("Research Question: How do men and women differ in evaluations?")
print("="*80)

# Filter to known genders only
df_gendered = df[df['inferred_gender'].isin(['male', 'female'])].copy()

print(f"\nAnalyzing {len(df_gendered):,} comments with known gender")
print(f"  Male: {(df_gendered['inferred_gender']=='male').sum():,}")
print(f"  Female: {(df_gendered['inferred_gender']=='female').sum():,}")




STEP 5: GENDER-BASED ANALYSIS
Research Question: How do men and women differ in evaluations?

Analyzing 24,301 comments with known gender
  Male: 16,156
  Female: 8,145




# 5.1: Sentiment Comparison


In [13]:

# Statistical test 
male_scores = df_gendered[df_gendered['inferred_gender']=='male']['sentiment_score']
female_scores = df_gendered[df_gendered['inferred_gender']=='female']['sentiment_score']

t_stat, p_value = stats.ttest_ind(male_scores, female_scores)

print(f"\n STATISTICAL TEST (T-Test):")
print(f"  t-statistic: {t_stat:.4f}")
print(f"  p-value: {p_value:.4f}")

if p_value < 0.001:
    print(f"   HIGHLY SIGNIFICANT (p < 0.001)")
elif p_value < 0.05:
    print(f"   SIGNIFICANT (p < 0.05)")
else:
    print(f"  → Not statistically significant (p >= 0.05)")

# Effect size
mean_diff = female_scores.mean() - male_scores.mean()
pooled_std = np.sqrt((male_scores.std()**2 + female_scores.std()**2) / 2)
cohens_d = mean_diff / pooled_std

print(f"\n  Effect size (Cohen's d): {cohens_d:.3f}")
if abs(cohens_d) < 0.2:
    print(f"     Small effect")
elif abs(cohens_d) < 0.5:
    print(f"     Medium effect")
else:
    print(f"     Large effect")



 STATISTICAL TEST (T-Test):
  t-statistic: 5.3900
  p-value: 0.0000
   HIGHLY SIGNIFICANT (p < 0.001)

  Effect size (Cohen's d): -0.071
     Small effect



# 5.2: Topic Focus - Culture vs Pay 

In [14]:


print("\n" + "-"*80)
print("5.2: TOPIC FOCUS - Testing Hypothesis")
print("Hypothesis: Women prioritize culture; Men prioritize pay")
print("-"*80)

# Define keywords (from proposal)
CULTURE_KEYWORDS = [
    'culture', 'respect', 'toxic', 'supportive', 'inclusive', 'discrimination',
    'harassment', 'sexism', 'atmosphere', 'environment', 'team', 'colleague',
    'manager', 'boss', 'leadership', 'communication', 'work-life', 'balance',
    'flexible', 'flexibility', 'remote', 'mentor', 'mentorship', 'support'
]

PAY_KEYWORDS = [
    'pay', 'salary', 'wage', 'compensation', 'bonus', 'raise', 'money',
    'benefits', 'insurance', 'pension', '401k', 'stock', 'equity', 'pto',
    'vacation', 'time off', 'promotion', 'advancement', 'career', 'opportunity'
]

def contains_keywords(text, keywords):
    """Check if text contains any keywords"""
    if pd.isna(text):
        return False
    text_lower = str(text).lower()
    return any(keyword in text_lower for keyword in keywords)

# Categorize comments
df_gendered['mentions_culture'] = df_gendered['comment_body'].apply(
    lambda x: contains_keywords(x, CULTURE_KEYWORDS)
)
df_gendered['mentions_pay'] = df_gendered['comment_body'].apply(
    lambda x: contains_keywords(x, PAY_KEYWORDS)
)

print("\nTOPIC MENTION RATES:")

for gender in ['male', 'female']:
    gender_df = df_gendered[df_gendered['inferred_gender'] == gender]
    
    culture_pct = (gender_df['mentions_culture'].sum() / len(gender_df)) * 100
    pay_pct = (gender_df['mentions_pay'].sum() / len(gender_df)) * 100
    
    print(f"\n{gender.upper()}:")
    print(f"  Mention culture/environment: {culture_pct:.1f}%")
    print(f"  Mention pay/benefits:        {pay_pct:.1f}%")
    print(f"  Ratio (culture/pay):         {culture_pct/pay_pct if pay_pct > 0 else 0:.2f}")

# Calculate differences
female_df = df_gendered[df_gendered['inferred_gender'] == 'female']
male_df = df_gendered[df_gendered['inferred_gender'] == 'male']

women_culture_pct = (female_df['mentions_culture'].sum() / len(female_df)) * 100
men_culture_pct = (male_df['mentions_culture'].sum() / len(male_df)) * 100
women_pay_pct = (female_df['mentions_pay'].sum() / len(female_df)) * 100
men_pay_pct = (male_df['mentions_pay'].sum() / len(male_df)) * 100

print(f"\n COMPARISON:")
culture_diff = women_culture_pct - men_culture_pct
pay_diff = women_pay_pct - men_pay_pct

print(f"  Women mention culture {abs(culture_diff):.1f}% {'MORE' if culture_diff > 0 else 'LESS'} than men")
print(f"  Women mention pay     {abs(pay_diff):.1f}% {'MORE' if pay_diff > 0 else 'LESS'} than men")

print(f"\n HYPOTHESIS TEST:")
if culture_diff > 5:
    print(f"   CONFIRMED: Women focus MORE on culture ({culture_diff:+.1f}%)")
else:
    print(f"   NOT SUPPORTED: No major difference in culture focus")

if pay_diff < -5:
    print(f"   CONFIRMED: Men focus MORE on pay ({abs(pay_diff):.1f}%)")
else:
    print(f"   NOT SUPPORTED: No major difference in pay focus")




--------------------------------------------------------------------------------
5.2: TOPIC FOCUS - Testing Hypothesis
Hypothesis: Women prioritize culture; Men prioritize pay
--------------------------------------------------------------------------------

TOPIC MENTION RATES:

MALE:
  Mention culture/environment: 16.9%
  Mention pay/benefits:        13.9%
  Ratio (culture/pay):         1.21

FEMALE:
  Mention culture/environment: 30.5%
  Mention pay/benefits:        9.6%
  Ratio (culture/pay):         3.18

 COMPARISON:
  Women mention culture 13.6% MORE than men
  Women mention pay     4.4% LESS than men

 HYPOTHESIS TEST:
   CONFIRMED: Women focus MORE on culture (+13.6%)
   NOT SUPPORTED: No major difference in pay focus


In [16]:

print("\n" + "="*80)
print("STEP 6: Creating Visualizations")
print("="*80)

sns.set_style("whitegrid")
fig = plt.figure(figsize=(20, 12))
fig.suptitle('Reddit Workplace Sentiment Analysis - Gender Comparison', 
             fontsize=18, fontweight='bold', y=0.995) 

# Chart 1: Sentiment Distribution by Gender
ax1 = plt.subplot(2, 3, 1)
male_counts = male_df['sentiment_label'].value_counts()
female_counts = female_df['sentiment_label'].value_counts()
x = np.arange(3)
width = 0.35
categories = ['Positive', 'Negative', 'Neutral']
male_values = [male_counts.get(cat, 0) for cat in categories]
female_values = [female_counts.get(cat, 0) for cat in categories]
ax1.bar(x - width/2, male_values, width, label='Men', color='#3498db', alpha=0.8)
ax1.bar(x + width/2, female_values, width, label='Women', color='#e74c3c', alpha=0.8)
ax1.set_title('Sentiment Distribution by Gender', fontsize=14, fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(categories)
ax1.set_ylabel('Number of Comments')
ax1.legend()
ax1.grid(alpha=0.3, axis='y')

# Chart 2: Average Sentiment Score
ax2 = plt.subplot(2, 3, 2)
genders = ['Men', 'Women']
scores = [male_scores.mean(), female_scores.mean()]
colors = ['#3498db', '#e74c3c']
bars = ax2.bar(genders, scores, color=colors, alpha=0.8)
ax2.set_title('Average Sentiment Score', fontsize=14, fontweight='bold')
ax2.set_ylabel('Sentiment Score')
ax2.set_ylim([-0.2, 0.3])
ax2.axhline(0, color='black', linestyle='-', linewidth=0.8)
for i, v in enumerate(scores):
    ax2.text(i, v + 0.01, f'{v:.3f}', ha='center', fontweight='bold', fontsize=12)
ax2.grid(alpha=0.3, axis='y')

# Chart 3: Sentiment Percentages
ax3 = plt.subplot(2, 3, 3)
male_pcts = male_df['sentiment_label'].value_counts(normalize=True) * 100
female_pcts = female_df['sentiment_label'].value_counts(normalize=True) * 100
categories = ['Positive', 'Negative', 'Neutral']
male_pct_values = [male_pcts.get(cat, 0) for cat in categories]
female_pct_values = [female_pcts.get(cat, 0) for cat in categories]
x = np.arange(3)
width = 0.35
ax3.bar(x - width/2, male_pct_values, width, label='Men', color='#3498db', alpha=0.8)
ax3.bar(x + width/2, female_pct_values, width, label='Women', color='#e74c3c', alpha=0.8)
ax3.set_title('Sentiment Percentage by Gender', fontsize=14, fontweight='bold')
ax3.set_xticks(x)
ax3.set_xticklabels(categories)
ax3.set_ylabel('Percentage (%)')
ax3.legend()
ax3.grid(alpha=0.3, axis='y')

# Chart 4: Topic Focus - Culture vs Pay
ax4 = plt.subplot(2, 3, 4)
topics = ['Culture\nMentions', 'Pay\nMentions']
women_values = [women_culture_pct, women_pay_pct]
men_values = [men_culture_pct, men_pay_pct]
x = np.arange(2)
width = 0.35
ax4.bar(x - width/2, men_values, width, label='Men', color='#3498db', alpha=0.8)
ax4.bar(x + width/2, women_values, width, label='Women', color='#e74c3c', alpha=0.8)
ax4.set_title('Topic Focus: Culture vs Pay (% mentioning)', fontsize=14, fontweight='bold')
ax4.set_xticks(x)
ax4.set_xticklabels(topics)
ax4.set_ylabel('Percentage (%)')
ax4.legend()
ax4.grid(alpha=0.3, axis='y')
for i in range(2):
    ax4.text(i - width/2, men_values[i] + 0.5, f'{men_values[i]:.1f}%', 
            ha='center', fontsize=10)
    ax4.text(i + width/2, women_values[i] + 0.5, f'{women_values[i]:.1f}%', 
            ha='center', fontsize=10)

# Chart 5: Score Distribution Histogram
ax5 = plt.subplot(2, 3, 5)
ax5.hist(male_scores, bins=50, alpha=0.5, label='Men', color='#3498db', density=True)
ax5.hist(female_scores, bins=50, alpha=0.5, label='Women', color='#e74c3c', density=True)
ax5.axvline(male_scores.mean(), color='#3498db', linestyle='--', linewidth=2)
ax5.axvline(female_scores.mean(), color='#e74c3c', linestyle='--', linewidth=2)
ax5.set_title('Sentiment Score Distribution', fontsize=14, fontweight='bold')
ax5.set_xlabel('Sentiment Score')
ax5.set_ylabel('Density')
ax5.legend()
ax5.grid(alpha=0.3)

# Chart 6: Negative Sentiment Analysis
ax6 = plt.subplot(2, 3, 6)
women_neg = female_df[female_df['sentiment_label'] == 'Negative']
men_neg = male_df[male_df['sentiment_label'] == 'Negative']
topics_neg = ['Culture', 'Pay']
women_neg_values = [
    (women_neg['mentions_culture'].sum() / len(women_neg) * 100),
    (women_neg['mentions_pay'].sum() / len(women_neg) * 100)
]
men_neg_values = [
    (men_neg['mentions_culture'].sum() / len(men_neg) * 100),
    (men_neg['mentions_pay'].sum() / len(men_neg) * 100)
]
x = np.arange(2)
width = 0.35
ax6.bar(x - width/2, men_neg_values, width, label='Men', color='#3498db', alpha=0.8)
ax6.bar(x + width/2, women_neg_values, width, label='Women', color='#e74c3c', alpha=0.8)
ax6.set_title('When NEGATIVE: What They Discuss', fontsize=14, fontweight='bold')
ax6.set_xticks(x)
ax6.set_xticklabels(topics_neg)
ax6.set_ylabel('% of Negative Comments')
ax6.legend()
ax6.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('reddit_sentiment_analysis_final.png', dpi=300, bbox_inches='tight')
print("\n Saved: reddit_sentiment_analysis_final.png")
plt.close()



STEP 6: Creating Visualizations

 Saved: reddit_sentiment_analysis_final.png
