In [3]:
!pip install vaderSentiment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import warnings
warnings.filterwarnings('ignore')

# Install required packages (run once)
# pip install textblob vaderSentiment

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

print("="*80)
print("MIMIC-III SENTIMENT ANALYSIS")
# LOAD DATA
print(" Loading data...")

df = pd.read_csv('MIMICIII_Cleaned_Merged.csv')
print(f"✓ Loaded {len(df):,} records")

# Filter to top 5 ethnicities
top_5_ethnicities = [
    'WHITE',
    'BLACK/AFRICAN AMERICAN',
    'HISPANIC OR LATINO',
    'OTHER',
    'ASIAN'
]

df_filtered = df[df['ETHNICITY'].isin(top_5_ethnicities)].copy()
print(f"✓ Filtered to {len(df_filtered):,} records across 5 ethnicities")

# SENTIMENT ANALYSIS METHODS
print("Initializing sentiment analyzers")

# Initialize VADER (Valence Aware Dictionary and sEntiment Reasoner)
# VADER is designed for social media but works well for clinical text
vader_analyzer = SentimentIntensityAnalyzer()

print(" VADER sentiment analyzer initialized")
print(" TextBlob sentiment analyzer initialized")

#  EXTRACT SENTIMENT FEATURES
print("\n" + "="*80)
print("STEP 3: Extracting sentiment features from clinical notes...")
print("="*80)
print("This may take 10-15 minutes for 42,926 notes...")

def get_vader_sentiment(text):
    """Get VADER sentiment scores"""
    if pd.isna(text) or len(str(text).strip()) == 0:
        return {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0}
    try:
        scores = vader_analyzer.polarity_scores(str(text))
        return scores
    except:
        return {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0}

def get_textblob_sentiment(text):
    """Get TextBlob sentiment (polarity and subjectivity)"""
    if pd.isna(text) or len(str(text).strip()) == 0:
        return {'polarity': 0, 'subjectivity': 0}
    try:
        blob = TextBlob(str(text))
        return {
            'polarity': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity
        }
    except:
        return {'polarity': 0, 'subjectivity': 0}

def classify_sentiment(compound_score):
    """Classify sentiment based on VADER compound score"""
    if compound_score >= 0.05:
        return 'Positive'
    elif compound_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Extract VADER sentiment
print("\nExtracting VADER sentiment scores...")
vader_results = df_filtered['TEXT'].apply(get_vader_sentiment)

df_filtered['vader_negative'] = vader_results.apply(lambda x: x['neg'])
df_filtered['vader_neutral'] = vader_results.apply(lambda x: x['neu'])
df_filtered['vader_positive'] = vader_results.apply(lambda x: x['pos'])
df_filtered['vader_compound'] = vader_results.apply(lambda x: x['compound'])
df_filtered['vader_sentiment'] = df_filtered['vader_compound'].apply(classify_sentiment)

print(" sentiment extracted")

# Extract TextBlob sentiment
print("\nExtracting TextBlob sentiment scores")
textblob_results = df_filtered['TEXT'].apply(get_textblob_sentiment)

df_filtered['textblob_polarity'] = textblob_results.apply(lambda x: x['polarity'])
df_filtered['textblob_subjectivity'] = textblob_results.apply(lambda x: x['subjectivity'])

print(" sentiment extracted")

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
MIMIC-III SENTIMENT ANALYSIS
 Loading data...
✓ Loaded 50,000 records
✓ Filtered to 42,926 records across 5 ethnicities
Initializing sentiment analyzers
 VADER sentiment analyzer initialized
 TextBlob sentiment analyzer initialized

STEP 3: Extracting sentiment features from clinical notes...
This may take 10-15 minutes for 42,926 notes...

Extracting VADER sentiment scores...
 sentiment extracted

Extracting TextBlob sentiment scores
 sentiment extracted


In [7]:
# STATISTICAL ANALYSIS (ANOVA)
print(" Statistical tests for sentiment differences")

# Prepare groups
groups = {eth: df_filtered[df_filtered['ETHNICITY'] == eth] for eth in top_5_ethnicities}

# Features to test
sentiment_features = [
    ('vader_negative', 'VADER Negative Score'),
    ('vader_neutral', 'VADER Neutral Score'),
    ('vader_positive', 'VADER Positive Score'),
    ('vader_compound', 'VADER Compound Score'),
    ('textblob_polarity', 'TextBlob Polarity'),
    ('textblob_subjectivity', 'TextBlob Subjectivity')
]

sentiment_anova_results = []

for feature, feature_name in sentiment_features:
    # Extract data for each group
    group_data = [groups[eth][feature].dropna() for eth in top_5_ethnicities]
    
    # Run ANOVA
    f_stat, p_value = f_oneway(*group_data)
    
    # Calculate effect size (eta-squared)
    all_data = df_filtered[feature].dropna()
    grand_mean = all_data.mean()
    ss_total = np.sum((all_data - grand_mean) ** 2)
    ss_between = sum(len(g) * (g.mean() - grand_mean) ** 2 for g in group_data)
    eta_squared = ss_between / ss_total if ss_total > 0 else 0
    
    df_between = len(top_5_ethnicities) - 1
    df_within = len(all_data) - len(top_5_ethnicities)
    
    sentiment_anova_results.append({
        'Feature': feature_name,
        'F-statistic': f_stat,
        'df_between': df_between,
        'df_within': df_within,
        'p-value': p_value,
        'eta_squared': eta_squared,
        'Significant': 'Yes' if p_value < 0.05 else 'No'
    })

sentiment_anova_df = pd.DataFrame(sentiment_anova_results)
sentiment_anova_df['F-statistic'] = sentiment_anova_df['F-statistic'].round(3)
sentiment_anova_df['p-value'] = sentiment_anova_df['p-value'].round(4)
sentiment_anova_df['eta_squared'] = sentiment_anova_df['eta_squared'].round(6)

print("\n ANOVA Results for Sentiment Features")
print(sentiment_anova_df.to_string(index=False))


 Statistical tests for sentiment differences

 ANOVA Results for Sentiment Features
              Feature  F-statistic  df_between  df_within  p-value  eta_squared Significant
 VADER Negative Score       59.548           4      42921      0.0     0.005519         Yes
  VADER Neutral Score       30.832           4      42921      0.0     0.002865         Yes
 VADER Positive Score       22.059           4      42921      0.0     0.002052         Yes
 VADER Compound Score       61.385           4      42921      0.0     0.005688         Yes
    TextBlob Polarity        7.757           4      42921      0.0     0.000722         Yes
TextBlob Subjectivity       47.622           4      42921      0.0     0.004419         Yes


In [None]:

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

# POST-HOC TESTS
print(" Post-hoc comparisons for significant features")

# Post-hoc for VADER Compound (overall sentiment)
if sentiment_anova_df[sentiment_anova_df['Feature'] == 'VADER Compound Score']['Significant'].values[0] == 'Yes':
    print(" VADER Compound Score")
    tukey_compound = pairwise_tukeyhsd(
        endog=df_filtered['vader_compound'],
        groups=df_filtered['ETHNICITY'],
        alpha=0.05
    )
    print(tukey_compound)

# Post-hoc for TextBlob Polarity
if sentiment_anova_df[sentiment_anova_df['Feature'] == 'TextBlob Polarity']['Significant'].values[0] == 'Yes':
    print("\n\nPost-hoc: TextBlob Polarity")
    print("-" * 80)
    tukey_polarity = pairwise_tukeyhsd(
        endog=df_filtered['textblob_polarity'],
        groups=df_filtered['ETHNICITY'],
        alpha=0.05
    )
    print(tukey_polarity)

# VISUALIZATIONS
print(" Creating sentiment visualizations")
# Mapping dictionary for ethnicities
Ethnicity_Label = {
    "WHITE": "White",
    "BLACK/AFRICAN AMERICAN": "Black",
    "ASIAN": "Asian",
    "HISPANIC OR LATINO": "Hispanic",
    "OTHER": "Other"
}

df_filtered['Ethnicity_Short'] = df_filtered['ETHNICITY'].map(Ethnicity_Label)

# Figure 1: VADER Sentiment Components

analyzer = SentimentIntensityAnalyzer()

# 2 — Function to compute VADER sentiment
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
def get_vader_scores(text):
    if pd.isna(text):
        return pd.Series([0,0,0,0])
    scores = analyzer.polarity_scores(str(text))
    return pd.Series([
        scores['neg'],
        scores['pos'],
        scores['neu'],
        scores['compound']
    ])
    scores = analyzer.polarity_scores(str(text))
    return pd.Series([
        scores['neg'],
        scores['pos'],
        scores['neu'],
        scores['compound']
    ])

df_filtered[['VADER_Negative', 'VADER_Positive', 'VADER_Neutral', 'VADER_Compound']] = \
    df_filtered['TEXT'].apply(get_vader_scores)


sentiment_df = df_filtered.groupby('Ethnicity_Short')[[
    'VADER_Negative', 
    'VADER_Positive', 
    'VADER_Neutral', 
    'VADER_Compound'
]].mean().reset_index()

# Negative sentiment
sns.barplot(data=sentiment_df, x='Ethnicity', y='VADER_Negative', 
            palette='Reds_d', ax=axes[0,0])
axes[0,0].set_title('VADER Negative Sentiment by Ethnicity', fontsize=12, fontweight='bold')
axes[0,0].set_ylabel('Mean Score', fontsize=10)
axes[0,0].tick_params(axis='x', rotation=45)

# Neutral sentiment
sns.barplot(data=sentiment_df, x='Ethnicity', y='VADER_Neutral',
            palette='Greys_d', ax=axes[0,1])
axes[0,1].set_title('VADER Neutral Sentiment by Ethnicity', fontsize=12, fontweight='bold')
axes[0,1].set_ylabel('Mean Score', fontsize=10)
axes[0,1].tick_params(axis='x', rotation=45)

# Positive sentiment
sns.barplot(data=sentiment_df, x='Ethnicity', y='VADER_Positive',
            palette='Greens_d', ax=axes[1,0])
axes[1,0].set_title('VADER Positive Sentiment by Ethnicity', fontsize=12, fontweight='bold')
axes[1,0].set_ylabel('Mean Score', fontsize=10)
axes[1,0].tick_params(axis='x', rotation=45)

# Compound sentiment
sns.barplot(data=sentiment_df, x='Ethnicity', y='VADER_Compound',
            palette='Blues_d', ax=axes[1,1])
axes[1,1].set_title('VADER Compound Sentiment by Ethnicity', fontsize=12, fontweight='bold')
axes[1,1].set_ylabel('Mean Score', fontsize=10)
axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('vader_sentiment_by_ethnicity.png', dpi=300, bbox_inches='tight')
print("vader_sentiment_by_ethnicity.png")
plt.show()

# Figure 2: TextBlob Sentiment
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

sns.barplot(data=sentiment_df, x='Ethnicity', y='TextBlob_Polarity',
            palette='RdYlGn', ax=ax1)
ax1.set_title('TextBlob Polarity by Ethnicity', fontsize=14, fontweight='bold')
ax1.set_ylabel('Polarity Score (-1 to +1)', fontsize=12)
ax1.tick_params(axis='x', rotation=45)
ax1.axhline(y=0, color='black', linestyle='--', linewidth=0.5)

sns.barplot(data=sentiment_df, x='Ethnicity', y='TextBlob_Subjectivity',
            palette='Purples_d', ax=ax2)
ax2.set_title('TextBlob Subjectivity by Ethnicity', fontsize=14, fontweight='bold')
ax2.set_ylabel('Subjectivity Score (0 to 1)', fontsize=12)
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('textblob_sentiment_by_ethnicity.png', dpi=300, bbox_inches='tight')
print(" textblob_sentiment_by_ethnicity.png")
plt.show()

# Figure 3: Sentiment Category Distribution
fig, ax = plt.subplots(figsize=(12, 6))

sentiment_counts = df_filtered.groupby(['Ethnicity_Short', 'vader_sentiment']).size().unstack(fill_value=0)
sentiment_counts_pct = sentiment_counts.div(sentiment_counts.sum(axis=1), axis=0) * 100

sentiment_counts_pct.plot(kind='bar', stacked=True, ax=ax, 
                         color=['#d62728', '#7f7f7f', '#2ca02c'])
ax.set_title('Sentiment Distribution by Ethnicity (%)', fontsize=14, fontweight='bold')
ax.set_xlabel('Ethnicity', fontsize=12)
ax.set_ylabel('Percentage', fontsize=12)
ax.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('sentiment_distribution_by_ethnicity.png', dpi=300, bbox_inches='tight')
print(" sentiment_distribution_by_ethnicity.png")
plt.show()

 Post-hoc comparisons for significant features
 VADER Compound Score
                Multiple Comparison of Means - Tukey HSD, FWER=0.05                 
        group1                 group2         meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------------
                 ASIAN BLACK/AFRICAN AMERICAN  -0.3417    0.0 -0.4146 -0.2687   True
                 ASIAN     HISPANIC OR LATINO  -0.2695    0.0 -0.3552 -0.1839   True
                 ASIAN                  OTHER  -0.1625    0.0 -0.2505 -0.0744   True
                 ASIAN                  WHITE  -0.3304    0.0 -0.3973 -0.2635   True
BLACK/AFRICAN AMERICAN     HISPANIC OR LATINO   0.0721 0.0147  0.0094  0.1348   True
BLACK/AFRICAN AMERICAN                  OTHER   0.1792    0.0  0.1132  0.2452   True
BLACK/AFRICAN AMERICAN                  WHITE   0.0113 0.8823 -0.0216  0.0441  False
    HISPANIC OR LATINO                  OTHER   0.1071 0.0023  0.0273  0.1868   T