# Emotional and Linguistic Framing of Digital Detox

### Notebook 4: Emotion and Sentiment Analysis of Text Using TF-IDF, VAD Lexicon, TextBlob, and PCA

This notebook performs emotion and sentiment analysis on Reddit posts using:

- **TF-IDF + VAD Lexicon**: Captures emotional norms (Valence, Arousal, Dominance) at word level.
- **TextBlob**: Provides polarity and subjectivity scores on raw text.
- **PCA**: Reduces the high-dimensional emotion space into two principal components for visualization.

The analysis helps us understand how emotions are expressed across topics in posts labeled as digital detox versus general/control.


In [1]:
# required packages
!pip install pandas scikit-learn textblob seaborn

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import ast

Collecting textblob
  Using cached textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Using cached textblob-0.19.0-py3-none-any.whl (624 kB)
Installing collected packages: textblob
Successfully installed textblob-0.19.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Read the topic-annotated detox and control posts (one CSV per corpus, same order as the names).
detox_df, control_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/home/jovyan/XXX/Back up/XXX/detox_with_topics.csv",
        "/home/jovyan/XXX/Back up/XXX/control_with_topics.csv",
    )
)

### Calculate VAD from Tokens (unweighted)

In [3]:
# Adapted from the data-classification notebook.
# Read the tab-separated NRC VAD lexicon, lower-case its column names and
# index it by term.
# NOTE(review): read_csv's default NA parsing turns literal cells such as
# "null" or "nan" into NaN -- confirm whether the lexicon has such terms.
vad_nrc = (
    pd.read_csv("/home/jovyan/XXX/Back up/NRC-VAD-Lexicon-v2.1.txt", sep='\t')
    .rename(columns=str.lower)
    .set_index('term')
)

# Lower-case the terms themselves, then discard entries missing any of the three norms.
vad_nrc.index = vad_nrc.index.str.lower()
vad_nrc = vad_nrc.dropna(subset=['valence', 'arousal', 'dominance'])

# Second name for the same frame; the token-level lookup uses this one.
vad_norms_data = vad_nrc

In [4]:
# ensure VAD lexicon index is lowercase so lookups with lower-cased tokens can match
# (the lexicon-loading cell already lower-cases this index, so on a top-to-bottom
# run this repeat changes nothing; it only matters if the index was altered in between)
vad_norms_data.index = vad_norms_data.index.str.lower()

def avg_vad_for_tokens(tokens, lexicon=None):
    """Average the VAD norms of the distinct tokens that appear in the lexicon.

    Parameters
    ----------
    tokens : list of str, or str
        Tokens of one document. A string is treated as the ``repr`` of a token
        list (the form a list column takes after a CSV round-trip) and is
        parsed first; iterating the raw string would yield single characters
        and score letters instead of words.
    lexicon : pandas.DataFrame, optional
        Frame indexed by lower-case term with 'valence', 'arousal' and
        'dominance' columns. Defaults to the module-level ``vad_norms_data``.

    Returns
    -------
    pandas.Series
        Mean 'valence', 'arousal' and 'dominance' over the matched lexicon
        rows (each distinct token counted once); all NaN when nothing matches
        or the input cannot be read as a token list.
    """
    vad_columns = ['valence', 'arousal', 'dominance']
    if lexicon is None:
        lexicon = vad_norms_data

    if isinstance(tokens, str):
        try:
            tokens = ast.literal_eval(tokens)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            tokens = []
    if not isinstance(tokens, (list, tuple, set)):
        # NaN, None, or a parsed scalar: nothing to look up.
        tokens = []

    lowered = [t.lower() for t in tokens if isinstance(t, str)]
    # Select only the three norm columns so extra lexicon columns cannot
    # change the shape of the returned Series.
    matched = lexicon.loc[lexicon.index.intersection(lowered), vad_columns]
    if matched.empty:
        # Float NaN keeps the resulting DataFrame columns numeric.
        return pd.Series(np.nan, index=vad_columns)
    return matched.mean()

# Score both corpora: each post's averaged norms become three new columns.
# NOTE(review): 'body_tokens' is only parsed into Python lists in the later
# TF-IDF preparation cell -- confirm avg_vad_for_tokens receives the form it expects here.
for df in (detox_df, control_df):
    vad_per_post = df['body_tokens'].apply(avg_vad_for_tokens)
    df[['valence', 'arousal', 'dominance']] = vad_per_post

### TF-IDF Vectorization

In [5]:
# convert a string representation of a list back into list
def str_to_list(s):
    """Parse the ``repr`` of a list (e.g. ``"['a', 'b']"``) into a list.

    Parameters
    ----------
    s : str or list
        Text to parse. A value that is already a list is returned unchanged,
        so re-running the conversion on an already-parsed column does not
        wipe it.

    Returns
    -------
    list
        The parsed list (a parsed tuple is converted to a list). An empty list
        is returned when ``s`` cannot be parsed as a Python literal or parses
        to something that is not a list/tuple (NaN, None, a number, ...).
    """
    if isinstance(s, list):
        return s
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # The documented failure modes of literal_eval; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return []
    return list(parsed) if isinstance(parsed, (list, tuple)) else []

def fix_token_string(token_string):
    """Rebuild a space-separated text from a stringified token list.

    The input is parsed with ``str_to_list``; every whitespace character
    inside an individual token is removed, and the resulting tokens are
    joined with single spaces.
    """
    squeezed_tokens = []
    for token in str_to_list(token_string):
        pieces = token.split()
        squeezed_tokens.append(''.join(pieces))
    return ' '.join(squeezed_tokens)

# Parse the stored token lists and rebuild one space-joined text per post.
for df in (detox_df, control_df):
    df['body_tokens'] = df['body_tokens'].apply(str_to_list)
    df['clean_text'] = df['body_tokens'].apply(' '.join)

# Stack control posts first, then detox posts, and tag every row with its
# source corpus (the label order mirrors the concatenation order).
combined_df = pd.concat([control_df, detox_df], ignore_index=True)
labels = ['control'] * len(control_df) + ['detox'] * len(detox_df)
combined_df['label'] = labels
all_texts = combined_df['clean_text'].tolist()

### Combine TF-IDF × VAD (VAD-weighted TF-IDF)

In [6]:
# Fit TF-IDF over every post and expose the weights as a dense
# documents x vocabulary frame.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=2)
tfidf_matrix = vectorizer.fit_transform(all_texts)
vocab = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vocab)

# Keep only vocabulary terms that also have an entry in the VAD lexicon,
# with the lexicon rows taken in the same term order as the TF-IDF columns.
common_words = df_tfidf.columns.intersection(vad_nrc.index)
tfidf_common = df_tfidf.loc[:, common_words]
vad_common = vad_nrc.loc[common_words]

# Scale each term's TF-IDF column by that term's norm, one frame per dimension.
tfidf_valence = tfidf_common.mul(vad_common['valence'].to_numpy(), axis='columns')
tfidf_arousal = tfidf_common.mul(vad_common['arousal'].to_numpy(), axis='columns')
tfidf_dominance = tfidf_common.mul(vad_common['dominance'].to_numpy(), axis='columns')

# Sum across terms to get one score per document and dimension.
df_vad_weighted = pd.concat(
    [
        tfidf_valence.sum(axis=1),
        tfidf_arousal.sum(axis=1),
        tfidf_dominance.sum(axis=1),
    ],
    axis=1,
    keys=['valence', 'arousal', 'dominance'],
)

### TextBlob sentiment code

In [7]:
# Apply textblob in for each row of a dataframe. (n.d.). Stack Overflow. https://stackoverflow.com/questions/43485469/apply-textblob-in-for-each-row-of-a-dataframe
# Score each cleaned text once with TextBlob and read both polarity and
# subjectivity from that single result, rather than analysing every text twice.
sentiments = [TextBlob(text).sentiment for text in combined_df['clean_text']]
combined_df['polarity'] = [sentiment.polarity for sentiment in sentiments]
combined_df['subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

# Copy sentiment, corpus label and topic onto the document-level VAD frame.
# Values are copied positionally (.values), so both frames must hold the same
# documents in the same order.
df_vad_weighted['polarity'] = combined_df['polarity'].values
df_vad_weighted['subjectivity'] = combined_df['subjectivity'].values
df_vad_weighted['label'] = combined_df['label'].values
df_vad_weighted['topic'] = combined_df['topic'].values

### PCA

In [8]:
# Adapted from the NLP revision notebook.
# Keep only documents that have all three VAD scores.
# NOTE(review): the scores come from .sum(axis=1), which gives 0.0 rather than
# NaN for a document with no lexicon terms, so this may drop nothing -- confirm
# that is intended.
df_vad_clean = df_vad_weighted.dropna(subset=['valence', 'arousal', 'dominance'])

# Project the three VAD dimensions onto two principal components.
vad_features = df_vad_clean.loc[:, ['valence', 'arousal', 'dominance']]
pca = PCA(n_components=2)
vad_pca = pca.fit_transform(vad_features)

# Tabulate the component scores next to each document's label and topic.
df_pca = pd.DataFrame(vad_pca, columns=['PC1', 'PC2']).assign(
    label=df_vad_clean['label'].values,
    topic=df_vad_clean['topic'].values,
)

In [10]:
# Write both result tables to CSV in the working directory, without the row index.
for frame, out_name in (
    (df_vad_weighted, "df_vad_weighted_with_sentiment.csv"),
    (df_pca, "df_vad_pca.csv"),
):
    frame.to_csv(out_name, index=False)