# Linear Regression Analysis: Sentiment and Economic Anxiety

This notebook performs a minimal linear regression analysis to test two hypotheses:
1.  **Sentiment in lyrics has decreased significantly over the years.** (`lm(sentiment ~ year)`)
2.  **Economic anxiety in songs has increased over the years.** (`lm(economic_anxiety ~ year)`)

We use the lyric dataset from `recession_pop_index.ipynb` and the economic frequency data from `freq_economic_by_year.csv`. Sentiment modeling is based on the approach in `recession_pop_analysis_classify.ipynb`.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import os
import ast
import re
from scipy import stats
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

# Set plot style
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

# 1. Load Lyrics Data (from recession_pop_index.ipynb source)
print("Loading lyrics dataset...")
path = kagglehub.dataset_download("rhaamrozenberg/billboards-top-100-song-1946-to-2022-lyrics")

# os.listdir() returns entries in arbitrary order, so sort to pick the same
# file on every run, and fail with a clear message when no CSV is present.
csv_files = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in downloaded dataset directory: {path}")
csv_path = os.path.join(path, csv_files[0])
df_lyrics = pd.read_csv(csv_path)

# Map the dataset's column headers onto the short names used in the rest of the notebook
rename_map = {
    'Hot100 Ranking Year': 'year',
    'Hot100 Rank': 'rank',
    'Lyrics': 'lyrics',
    'Song': 'title',
    'Artist Names': 'artist'
}
df_lyrics = df_lyrics.rename(columns=rename_map)

# Clean Lyrics
def clean_lyrics(text):
    """Normalize one raw lyrics value into lowercase plain text.

    Args:
        text: Raw lyrics cell. May be a plain string, the string form of a
            Python list of tokens (e.g. "['la', 'la']"), or a non-string
            value such as NaN.

    Returns:
        The cleaned string: lowercased, bracketed spans such as "[chorus]"
        removed, every character other than a-z, 0-9, whitespace and
        . , ' ? ! removed, and runs of whitespace collapsed to one space.
        Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""

    stripped = text.strip()
    if stripped.startswith('[') and stripped.endswith(']'):
        try:
            words = ast.literal_eval(stripped)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a Python literal (e.g. "[Intro] ... [Outro]"); treat as plain text.
            words = None
        if isinstance(words, list):
            # Tokens may be non-strings (e.g. numbers); str() keeps join() from raising.
            text = ' '.join(map(str, words))

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)  # drop section markers like [chorus]
    text = re.sub(r'[^a-z0-9\s\.,\'\?!]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_lyrics['lyrics_clean'] = df_lyrics['lyrics'].apply(clean_lyrics)
df_lyrics['year'] = pd.to_numeric(df_lyrics['year'], errors='coerce')

# clean_lyrics() returns "" (never NaN) for missing lyrics, so dropna() on
# 'lyrics_clean' cannot remove them. Filter empty strings explicitly; otherwise
# songs without lyrics are scored by the sentiment model and enter the yearly means.
has_lyrics = df_lyrics['lyrics_clean'].str.len() > 0
has_year = df_lyrics['year'].notna()
# .copy() yields an independent frame so later column assignments do not write to a slice.
df_lyrics = df_lyrics[has_lyrics & has_year].copy()
df_lyrics['year'] = df_lyrics['year'].astype(int)

# Restrict to 2000-2022, the range the economic frequency data is stated to cover,
# so both series span the same years.
df_lyrics = df_lyrics[(df_lyrics['year'] >= 2000) & (df_lyrics['year'] <= 2022)].copy()

print(f"Lyrics dataset shape (2000-2022): {df_lyrics.shape}")

# 2. Load Economic Anxiety Data
print("Loading economic anxiety data...")
df_econ = pd.read_csv("freq_economic_by_year.csv")
print(f"Economic data shape: {df_econ.shape}")
df_econ.head()

In [None]:
# 3. Compute Sentiment Scores
# Using cardiffnlp/twitter-roberta-base-sentiment-latest as in recession_pop_analysis_classify.ipynb

def get_device() -> torch.device:
    """Choose the torch device for inference: CUDA first, then MPS, then CPU."""
    if torch.cuda.is_available():
        name = "cuda"
    elif torch.backends.mps.is_available():
        name = "mps"
    else:
        name = "cpu"
    return torch.device(name)

def load_sentiment_model(model_name: str, device: torch.device):
    """Load the tokenizer and sequence-classification model for ``model_name``.

    The model is switched to eval mode and moved to ``device`` before being
    returned.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    print(f"Loading sentiment model: {model_name}")
    tok = AutoTokenizer.from_pretrained(model_name)
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name)
    # eval() and to() both act on the module in place and return it
    classifier.eval().to(device)
    return tok, classifier

def continuous_valence_score(probs: torch.Tensor) -> torch.Tensor:
    """Return column 2 minus column 0 of ``probs`` for each row.

    The caller is expected to pass class probabilities ordered as
    [Negative, Neutral, Positive], making the result P(Positive) - P(Negative).
    """
    negative, positive = probs[:, 0], probs[:, 2]
    return positive - negative

def compute_sentiment_batch(texts: list[str], model_name: str, batch_size: int = 32, device: torch.device = None) -> np.ndarray:
    """Score every text with the sentiment model and return one valence per text.

    Args:
        texts: Strings to score.
        model_name: Model identifier passed to ``load_sentiment_model``.
        batch_size: Number of texts tokenized and scored per forward pass.
        device: Device to run on; when None, ``get_device()`` chooses one.

    Returns:
        Array of valence scores in the same order as ``texts``.
    """
    target = get_device() if device is None else device
    print(f"Using device: {target}")
    tokenizer, model = load_sentiment_model(model_name, target)

    scores = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Sentiment batches"):
        chunk = texts[start:start + batch_size]
        # Inputs longer than 512 tokens are truncated; shorter ones are padded within the batch
        encoded = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512).to(target)
        with torch.no_grad():
            logits = model(**encoded).logits
            class_probs = torch.nn.functional.softmax(logits, dim=-1)
            batch_valence = continuous_valence_score(class_probs)
            scores.extend(batch_valence.cpu().numpy())
    return np.array(scores)

MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Score every cleaned lyric with the sentiment model (slow on large inputs).
print("Computing sentiment scores...")
sentiment_scores = compute_sentiment_batch(
    df_lyrics['lyrics_clean'].tolist(),
    model_name=MODEL_NAME,
    batch_size=32,
)
df_lyrics['sentiment'] = sentiment_scores

# Collapse song-level scores to one mean value per year
yearly_sentiment = df_lyrics.groupby('year', as_index=False)['sentiment'].mean()
print("Yearly sentiment aggregated.")
yearly_sentiment.head()

In [None]:
# 4. Linear Regression Analysis

ALPHA = 0.05  # significance threshold applied to every test in this cell


def _report_trend(title, slope, p_value, r_value):
    """Print slope, p-value, R-squared and a significance verdict for one regression."""
    print(f"\n--- {title} ---")
    print(f"Slope: {slope:.5f}")
    print(f"P-value: {p_value:.5e}")
    print(f"R-squared: {r_value**2:.5f}")
    if p_value < ALPHA:
        print("Result: Significant trend.")
    else:
        print("Result: No significant trend.")


# Merge datasets (inner join: only years present in BOTH tables are analysed)
df_analysis = pd.merge(yearly_sentiment, df_econ, on='year', how='inner')
print(f"Analysis dataset shape: {df_analysis.shape}")

# An inner join can silently return few or no rows (e.g. mismatched 'year' values);
# stop here with a clear message instead of an obscure failure inside linregress.
if len(df_analysis) < 3:
    raise ValueError(
        f"Only {len(df_analysis)} overlapping year(s) between sentiment and economic data; "
        "at least 3 are needed for a meaningful regression."
    )

# --- Regression 1: Sentiment ~ Year ---
slope_s, intercept_s, r_value_s, p_value_s, std_err_s = stats.linregress(df_analysis['year'], df_analysis['sentiment'])
_report_trend("Regression 1: Sentiment ~ Year", slope_s, p_value_s, r_value_s)

# --- Regression 2: Economic Anxiety ~ Year ---
slope_e, intercept_e, r_value_e, p_value_e, std_err_e = stats.linregress(df_analysis['year'], df_analysis['mean_freq_economic'])
_report_trend("Regression 2: Economic Anxiety ~ Year", slope_e, p_value_e, r_value_e)

# --- Visualization ---
fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# Plot 1: Sentiment
sns.scatterplot(data=df_analysis, x='year', y='sentiment', ax=axes[0], color='purple', s=100)
sns.regplot(data=df_analysis, x='year', y='sentiment', ax=axes[0], scatter=False, color='red', line_kws={'label': f'Slope={slope_s:.4f}, p={p_value_s:.3f}'})
axes[0].set_title("Sentiment in Lyrics Over Time")
axes[0].legend()

# Plot 2: Economic Anxiety
sns.scatterplot(data=df_analysis, x='year', y='mean_freq_economic', ax=axes[1], color='green', s=100)
sns.regplot(data=df_analysis, x='year', y='mean_freq_economic', ax=axes[1], scatter=False, color='orange', line_kws={'label': f'Slope={slope_e:.4f}, p={p_value_e:.3f}'})
axes[1].set_title("Economic Anxiety (Frequency) Over Time")
axes[1].legend()

plt.tight_layout()
plt.show()

# --- Conclusion ---
# Each hypothesis is directional: it needs the expected slope sign AND significance.
# The two regressions are fitted separately; the "Overall" line only reports whether
# both trends hold, not a tested relationship between the two series.
h1_supported = slope_s < 0 and p_value_s < ALPHA
h2_supported = slope_e > 0 and p_value_e < ALPHA

print("\n--- Conclusion ---")
if h1_supported:
    print("Hypothesis 1 Supported: Sentiment has significantly decreased.")
else:
    print("Hypothesis 1 Not Supported: Sentiment has not significantly decreased.")

if h2_supported:
    print("Hypothesis 2 Supported: Economic anxiety has significantly increased.")
else:
    print("Hypothesis 2 Not Supported: Economic anxiety has not significantly increased.")

if h1_supported and h2_supported:
    print("Overall: The data supports the hypothesis that as sentiment decreases, economic anxiety increases.")
else:
    print("Overall: The data does not fully support the combined hypothesis.")

# Linear Regression Analysis: Sentiment and Economic Anxiety in Lyrics

This notebook performs a linear regression analysis to investigate three questions:
1.  Has sentiment in lyrics decreased significantly over the years?
2.  Has economic anxiety in songs gone up over the years?
3.  Is there an interaction effect between the two?

We will reconstruct the dataset by loading the lyrics, cleaning them, and extracting the necessary features (Sentiment and Economic Anxiety) to ensure we have aligned data for the interaction model.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import os
import re
import ast
import statsmodels.api as sm
import statsmodels.formula.api as smf
from transformers import pipeline
from tqdm import tqdm

# Configure Plotting
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# 1. Load Data
print("Loading dataset...")
path = kagglehub.dataset_download("rhaamrozenberg/billboards-top-100-song-1946-to-2022-lyrics")
csv_path = os.path.join(path, "Billboard_Hot_100_with_features.csv")

# Fall back to whatever CSV the download contains when the expected file is absent.
if not os.path.exists(csv_path):
    files = os.listdir(path)
    # Sorted so the fallback choice does not depend on directory listing order
    csv_files = sorted(f for f in files if f.endswith('.csv'))
    if not csv_files:
        raise FileNotFoundError(f"No CSV file found in downloaded dataset directory: {path}")
    csv_path = os.path.join(path, csv_files[0])

df = pd.read_csv(csv_path)

# Map the dataset's column headers onto the short names used in the rest of the notebook
rename_map = {
    'Hot100 Ranking Year': 'year',
    'Hot100 Rank': 'rank',
    'Lyrics': 'lyrics',
    'Song': 'title',
    'Artist Names': 'artist'
}
df = df.rename(columns=rename_map)

# Filter Year (2000-2023); unparseable years become NaN and fail both comparisons
df['year'] = pd.to_numeric(df['year'], errors='coerce')
# .copy() so the columns added in later cells are not written to a slice of the raw frame
df = df[(df['year'] >= 2000) & (df['year'] <= 2023)].copy()
# Clean Lyrics
def clean_lyrics(text):
    """Normalize one raw lyrics value into lowercase plain text.

    Args:
        text: Raw lyrics cell. May be a plain string, the string form of a
            Python list of tokens (e.g. "['la', 'la']"), or a non-string
            value such as NaN.

    Returns:
        The cleaned string: lowercased, bracketed spans such as "[chorus]"
        removed, every character other than a-z, 0-9, whitespace and
        . , ' ? ! removed, and runs of whitespace collapsed to one space.
        Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""

    stripped = text.strip()
    if stripped.startswith('[') and stripped.endswith(']'):
        try:
            words = ast.literal_eval(stripped)
        # Only the errors literal_eval raises on malformed input; a bare except
        # would also swallow KeyboardInterrupt and hide genuine bugs.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            words = None
        if isinstance(words, list):
            # Tokens may be non-strings (e.g. numbers); str() keeps join() from raising.
            text = ' '.join(map(str, words))

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)  # drop section markers like [chorus]
    text = re.sub(r'[^a-z0-9\s\.,\'\?!]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['lyrics_clean'] = df['lyrics'].apply(clean_lyrics)
# Keep only songs whose cleaned lyrics exceed 50 characters. .copy() makes the
# result an independent frame, so feature columns added later are not written to a slice.
df = df[df['lyrics_clean'].str.len() > 50].copy()
print(f"Data loaded: {len(df)} songs.")

Loading dataset...
Data loaded: 2134 songs.


In [3]:
# 2. Extract Features

# Economic Anxiety (Lexicon-based)
# NOTE(review): several entries ("living", "interest", "credit", "cost", "aid") are
# common in non-economic senses in lyrics — confirm the lexicon measures what is intended.
economic_lexicon = set([
    "economy", "economic", "recession", "unemployment", "inflation", "poverty",
    "debt", "crisis", "financial", "market", "jobless", "layoff", "bankruptcy",
    "foreclosure", "tax", "wage", "salary", "cost", "living", "budget", "spending",
    "savings", "investment", "interest", "loan", "mortgage", "credit", "depression",
    "stimulus", "aid", "funding", "deficit", "surplus", "trade", "tariff", "import", "export"
])

# Punctuation that clean_lyrics() leaves in the text and that can cling to a token
_TOKEN_EDGE_PUNCT = ".,'?!"

def get_economic_freq(text):
    """Return the share of whitespace-separated tokens in ``text`` found in the lexicon.

    Leading and trailing punctuation is stripped from each token before the
    lookup, so "debt," and "recession." count as lexicon hits. The denominator
    is the total number of tokens.

    Args:
        text: Cleaned lyrics string.

    Returns:
        A float in [0, 1]; 0.0 when ``text`` has no tokens.
    """
    words = text.split()
    if not words:
        return 0.0
    count = sum(1 for w in words if w.strip(_TOKEN_EDGE_PUNCT) in economic_lexicon)
    return count / len(words)

# Score each song with get_economic_freq() applied to its cleaned lyrics.
df['freq_economic'] = df['lyrics_clean'].apply(get_economic_freq)

# Sentiment (RoBERTa)
# This first pipeline is built without top_k. In the code visible here it is
# referenced only by get_sentiment_score().
# NOTE(review): a second pipeline over the same checkpoint is created further down
# with top_k=None, so the model appears to be loaded twice — confirm both are needed.
print("Initializing Sentiment Model...")
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest",
    max_length=512,
    truncation=True,
    device=-1 # Use CPU to be safe, or 0 if GPU available
)

def get_sentiment_score(text):
    """Run ``sentiment_pipeline`` on the first 2000 characters of ``text``.

    Returns the pipeline's raw output unchanged; no continuous valence is
    derived here. (Per the original author's note the output is expected to
    look like ``[{'label': 'positive', 'score': 0.9}]``.)
    """
    # Cap the input length to avoid excessive tokenization time
    clipped = text[:2000]
    return sentiment_pipeline(clipped)

# A continuous valence needs the score of every label, hence top_k=None
sentiment_pipeline_full = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest",
    max_length=512,
    truncation=True,
    top_k=None,
    device=-1
)

print("Calculating sentiment (this may take a while)...")
sentiments = []
batch_size = 32
texts = df['lyrics_clean'].tolist()

for start in tqdm(range(0, len(texts), batch_size)):
    batch_results = sentiment_pipeline_full(texts[start:start + batch_size])
    for label_scores in batch_results:
        # One entry per label; valence = score('positive') - score('negative'),
        # with a missing label counted as 0
        by_label = {entry['label']: entry['score'] for entry in label_scores}
        sentiments.append(by_label.get('positive', 0) - by_label.get('negative', 0))

df['sentiment'] = sentiments
print("Feature extraction complete.")

Initializing Sentiment Model...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing Robe

Calculating sentiment (this may take a while)...


 24%|██▍       | 16/67 [02:12<07:02,  8.28s/it]


KeyboardInterrupt: 

In [None]:
# 3. Linear Regression Analysis

# Fail early with a clear message if feature extraction did not finish
# (e.g. the sentiment loop was interrupted before df['sentiment'] was assigned).
missing_cols = {'year', 'sentiment', 'freq_economic'} - set(df.columns)
if missing_cols:
    raise KeyError(f"df is missing required column(s) {sorted(missing_cols)}; re-run feature extraction first.")

# Model 1: Sentiment ~ Year
print("\n--- Model 1: Sentiment ~ Year ---")
model_sentiment = smf.ols("sentiment ~ year", data=df).fit()
print(model_sentiment.summary())

# Model 2: Economic Anxiety ~ Year
print("\n--- Model 2: Economic Anxiety ~ Year ---")
model_econ = smf.ols("freq_economic ~ year", data=df).fit()
print(model_econ.summary())

# Model 3: Interaction Effect
# Does the relationship between Year and Sentiment depend on Economic Anxiety?
# The formula expands to year + freq_economic + year:freq_economic.
print("\n--- Model 3: Interaction (Sentiment ~ Year * Economic Anxiety) ---")
model_interaction = smf.ols("sentiment ~ year * freq_economic", data=df).fit()
print(model_interaction.summary())

# Visualization
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Plot 1: Sentiment over Time
sns.regplot(x='year', y='sentiment', data=df, ax=axes[0], scatter_kws={'alpha':0.1}, line_kws={'color':'red'})
axes[0].set_title("Sentiment over Years")

# Plot 2: Economic Anxiety over Time
sns.regplot(x='year', y='freq_economic', data=df, ax=axes[1], scatter_kws={'alpha':0.1}, line_kws={'color':'red'})
axes[1].set_title("Economic Anxiety over Years")

# Plot 3: Interaction Visualization (median split of Economic Anxiety)
# pd.qcut(q=2, duplicates='drop') collapses to a single bin when the median equals
# the minimum (many songs with frequency 0) and then rejects the two labels, so
# compare against the median directly. Values at or below the median are 'Low'.
econ_median = df['freq_economic'].median()
df['econ_anxiety_level'] = pd.Categorical(
    np.where(df['freq_economic'] > econ_median, 'High', 'Low'),
    categories=['Low', 'High'],
    ordered=True,
)

# sns.lmplot is figure-level and would open its own figure, leaving axes[2] empty;
# draw one axes-level regplot per group on the third panel instead.
level_colors = {'Low': 'tab:blue', 'High': 'tab:orange'}
for level, color in level_colors.items():
    subset = df[df['econ_anxiety_level'] == level]
    if subset['year'].nunique() < 2:
        continue  # a fitted line needs at least two distinct years
    sns.regplot(x='year', y='sentiment', data=subset, ax=axes[2], color=color,
                scatter_kws={'alpha':0.1}, line_kws={'label': level})
axes[2].set_title("Interaction: Sentiment ~ Year by Economic Anxiety Level")
handles, _ = axes[2].get_legend_handles_labels()
if handles:
    axes[2].legend(title='econ_anxiety_level')

plt.tight_layout()
plt.show()