In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from xgboost import XGBClassifier



# Load the SONG-LEVEL data
# NOTE(review): both paths are absolute and machine-specific; the notebook only
# runs where this directory layout exists.
df_songs = pd.read_csv(r'C:\Users\joshu\OneDrive\Desktop\CS74\Final_Project\Data\songs_with_topic_probabilities.csv')
VISUALS_DIR = r'C:\Users\joshu\OneDrive\Desktop\CS74\Final_Project\visuals'

print(f"\n✓ Loaded: {len(df_songs):,} individual songs")

# Check date column: list every column whose name mentions "date" or "week"
date_cols = [col for col in df_songs.columns if 'date' in col.lower() or 'week' in col.lower()]
print(f"\n📅 Date columns: {date_cols}")

# Show sample
print("\n📊 Sample songs:")
print(df_songs[['Song', 'Performer', 'Week Position', 'WeekID']].head(10))

# Audio features used as model inputs by the later cells
feature_cols = ['danceability', 'energy', 'valence', 'tempo',
                'acousticness', 'instrumentalness', 'speechiness', 'loudness']

# Check for missing data: per-column count and share of non-null values
print("\n🔍 Data completeness:")
n_songs = len(df_songs)
for col in feature_cols + ['Song', 'Performer']:
    missing = int(df_songs[col].isna().sum())
    # Guard the percentage so an empty file reports 0% instead of dividing by zero.
    pct = (missing / n_songs) * 100 if n_songs else 100.0
    print(f"   {col:20s}: {n_songs - missing:6,} valid ({100-pct:.1f}%)")

In [None]:
print("="*70)
print("MERGING WITH RECESSION DATA")
print("="*70)

# Load your monthly recession data
df_monthly = pd.read_csv(r'C:\Users\joshu\OneDrive\Desktop\CS74\Final_Project\music_econ_topics_merged.csv')

# Parse dates so both frames share a datetime join key
df_songs['week_date'] = pd.to_datetime(df_songs['WeekID'])
df_monthly['week_date'] = pd.to_datetime(df_monthly['week_date'])

# Keep exactly one recession flag per date. Without this, any date that
# appears more than once in the recession table would duplicate every song
# row charting on that date during the merge.
recession_by_date = (
    df_monthly[['week_date', 'USREC']]
    .dropna(subset=['week_date'])
    .drop_duplicates(subset='week_date')
)

# Merge to get recession status for each song.
# validate= makes pandas raise if the right side is ever not unique per date.
df_songs_merged = df_songs.merge(
    recession_by_date,
    on='week_date',
    how='left',
    validate='many_to_one'
)

n_unlabeled = int(df_songs_merged['USREC'].isna().sum())

print(f"\n✓ Merged successfully")
print(f"   Total songs: {len(df_songs_merged):,}")
print(f"   Songs during recessions: {(df_songs_merged['USREC']==1).sum():,}")
print(f"   Songs during normal times: {(df_songs_merged['USREC']==0).sum():,}")
# Rows whose date has no match in the recession table get USREC = NaN and are
# removed by the dropna() below; report them so the loss is visible.
print(f"   Songs with no recession label: {n_unlabeled:,}")

# Clean data: keep only the modelling columns and drop incomplete rows.
# .copy() makes this an independent frame so later cells can add columns to it
# without pandas raising SettingWithCopyWarning.
df_songs_clean = (
    df_songs_merged[feature_cols + ['Song', 'Performer', 'week_date', 'USREC', 'Week Position']]
    .dropna()
    .copy()
)

print(f"\n✓ Clean dataset: {len(df_songs_clean):,} songs with complete data")

In [None]:
print("="*70)
print("TRAINING MODEL & SCORING SONGS")
print("="*70)

# Prepare training data
X = df_songs_clean[feature_cols]
# USREC becomes float after the left merge introduces NaN; cast back to
# integer class labels (0 = normal, 1 = recession) for the classifier.
y = df_songs_clean['USREC'].astype(int)

# Class balance: weight the positive (recession) class by the negative/positive
# ratio. Fail early with a clear message if either class is absent, instead of
# dividing by zero or training on a single class.
n_normal = int((y == 0).sum())
n_recession = int((y == 1).sum())
if n_normal == 0 or n_recession == 0:
    raise ValueError(
        "Need both recession and normal songs to train "
        f"(normal={n_normal}, recession={n_recession})"
    )
scale_pos_weight = n_normal / n_recession

# Train XGBoost
xgb_model = XGBClassifier(
    n_estimators=150,
    max_depth=3,
    learning_rate=0.05,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss'
)

xgb_model.fit(X, y)

# Score every song with the predicted probability of the recession class.
# NOTE: these are in-sample scores (the model is scored on the same rows it was
# fit on), so they are optimistic compared with held-out predictions.
# assign() rebinds the name to a new frame rather than writing into a frame
# that may be a slice of the merged data.
df_songs_clean = df_songs_clean.assign(
    recession_score=xgb_model.predict_proba(X)[:, 1]
)

print("✓ All songs scored!")

# Summary
print(f"\n📊 Recession Score Distribution:")
print(f"   Mean:   {df_songs_clean['recession_score'].mean():.3f}")
print(f"   Median: {df_songs_clean['recession_score'].median():.3f}")
print(f"   Min:    {df_songs_clean['recession_score'].min():.3f}")
print(f"   Max:    {df_songs_clean['recession_score'].max():.3f}")

# Distribution by actual recession status
print(f"\n📈 Average Score by Actual Status:")
print(f"   Recession songs:  {df_songs_clean[df_songs_clean['USREC']==1]['recession_score'].mean():.3f}")
print(f"   Normal songs:     {df_songs_clean[df_songs_clean['USREC']==0]['recession_score'].mean():.3f}")

In [None]:
print("="*70)
print("🔴 TOP 20 MOST RECESSION-LIKE SONGS")
print("="*70)
print("(The Billboard Hot 100 songs that SOUND most like a recession)")

# Highest scores first. A stable sort keeps the original row order among ties
# (the same order nlargest would give). Each (Song, Performer) pair is listed
# once, so a track that occupies several rows cannot fill the list by itself.
# NOTE(review): presumably the frame can hold several chart weeks per song —
# if it is already one row per song the de-duplication is a no-op.
most_recession = (
    df_songs_clean
    .sort_values('recession_score', ascending=False, kind='stable')
    .drop_duplicates(subset=['Song', 'Performer'])
    .head(20)
)

for idx, (_, row) in enumerate(most_recession.iterrows(), 1):
    song = row['Song']
    artist = row['Performer']
    score = row['recession_score']
    date = pd.to_datetime(row['week_date']).strftime('%B %Y')
    # NOTE(review): the value shown as "Peak" is the 'Week Position' column —
    # confirm it really is the song's peak position.
    position = int(row['Week Position']) if not pd.isna(row['Week Position']) else 'N/A'
    actual = "✓ RECESSION" if row['USREC'] == 1 else "✗ Normal"

    # Arrows flag feature values beyond the hand-picked thresholds below.
    acoustic_flag = '⬆️' if row['acousticness'] > 0.3 else ''
    dance_flag = '⬇️' if row['danceability'] < 0.5 else ''
    energy_flag = '⬇️' if row['energy'] < 0.5 else ''
    if row['valence'] > 0.6:
        valence_flag = '⬆️'
    elif row['valence'] < 0.4:
        valence_flag = '⬇️'
    else:
        valence_flag = ''

    print(f"\n{idx}. \"{song}\" by {artist}")
    print(f"   Recession Score: {score:.1%}")
    print(f"   Chart Date: {date} (Peak: #{position})")
    print(f"   Actual Period: {actual}")
    print(f"   Profile:")
    print(f"      Acousticness:  {row['acousticness']:.3f}  {acoustic_flag}")
    print(f"      Danceability:  {row['danceability']:.3f}  {dance_flag}")
    print(f"      Energy:        {row['energy']:.3f}  {energy_flag}")
    print(f"      Valence:       {row['valence']:.3f}  {valence_flag}")
    print(f"      Loudness:      {row['loudness']:.1f} dB")

In [None]:
print("="*70)
print("🔵 TOP 20 LEAST RECESSION-LIKE SONGS")
print("="*70)
print("(The Billboard Hot 100 songs that sound like pure celebration)")

# Lowest scores first. A stable sort keeps the original row order among ties
# (the same order nsmallest would give). Each (Song, Performer) pair is listed
# once, so a track that occupies several rows cannot fill the list by itself.
# NOTE(review): presumably the frame can hold several chart weeks per song —
# if it is already one row per song the de-duplication is a no-op.
least_recession = (
    df_songs_clean
    .sort_values('recession_score', ascending=True, kind='stable')
    .drop_duplicates(subset=['Song', 'Performer'])
    .head(20)
)

for idx, (_, row) in enumerate(least_recession.iterrows(), 1):
    song = row['Song']
    artist = row['Performer']
    score = row['recession_score']
    date = pd.to_datetime(row['week_date']).strftime('%B %Y')
    # NOTE(review): the value shown as "Peak" is the 'Week Position' column —
    # confirm it really is the song's peak position.
    position = int(row['Week Position']) if not pd.isna(row['Week Position']) else 'N/A'
    actual = "✓ RECESSION" if row['USREC'] == 1 else "✗ Normal"

    # Arrows flag feature values beyond the hand-picked thresholds below.
    dance_flag = '⬆️' if row['danceability'] > 0.7 else ''
    energy_flag = '⬆️' if row['energy'] > 0.7 else ''
    valence_flag = '⬆️' if row['valence'] > 0.6 else ''
    acoustic_flag = '⬇️' if row['acousticness'] < 0.1 else ''
    loudness_flag = '⬆️' if row['loudness'] > -5 else ''

    print(f"\n{idx}. \"{song}\" by {artist}")
    print(f"   Recession Score: {score:.1%}")
    print(f"   Chart Date: {date} (Peak: #{position})")
    print(f"   Actual Period: {actual}")
    print(f"   Profile:")
    print(f"      Danceability:  {row['danceability']:.3f}  {dance_flag}")
    print(f"      Energy:        {row['energy']:.3f}  {energy_flag}")
    print(f"      Valence:       {row['valence']:.3f}  {valence_flag}")
    print(f"      Acousticness:  {row['acousticness']:.3f}  {acoustic_flag}")
    print(f"      Loudness:      {row['loudness']:.1f} dB  {loudness_flag}")

In [None]:
# Score cut-offs for calling a song clearly "normal-sounding" or clearly
# "recession-sounding" in this cell.
LOW_SCORE_CUTOFF = 0.3
HIGH_SCORE_CUTOFF = 0.7


def _print_song_cards(songs, verdict, period, reason, features):
    """Print one numbered summary per row of ``songs``.

    Parameters
    ----------
    songs : DataFrame with 'Song', 'Performer', 'recession_score',
        'week_date' and the columns named in ``features``.
    verdict : text shown after "sounds" on the score line.
    period : text shown after "ACTUAL" on the date line.
    reason : text shown after "Why it sounds".
    features : column names to list; each is printed capitalized with its value.
    """
    for idx, (_, row) in enumerate(songs.iterrows(), 1):
        song = row['Song']
        artist = row['Performer']
        score = row['recession_score']
        date = pd.to_datetime(row['week_date']).strftime('%B %Y')

        print(f"\n{idx}. \"{song}\" by {artist}")
        print(f"   Recession Score: {score:.1%} (sounds {verdict})")
        print(f"   Date: {date} (ACTUAL {period})")
        print(f"   Why it sounds {reason}:")
        for feature in features:
            print(f"      {feature.capitalize()}: {row[feature]:.3f}")


# The positive class is "recession" (USREC == 1). A recession-period song with
# a LOW score is therefore a false NEGATIVE, and a normal-period song with a
# HIGH score is a false POSITIVE; the headers below use that convention.
print("="*70)
print("🟡 FALSE NEGATIVES: Party Songs During Recessions")
print("="*70)
print("(Songs that sound happy despite being during recessions)")

# Low recession score during an actual recession; each song listed once.
upbeat_in_recession = (
    df_songs_clean[
        (df_songs_clean['USREC'] == 1) &
        (df_songs_clean['recession_score'] < LOW_SCORE_CUTOFF)
    ]
    .sort_values('recession_score', ascending=True, kind='stable')
    .drop_duplicates(subset=['Song', 'Performer'])
    .head(10)
)
# Legacy name kept bound to the same selection as before (recession-period
# songs with low scores) so any code referring to it keeps working.
false_positives = upbeat_in_recession

if len(upbeat_in_recession) > 0:
    _print_song_cards(
        upbeat_in_recession,
        verdict="NORMAL",
        period="RECESSION",
        reason="upbeat",
        features=['danceability', 'energy', 'valence'],
    )
else:
    print("   No party songs found during recessions!")

print("\n" + "="*70)
print("🟢 FALSE POSITIVES: Sad Songs During Good Times")
print("="*70)
print("(Songs that sound depressing despite good economy)")

# High recession score during normal times; each song listed once.
gloomy_in_normal = (
    df_songs_clean[
        (df_songs_clean['USREC'] == 0) &
        (df_songs_clean['recession_score'] > HIGH_SCORE_CUTOFF)
    ]
    .sort_values('recession_score', ascending=False, kind='stable')
    .drop_duplicates(subset=['Song', 'Performer'])
    .head(10)
)
# Legacy name kept bound to the same selection as before (normal-period songs
# with high scores) so any code referring to it keeps working.
false_negatives = gloomy_in_normal

if len(gloomy_in_normal) > 0:
    _print_song_cards(
        gloomy_in_normal,
        verdict="like RECESSION",
        period="NORMAL",
        reason="recession-like",
        features=['acousticness', 'energy', 'valence'],
    )
else:
    print("   No depressing songs found during good times!")

In [None]:
print("="*70)
print("CREATING VISUALIZATIONS")
print("="*70)

# Four-panel summary figure. Relies on df_songs_clean (with 'recession_score'),
# most_recession and least_recession from the earlier cells.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Split once by actual period; reused by the histogram and the scatter plot.
recession_songs = df_songs_clean[df_songs_clean['USREC']==1]
normal_songs = df_songs_clean[df_songs_clean['USREC']==0]

# Plot 1: Distribution of recession scores
ax1 = axes[0, 0]
# Scores are probabilities, so use one fixed set of 50 bins over [0, 1] for
# both groups; independently computed bins would give the two histograms
# different bar widths and edges, making the overlay hard to compare.
score_bins = np.linspace(0.0, 1.0, 51)
ax1.hist(normal_songs['recession_score'],
         bins=score_bins, alpha=0.6, label='Normal Period Songs', color='blue', density=True)
ax1.hist(recession_songs['recession_score'],
         bins=score_bins, alpha=0.6, label='Recession Period Songs', color='red', density=True)
ax1.axvline(x=0.5, color='black', linestyle='--', linewidth=2, label='Threshold')
ax1.set_xlabel('Recession Score', fontweight='bold', fontsize=12)
ax1.set_ylabel('Density', fontweight='bold', fontsize=12)
ax1.set_title('Song Recession Score Distribution', fontweight='bold', fontsize=13)
ax1.legend(fontsize=10)
ax1.grid(True, alpha=0.3)

# Plot 2: Feature comparison - Top recession vs party songs
ax2 = axes[0, 1]
features_plot = ['danceability', 'energy', 'valence', 'acousticness']
# Mean feature values of the 10 highest- and 10 lowest-scoring songs
recession_profile = most_recession.head(10)[features_plot].mean()
party_profile = least_recession.head(10)[features_plot].mean()

x = np.arange(len(features_plot))
width = 0.35

# Side-by-side bars: shift each group half a bar width off the tick position
bars1 = ax2.bar(x - width/2, recession_profile, width, label='Top 10 Recession Songs', color='#e74c3c')
bars2 = ax2.bar(x + width/2, party_profile, width, label='Top 10 Party Songs', color='#3498db')

ax2.set_ylabel('Feature Value', fontweight='bold', fontsize=11)
ax2.set_title('Musical Profile: Recession vs Party Songs', fontweight='bold', fontsize=13)
ax2.set_xticks(x)
ax2.set_xticklabels([f.capitalize() for f in features_plot], rotation=20, ha='right')
ax2.legend(fontsize=10)
ax2.grid(True, alpha=0.3, axis='y')

# Plot 3: Scatter - Danceability vs Acousticness
ax3 = axes[1, 0]

# Plot all songs
ax3.scatter(normal_songs['danceability'], normal_songs['acousticness'],
           alpha=0.3, s=10, c='blue', label='Normal Period Songs')
ax3.scatter(recession_songs['danceability'], recession_songs['acousticness'],
           alpha=0.5, s=15, c='red', label='Recession Period Songs')

# Highlight extremes (zorder keeps the markers above the point cloud)
top_recession_songs = most_recession.head(5)
top_party_songs = least_recession.head(5)

ax3.scatter(top_recession_songs['danceability'], top_recession_songs['acousticness'],
           s=200, c='darkred', marker='*', edgecolors='black', linewidth=2,
           label='Top 5 Recession Songs', zorder=10)
ax3.scatter(top_party_songs['danceability'], top_party_songs['acousticness'],
           s=200, c='darkblue', marker='D', edgecolors='black', linewidth=2,
           label='Top 5 Party Songs', zorder=10)

ax3.set_xlabel('Danceability', fontweight='bold', fontsize=12)
ax3.set_ylabel('Acousticness', fontweight='bold', fontsize=12)
ax3.set_title('Song Feature Space\n(Danceability vs Acousticness)', fontweight='bold', fontsize=13)
ax3.legend(fontsize=9, loc='best')
ax3.grid(True, alpha=0.3)

# Plot 4: Timeline - When did extreme songs appear?
ax4 = axes[1, 1]

# Every song, colored by its actual USREC flag through the RdBu_r colormap
dates = pd.to_datetime(df_songs_clean['week_date'])
ax4.scatter(dates, df_songs_clean['recession_score'],
           c=df_songs_clean['USREC'], cmap='RdBu_r', alpha=0.3, s=5)

# Mark extreme songs
top_rec_dates = pd.to_datetime(most_recession.head(10)['week_date'])
top_party_dates = pd.to_datetime(least_recession.head(10)['week_date'])

ax4.scatter(top_rec_dates, most_recession.head(10)['recession_score'],
           s=100, c='red', marker='v', edgecolors='black', linewidth=1.5,
           label='Top 10 Recession Songs', zorder=10)
ax4.scatter(top_party_dates, least_recession.head(10)['recession_score'],
           s=100, c='blue', marker='^', edgecolors='black', linewidth=1.5,
           label='Top 10 Party Songs', zorder=10)

ax4.axhline(y=0.5, color='black', linestyle='--', linewidth=2, alpha=0.5)
ax4.set_xlabel('Year', fontweight='bold', fontsize=12)
ax4.set_ylabel('Recession Score', fontweight='bold', fontsize=12)
ax4.set_title('Song Recession Scores Over Time', fontweight='bold', fontsize=13)
ax4.legend(fontsize=9, loc='best')
ax4.grid(True, alpha=0.3)

plt.suptitle('Individual Song Analysis: Recession Anthems vs Party Songs',
             fontsize=16, fontweight='bold')
plt.tight_layout()

# Create the output folder if needed so the save cannot fail after all the
# plotting work, and build the path with os.path.join rather than by
# concatenating separators by hand.
os.makedirs(VISUALS_DIR, exist_ok=True)
output_path = os.path.join(VISUALS_DIR, 'individual_songs_analysis.png')
fig.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()
print("\n✓ Saved: individual_songs_analysis.png")