# RUN THURSDAY.ipynb
# ====================================================================
# üìä GUT SCORE INTEGRATION PIPELINE
# Run this on Thursday to update training data with your latest ratings
# ====================================================================

In [2]:
import os
import glob
from datetime import datetime

import pandas as pd

# ====================================================================
# üéØ THURSDAY: CLEAN OUTPUT VERSION
# Only shows NEW albums being processed
# ====================================================================

print("üóìÔ∏è RUN THURSDAY: Clean Output Version")
print("=" * 60)
print("üìÖ " + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print("=" * 60)

# --- STEP 1: LOAD GUT SCORES FROM MASTER FILE ---
print("\nüìä Loading gut scores from MASTER file...")

master_file = 'feedback/master_gut_scores.csv'

if not os.path.exists(master_file):
    print(f"‚ùå Master file not found: {master_file}")
    exit()

master_df = pd.read_csv(master_file)
master_df = master_df[master_df['gut_score'].notna()]

if len(master_df) == 0:
    print("‚ùå No gut scores found!")
    exit()

total_albums = len(master_df)
print(f"‚úÖ Found {total_albums} total rated albums in master file")

# --- STEP 2: LOAD TRAINING DATA ---
print("\nüìÅ Loading training data...")
training_file = 'data/2026_training_complete_with_features.csv'

if not os.path.exists(training_file):
    print(f"‚ùå Training file not found: {training_file}")
    exit()

df_training = pd.read_csv(training_file)
print(f"  Current: {len(df_training):,} tracks")

# Count existing gut-scored tracks
if 'source_type' in df_training.columns:
    existing_gut = df_training[df_training['source_type'] == 'gut_score_rated']
    existing_count = len(existing_gut)
    print(f"  Existing gut-scored tracks: {existing_count:,}")
else:
    existing_count = 0

# --- STEP 3: FIND AND ADD ONLY NEW TRACKS ---
print("\nüéµ Processing NEW gut-scored albums...")

# Get archives (newest first)
archive_files = sorted(glob.glob('data/archived_nmf_with_features/*.csv'), reverse=True)

if not archive_files:
    print("‚ùå No archive files found!")
    exit()

tracks_added = 0
new_albums = 0
new_albums_list = []

for _, fb_row in master_df.iterrows():
    artist = fb_row['Artist']
    album = fb_row['Album']
    score = fb_row['gut_score']
    
    # Skip if already in training as gut_score_rated
    already_in_training = df_training[
        (df_training['Album Name'] == album) &
        (df_training['Artist Name(s)'].str.contains(artist, na=False)) &
        (df_training['source_type'] == 'gut_score_rated')
    ]
    
    if len(already_in_training) > 0:
        continue  # SILENTLY skip - don't print anything!
    
    # ---- ONLY REACH HERE FOR NEW ALBUMS ----
    if new_albums == 0:  # First new album
        print("  Processing...")
    
    new_albums += 1
    new_albums_list.append((artist, album, score))
    
    # Search archives
    found_tracks = None
    for archive_file in archive_files:
        try:
            archive_df = pd.read_csv(archive_file)
            album_tracks = archive_df[
                (archive_df['Album Name'] == album) &
                (archive_df['Artist Name(s)'].str.contains(artist, na=False))
            ].copy()
            
            if len(album_tracks) > 0:
                found_tracks = album_tracks
                break
        except:
            continue
    
    if found_tracks is None:
        print(f"  ‚ùå Could not find tracks for: {artist} - {album}")
        continue
    
    # Add gut score
    found_tracks['liked'] = score
    found_tracks['source_type'] = 'gut_score_rated'
    found_tracks['gut_score_date'] = datetime.now().strftime('%Y-%m-%d')
    
    # Add to training
    df_training = pd.concat([df_training, found_tracks], ignore_index=True)
    tracks_added += len(found_tracks)

# --- STEP 4: DUPLICATE HANDLING (SILENT) ---
if tracks_added > 0:
    # Silent duplicate handling
    def clean_artist_name(artist_str):
        if pd.isna(artist_str): return ""
        artist = str(artist_str).strip()
        separators = [' feat. ', ' featuring ', ' ft. ', ' with ', ' & ', ' and ', ';', ',']
        for sep in separators:
            if sep in artist.lower():
                artist = artist.split(sep)[0].strip()
        return artist
    
    df_training['artist_clean'] = df_training['Artist Name(s)'].apply(clean_artist_name)
    df_training = df_training.sort_values('liked', ascending=False)
    df_training = df_training.drop_duplicates(
        subset=['Album Name', 'artist_clean', 'Track Name'],
        keep='first'
    )
    df_training = df_training.drop(columns=['artist_clean'], errors='ignore')

# --- STEP 5: SAVE AND REPORT ---
print("\n" + "=" * 60)
print("üìä WEEKLY REPORT")
print("=" * 60)

if tracks_added > 0:
    # Create backup
    backup_dir = 'data/backups'
    os.makedirs(backup_dir, exist_ok=True)
    backup_file = f"{backup_dir}/training_backup_{datetime.now().strftime('%Y%m%d')}.csv"
    pd.read_csv(training_file).to_csv(backup_file, index=False)
    
    # Save updated training
    df_training.to_csv(training_file, index=False)
    
    print(f"‚úÖ ADDED THIS WEEK:")
    print("-" * 60)
    for artist, album, score in new_albums_list:
        print(f"  ‚Ä¢ {artist[:25]:<25} - {album[:25]:<25} ‚Üí {score}")
    
    print("-" * 60)
    print(f"  Tracks added: {tracks_added:,}")
    print(f"  Albums added: {new_albums}")
    
else:
    print("‚ÑπÔ∏è No new gut scores to add this week")

# Final stats
print(f"\nüìà OVERALL TOTALS:")
print("-" * 60)
print(f"  Total rated albums: {total_albums}")
print(f"  Gut-scored tracks: {len(df_training[df_training['source_type'] == 'gut_score_rated']):,}")
print(f"  Total training tracks: {len(df_training):,}")

print("\n" + "=" * 60)
print("‚úÖ THURSDAY PROCESSING COMPLETE")
print("=" * 60)

üóìÔ∏è RUN THURSDAY: Clean Output Version
üìÖ 2026-01-16 20:52:02

üìä Loading gut scores from MASTER file...
‚úÖ Found 21 total rated albums in master file

üìÅ Loading training data...
  Current: 10,284 tracks
  Existing gut-scored tracks: 165

üéµ Processing NEW gut-scored albums...
  Processing...
  ‚ùå Could not find tracks for: A$AP Rocky - Don't Be Dumb

üìä WEEKLY REPORT
‚úÖ ADDED THIS WEEK:
------------------------------------------------------------
  ‚Ä¢ Julianna Barwick;Mary Lat - Tragic Magic              ‚Üí 61.0
  ‚Ä¢ Tyler Ramsey;Carl Broemel - Celestun                  ‚Üí 54.0
  ‚Ä¢ The Sha La Das            - Your Picture              ‚Üí 53.0
  ‚Ä¢ A$AP Rocky                - Don't Be Dumb             ‚Üí 57.0
  ‚Ä¢ Robbie Williams;Tony Iomm - BRITPOP                   ‚Üí 44.0
  ‚Ä¢ Jana Horn                 - Jana Horn                 ‚Üí 50.0
  ‚Ä¢ Courtney Marie Andrews    - Valentine                 ‚Üí 65.0
-----------------------------------------------

In [3]:
# ====================================================================
# üéØ THURSDAY: ENHANCED WITH RELEASE DATES
# Shows which NMF week each album came from
# ====================================================================

print("üóìÔ∏è RUN THURSDAY: Enhanced with Release Dates")
print("=" * 60)
print("üìÖ " + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print("=" * 60)

# --- STEP 1: LOAD GUT SCORES FROM MASTER FILE ---
print("\nüìä Loading gut scores from MASTER file...")

master_file = 'feedback/master_gut_scores.csv'

if not os.path.exists(master_file):
    print(f"‚ùå Master file not found: {master_file}")
    exit()

master_df = pd.read_csv(master_file)
master_df = master_df[master_df['gut_score'].notna()]

if len(master_df) == 0:
    print("‚ùå No gut scores found!")
    exit()

total_albums = len(master_df)
print(f"‚úÖ Found {total_albums} total rated albums in master file")

# --- STEP 2: LOAD TRAINING DATA ---
print("\nüìÅ Loading training data...")
training_file = 'data/2026_training_complete_with_features.csv'

if not os.path.exists(training_file):
    print(f"‚ùå Training file not found: {training_file}")
    exit()

df_training = pd.read_csv(training_file)
print(f"  Current: {len(df_training):,} tracks")

# Count existing gut-scored tracks
if 'source_type' in df_training.columns:
    existing_gut = df_training[df_training['source_type'] == 'gut_score_rated']
    existing_count = len(existing_gut)
    print(f"  Existing gut-scored tracks: {existing_count:,}")
else:
    existing_count = 0

# --- STEP 3: MAP ARCHIVE FILES TO DATES ---
print("\nüóÇÔ∏è  Scanning archive files...")
archive_files = sorted(glob.glob('data/archived_nmf_with_features/*.csv'), reverse=True)

if not archive_files:
    print("‚ùå No archive files found!")
    exit()

# Create mapping: filename -> human readable date
archive_date_map = {}
for archive_file in archive_files:
    filename = os.path.basename(archive_file)
    # Extract date: "2026-01-11_nmf_complete.csv" -> "2026-01-11"
    date_str = filename.split('_')[0]
    
    try:
        # Convert to readable format: "Jan 11, 2026"
        date_obj = datetime.strptime(date_str, '%Y-%m-%d')
        readable_date = date_obj.strftime('%b %d, %Y')
        archive_date_map[archive_file] = readable_date
    except:
        archive_date_map[archive_file] = date_str

print(f"  Found {len(archive_files)} archive weeks")

# --- STEP 4: PROCESS NEW ALBUMS WITH RELEASE DATES ---
print("\nüéµ Processing NEW gut-scored albums...")

tracks_added = 0
new_albums = 0
new_albums_info = []  # Store (artist, album, score, release_date, track_count)

for _, fb_row in master_df.iterrows():
    artist = fb_row['Artist']
    album = fb_row['Album']
    score = fb_row['gut_score']
    
    # Skip if already in training
    already_in_training = df_training[
        (df_training['Album Name'] == album) &
        (df_training['Artist Name(s)'].str.contains(artist, na=False)) &
        (df_training['source_type'] == 'gut_score_rated')
    ]
    
    if len(already_in_training) > 0:
        continue
    
    # Search archives
    found_tracks = None
    found_archive = None
    release_date = "Unknown"
    
    for archive_file in archive_files:
        try:
            archive_df = pd.read_csv(archive_file)
            album_tracks = archive_df[
                (archive_df['Album Name'] == album) &
                (archive_df['Artist Name(s)'].str.contains(artist, na=False))
            ].copy()
            
            if len(album_tracks) > 0:
                found_tracks = album_tracks
                found_archive = archive_file
                release_date = archive_date_map.get(archive_file, "Unknown")
                break
        except Exception as e:
            continue
    
    if found_tracks is None:
        print(f"  ‚ùå Could not find tracks for: {artist} - {album}")
        continue
    
    new_albums += 1
    new_albums_info.append({
        'artist': artist,
        'album': album, 
        'score': score,
        'release_date': release_date,
        'tracks': len(found_tracks)
    })
    
    # Add gut score
    found_tracks['liked'] = score
    found_tracks['source_type'] = 'gut_score_rated'
    found_tracks['gut_score_date'] = datetime.now().strftime('%Y-%m-%d')
    found_tracks['nmf_release_date'] = release_date
    
    # Add to training
    df_training = pd.concat([df_training, found_tracks], ignore_index=True)
    tracks_added += len(found_tracks)

# --- STEP 5: SILENT DUPLICATE HANDLING ---
if tracks_added > 0:
    def clean_artist_name(artist_str):
        if pd.isna(artist_str): return ""
        artist = str(artist_str).strip()
        separators = [' feat. ', ' featuring ', ' ft. ', ' with ', ' & ', ' and ', ';', ',']
        for sep in separators:
            if sep in artist.lower():
                artist = artist.split(sep)[0].strip()
        return artist
    
    df_training['artist_clean'] = df_training['Artist Name(s)'].apply(clean_artist_name)
    df_training = df_training.sort_values('liked', ascending=False)
    df_training = df_training.drop_duplicates(
        subset=['Album Name', 'artist_clean', 'Track Name'],
        keep='first'
    )
    df_training = df_training.drop(columns=['artist_clean'], errors='ignore')

# --- STEP 6: ENHANCED REPORT WITH RELEASE DATES ---
print("\n" + "=" * 60)
print("üìä WEEKLY REPORT")
print("=" * 60)

if tracks_added > 0:
    # Create backup
    backup_dir = 'data/backups'
    os.makedirs(backup_dir, exist_ok=True)
    backup_file = f"{backup_dir}/training_backup_{datetime.now().strftime('%Y%m%d')}.csv"
    pd.read_csv(training_file).to_csv(backup_file, index=False)
    
    # Save updated training
    df_training.to_csv(training_file, index=False)
    
    print(f"‚úÖ ADDED THIS WEEK ({len(new_albums_info)} albums):")
    print("-" * 60)
    
    # Group by release date
    albums_by_date = {}
    for info in new_albums_info:
        date = info['release_date']
        if date not in albums_by_date:
            albums_by_date[date] = []
        albums_by_date[date].append(info)
    
    # Show by release date (most recent first)
    for date in sorted(albums_by_date.keys(), reverse=True):
        print(f"\nüìÖ {date}:")
        for info in albums_by_date[date]:
            print(f"  ‚Ä¢ {info['artist'][:22]:<22} - {info['album'][:22]:<22}")
            print(f"      Score: {info['score']} | Tracks: {info['tracks']}")
    
    print("-" * 60)
    print(f"  Total tracks added: {tracks_added:,}")
    
else:
    print("‚ÑπÔ∏è No new gut scores to add this week")

# Final stats with breakdown
print(f"\nüìà OVERALL TOTALS:")
print("-" * 60)

# Get all gut-scored tracks
if 'nmf_release_date' in df_training.columns:
    gut_scored = df_training[df_training['source_type'] == 'gut_score_rated']
    
    if len(gut_scored) > 0:
        # Count by release date
        date_counts = gut_scored['nmf_release_date'].value_counts()
        
        print(f"  Gut-scored tracks by release week:")
        for date, count in date_counts.head(5).items():  # Show top 5
            print(f"    {date}: {count:,} tracks")
        
        if len(date_counts) > 5:
            print(f"    ... and {len(date_counts) - 5} more weeks")
    
    print(f"  Total gut-scored tracks: {len(gut_scored):,}")
else:
    gut_scored = df_training[df_training['source_type'] == 'gut_score_rated']
    print(f"  Total gut-scored tracks: {len(gut_scored):,}")

print(f"  Total rated albums: {total_albums}")
print(f"  Total training tracks: {len(df_training):,}")

print("\n" + "=" * 60)
print("‚úÖ THURSDAY PROCESSING COMPLETE")
print("=" * 60)

üóìÔ∏è RUN THURSDAY: Enhanced with Release Dates
üìÖ 2026-01-16 20:52:04

üìä Loading gut scores from MASTER file...
‚úÖ Found 21 total rated albums in master file

üìÅ Loading training data...
  Current: 10,333 tracks
  Existing gut-scored tracks: 214

üóÇÔ∏è  Scanning archive files...
  Found 2 archive weeks

üéµ Processing NEW gut-scored albums...
  ‚ùå Could not find tracks for: A$AP Rocky - Don't Be Dumb

üìä WEEKLY REPORT
‚ÑπÔ∏è No new gut scores to add this week

üìà OVERALL TOTALS:
------------------------------------------------------------
  Total gut-scored tracks: 214
  Total rated albums: 21
  Total training tracks: 10,333

‚úÖ THURSDAY PROCESSING COMPLETE


In [8]:
# ====================================================================
# üï∞Ô∏è CELL 3: HISTORICAL ALBUM PROCESSOR (FIXED FOR SPECIAL CHARACTERS)
# For re-rating old albums and adding to training
# ====================================================================

print("üï∞Ô∏è CELL 3: HISTORICAL ALBUM PROCESSOR (FIXED VERSION)")
print("=" * 60)

import re

def safe_contains(series, text):
    """Safely check if series contains text, escaping regex special chars"""
    if pd.isna(text):
        return pd.Series([False] * len(series))
    
    escaped_text = re.escape(str(text))
    return series.str.contains(escaped_text, na=False, regex=True)

def find_album_tracks_safely(df, artist, album):
    """Safely find album tracks accounting for special characters and variations"""
    results = []
    
    # Try exact match first
    mask = (
        (df['Album Name'] == album) &
        safe_contains(df['Artist Name(s)'], artist)
    )
    exact_matches = df[mask]
    
    if len(exact_matches) > 0:
        return exact_matches
    
    # Try fuzzy matching for album name
    for album_col in ['Album Name', 'Album']:
        if album_col in df.columns:
            for artist_col in ['Artist Name(s)', 'Artist']:
                if artist_col in df.columns:
                    # Check for partial matches
                    album_mask = df[album_col].str.contains(re.escape(album), na=False, regex=True, case=False)
                    artist_mask = safe_contains(df[artist_col], artist)
                    
                    matches = df[album_mask & artist_mask]
                    if len(matches) > 0:
                        return matches
    
    return pd.DataFrame()  # Empty if not found

# Load ALL gut scores (including from historical ratings)
master_file = 'feedback/master_gut_scores.csv'
master_df = pd.read_csv(master_file)
master_df = master_df[master_df['gut_score'].notna()]

print(f"üìä Total gut scores in master: {len(master_df)}")

# Load training to see what's already there
training_file = 'data/2026_training_complete_with_features.csv'
df_training = pd.read_csv(training_file)

# Identify NEW albums not yet in training (including historical)
print("\nüéØ Identifying NEW albums (including historical)...")

new_albums_to_process = []
missing_albums = []

for _, fb_row in master_df.iterrows():
    artist = fb_row['Artist']
    album = fb_row['Album']
    score = fb_row['gut_score']
    
    # Check if already in training as gut_score_rated
    already_in_training = df_training[
        (df_training['Album Name'] == album) &
        safe_contains(df_training['Artist Name(s)'], artist) &
        (df_training['source_type'] == 'gut_score_rated')
    ]
    
    if len(already_in_training) > 0:
        continue  # Already processed
    
    new_albums_to_process.append({
        'artist': artist,
        'album': album,
        'score': score,
        'source_file': fb_row.get('source_file', 'unknown'),
        'raw_artist': artist,  # Keep original for debugging
        'raw_album': album     # Keep original for debugging
    })

print(f"Found {len(new_albums_to_process)} albums needing processing")

# Search for these albums in ALL possible sources
print("\nüîç Searching for album data (with regex escaping)...")

# 1. Recent archives (with full features)
archive_files = sorted(glob.glob('data/archived_nmf_with_features/*.csv'), reverse=True)
# 2. Old prediction files (metadata only)
prediction_files = sorted(glob.glob('predictions/*_Album_Recommendations.csv'), reverse=True)

processed_count = 0
ready_for_features = []
needs_features = []
special_char_albums = []

for album_info in new_albums_to_process:
    artist = album_info['artist']
    album = album_info['album']
    
    # Check for special characters
    special_chars = re.findall(r'[^\w\s\-\.\']', artist + album)
    if special_chars:
        special_char_albums.append({
            'artist': artist,
            'album': album,
            'chars': special_chars
        })
    
    found = False
    
    # Search in recent archives (has features)
    for archive_file in archive_files:
        try:
            archive_df = pd.read_csv(archive_file)
            tracks = find_album_tracks_safely(archive_df, artist, album)
            
            if len(tracks) > 0:
                # Found with features - can add directly
                ready_for_features.append({
                    'artist': artist,
                    'album': album,
                    'score': album_info['score'],
                    'tracks': tracks,
                    'source': 'archive',
                    'source_file': archive_file,
                    'special_chars': special_chars
                })
                found = True
                break
        except Exception as e:
            continue
    
    if not found:
        # Search in old predictions (metadata only)
        for pred_file in prediction_files:
            try:
                pred_df = pd.read_csv(pred_file)
                
                # Handle column name variations
                album_col = 'Album' if 'Album' in pred_df.columns else 'Album Name'
                artist_col = 'Artist' if 'Artist' in pred_df.columns else 'Artist Name(s)'
                
                # Use safe search
                if album_col in pred_df.columns and artist_col in pred_df.columns:
                    album_mask = pred_df[album_col].str.contains(re.escape(album), na=False, regex=True, case=False)
                    artist_mask = safe_contains(pred_df[artist_col], artist)
                    
                    album_rows = pred_df[album_mask & artist_mask]
                    
                    if len(album_rows) > 0:
                        # Found metadata but needs audio features
                        needs_features.append({
                            'artist': artist,
                            'album': album,
                            'score': album_info['score'],
                            'source': 'prediction',
                            'source_file': pred_file,
                            'special_chars': special_chars,
                            'prediction_data': album_rows.iloc[0].to_dict()
                        })
                        found = True
                        break
            except Exception as e:
                continue
    
    if not found:
        missing_albums.append({
            'artist': artist,
            'album': album,
            'special_chars': special_chars
        })

# Report findings
print("\n" + "=" * 60)
print("üìä SEARCH RESULTS")
print("=" * 60)

print(f"‚úÖ Ready to add (has audio features): {len(ready_for_features)} albums")
print(f"‚ö†Ô∏è  Needs audio features: {len(needs_features)} albums")
print(f"‚ùå Not found in any source: {len(missing_albums)} albums")

if special_char_albums:
    print(f"\n‚ö†Ô∏è  Albums with special characters ({len(special_char_albums)}):")
    print("-" * 40)
    for i, item in enumerate(special_char_albums):
        chars_str = ', '.join([f"'{c}'" for c in item['chars'][:3]])
        if len(item['chars']) > 3:
            chars_str += f" (+{len(item['chars']) - 3} more)"
        print(f"{i+1:2d}. {item['artist'][:20]:<20} - {item['album'][:20]:<20}")
        print(f"     Special chars: {chars_str}")

if ready_for_features:
    print(f"\nüéµ Albums ready for immediate addition:")
    print("-" * 40)
    for i, item in enumerate(ready_for_features[:10]):
        filename = os.path.basename(item['source_file'])
        char_note = "‚ö†Ô∏è" if item['special_chars'] else ""
        print(f"{i+1:2d}. {item['artist'][:22]:<22} - {item['album'][:22]:<22} {char_note}")
        print(f"     Score: {item['score']} | Tracks: {len(item['tracks'])} | Source: {filename[:15]}...")
    if len(ready_for_features) > 10:
        print(f"     ... and {len(ready_for_features) - 10} more")

if needs_features:
    print(f"\nüîß Albums needing feature fetching:")
    print("-" * 40)
    for i, item in enumerate(needs_features[:5]):
        char_note = "‚ö†Ô∏è" if item['special_chars'] else ""
        print(f"{i+1:2d}. {item['artist'][:22]:<22} - {item['album'][:22]:<22} {char_note}")
        print(f"     Score: {item['score']} | Source: prediction file")

if missing_albums:
    print(f"\n‚ùå Albums not found (need investigation):")
    print("-" * 40)
    for i, missing in enumerate(missing_albums[:5]):
        char_note = "‚ö†Ô∏è" if missing['special_chars'] else ""
        print(f"{i+1:2d}. {missing['artist'][:25]:<25} - {missing['album'][:25]:<25} {char_note}")
        if missing['special_chars']:
            chars_str = ', '.join([f"'{c}'" for c in missing['special_chars'][:3]])
            print(f"     Special chars: {chars_str}")
    if len(missing_albums) > 5:
        print(f"     ... and {len(missing_albums) - 5} more")

# Auto-add ready albums if we found any
if ready_for_features:
    print(f"\nüíæ Auto-adding {len(ready_for_features)} albums to training...")
    
    for item in ready_for_features:
        tracks = item['tracks'].copy()
        tracks['liked'] = item['score']
        tracks['source_type'] = 'gut_score_rated'
        tracks['gut_score_date'] = datetime.now().strftime('%Y-%m-%d')
        tracks['historical_source'] = 're_rated'
        tracks['special_chars_note'] = str(item['special_chars']) if item['special_chars'] else ''
        
        df_training = pd.concat([df_training, tracks], ignore_index=True)
        print(f"  ‚úì Added: {item['artist'][:25]:<25} - {item['album'][:25]:<25}")
    
    # Save updated training
    df_training.to_csv(training_file, index=False)
    print(f"\n‚úÖ Successfully added {len(ready_for_features)} albums to training!")
    
    # Show updated stats
    updated_training = pd.read_csv(training_file)
    gut_scored = updated_training[updated_training['source_type'] == 'gut_score_rated']
    print(f"üìà New totals: {len(gut_scored):,} gut-scored tracks out of {len(updated_training):,} total")

print("\n" + "=" * 60)
print("üéØ RECOMMENDED NEXT STEPS:")
print("=" * 60)
print("1. Add 'Historical Rating' page to Streamlit app")
print("2. Check artist/album names for special characters ($, ., *, etc.)")
print("3. Consider fuzzy matching for artist names with variations")
print("4. Create batch Spotify API fetcher for albums needing features")

print("\n‚úÖ CELL 3 COMPLETE (with regex escaping fix)")

üï∞Ô∏è CELL 3: HISTORICAL ALBUM PROCESSOR (FIXED VERSION)
üìä Total gut scores in master: 21

üéØ Identifying NEW albums (including historical)...
Found 1 albums needing processing

üîç Searching for album data (with regex escaping)...

üìä SEARCH RESULTS
‚úÖ Ready to add (has audio features): 1 albums
‚ö†Ô∏è  Needs audio features: 0 albums
‚ùå Not found in any source: 0 albums

‚ö†Ô∏è  Albums with special characters (1):
----------------------------------------
 1. A$AP Rocky           - Don't Be Dumb       
     Special chars: '$'

üéµ Albums ready for immediate addition:
----------------------------------------
 1. A$AP Rocky             - Don't Be Dumb          ‚ö†Ô∏è
     Score: 57.0 | Tracks: 17 | Source: 2026-01-16_nmf_...

üíæ Auto-adding 1 albums to training...
  ‚úì Added: A$AP Rocky                - Don't Be Dumb            

‚úÖ Successfully added 1 albums to training!
üìà New totals: 231 gut-scored tracks out of 10,350 total

üéØ RECOMMENDED NEXT STEPS:
1. Add 'Hi

In [5]:
import pandas as pd
import glob
import os
from datetime import datetime, timedelta
import random

def get_album_history(artist, album):
    """Check if album appears in training data"""
    try:
        training_file = 'data/2026_training_complete_with_features.csv'
        training_df = pd.read_csv(training_file)
        
        album_tracks = training_df[
            (training_df['Album Name'] == album) &
            (training_df['Artist Name(s)'].str.contains(artist, na=False))
        ]
        
        if len(album_tracks) == 0:
            return "Never Rated"
        
        source_type = album_tracks.iloc[0]['source_type']
        history_map = {
            'top_100_ranked': "Top 100",
            'honorable_mention': "Honorable Mention",
            'mid': "Mid Albums",
            'not_liked': "Not Liked",
            'gut_score_rated': "Gut Scored"
        }
        
        return history_map.get(source_type, "Unknown")
        
    except Exception as e:
        print(f"Error checking history for {artist} - {album}: {e}")
        return "Unknown"

def update_hidden_gems():
    """Find all Never Rated albums with predicted score ‚â•75"""
    print("üîç Searching for Hidden Gems...")
    
    # Get all prediction files (last 12 months)
    prediction_files = []
    max_date_obj = None
    
    for file in glob.glob('predictions/*_Album_Recommendations.csv'):
        try:
            date_str = os.path.basename(file).split('_')[0]
            date_obj = datetime.strptime(date_str, '%m-%d-%y')
            
            if max_date_obj is None or date_obj > max_date_obj:
                max_date_obj = date_obj
                
            prediction_files.append((date_obj, file))
        except Exception as e:
            print(f"Error processing file {file}: {e}")
            continue
    
    if not prediction_files:
        print("‚ùå No prediction files found!")
        return 0
    
    # Sort by date
    prediction_files.sort(key=lambda x: x[0], reverse=True)
    
    # Get cutoff date (12 months ago)
    cutoff_date = max_date_obj - timedelta(days=30*12)
    
    all_hidden_gems = []
    
    for date_obj, file in prediction_files:
        if date_obj < cutoff_date:
            continue
            
        print(f"  Checking {date_obj.strftime('%Y-%m-%d')}...")
        
        try:
            # Load predictions
            df = pd.read_csv(file)
            
            # Skip if empty
            if len(df) == 0:
                continue
            
            # Standardize columns
            if 'Album' in df.columns:
                df['Album Name'] = df['Album']
            elif 'Album Name' in df.columns:
                df['Album'] = df['Album Name']
            
            if 'Artist Name(s)' in df.columns:
                df['Artist'] = df['Artist Name(s)']
            
            if 'Predicted_Score' in df.columns:
                df['avg_score'] = df['Predicted_Score']
            elif 'avg_score' not in df.columns:
                print(f"    Warning: No score column in {file}")
                continue
            
            # Check each album
            for _, row in df.iterrows():
                score = row.get('Predicted_Score', row.get('avg_score', 0))
                
                if pd.isna(score) or score < 75:
                    continue
                
                artist = row.get('Artist', '')
                album_name = row.get('Album', '')
                
                if not artist or not album_name:
                    continue
                
                history = get_album_history(artist, album_name)
                
                if history == "Never Rated":
                    all_hidden_gems.append({
                        'Artist': artist,
                        'Album': album_name,
                        'Predicted_Score': float(score),
                        'Source_Week': date_obj.strftime('%Y-%m-%d'),
                        'Genres': row.get('Genres', ''),
                        'Label': row.get('Label', '')
                    })
        
        except Exception as e:
            print(f"    Error loading {file}: {e}")
            continue
    
    # Create DataFrame and save
    if all_hidden_gems:
        gems_df = pd.DataFrame(all_hidden_gems)
        
        # Remove duplicates (keep highest score if multiple entries)
        gems_df = gems_df.sort_values('Predicted_Score', ascending=False)
        gems_df = gems_df.drop_duplicates(subset=['Artist', 'Album'], keep='first')
        
        # Save to CSV
        os.makedirs('data', exist_ok=True)
        gems_df.to_csv('data/hidden_gems_cache.csv', index=False)
        print(f"‚úÖ Found {len(gems_df)} Hidden Gems! Saved to data/hidden_gems_cache.csv")
        
        # Also save a sample of 20 for quick access
        if len(gems_df) >= 20:
            sample_gems = gems_df.sample(n=20, random_state=42)
            sample_gems.to_csv('data/hidden_gems_sample.csv', index=False)
            print(f"üìä Also saved 20 random samples to data/hidden_gems_sample.csv")
        else:
            # If less than 20, just save all as sample
            gems_df.to_csv('data/hidden_gems_sample.csv', index=False)
            print(f"üìä Saved {len(gems_df)} samples to data/hidden_gems_sample.csv")
    else:
        print("‚ùå No Hidden Gems found!")
    
    return len(all_hidden_gems) if all_hidden_gems else 0

if __name__ == "__main__":
    update_hidden_gems()

üîç Searching for Hidden Gems...
  Checking 2026-01-16...
  Checking 2026-01-09...
  Checking 2025-10-17...
  Checking 2025-10-10...
  Checking 2025-09-26...
  Checking 2025-09-19...
  Checking 2025-09-12...
  Checking 2025-09-05...
  Checking 2025-08-29...
  Checking 2025-08-22...
  Checking 2025-08-15...
  Checking 2025-08-08...
  Checking 2025-08-01...
  Checking 2025-07-25...
  Checking 2025-07-18...
  Checking 2025-07-11...
  Checking 2025-06-27...
  Checking 2025-06-20...
  Checking 2025-06-13...
  Checking 2025-06-06...
  Checking 2025-05-30...
  Checking 2025-05-23...
  Checking 2025-05-16...
  Checking 2025-05-09...
  Checking 2025-05-02...
  Checking 2025-04-25...
  Checking 2025-04-18...
  Checking 2025-04-11...
  Checking 2025-04-04...
  Checking 2025-03-28...
  Checking 2025-03-21...
  Checking 2025-03-14...
  Checking 2025-03-07...
  Checking 2025-02-28...
  Checking 2025-02-21...
  Checking 2025-02-14...
‚úÖ Found 210 Hidden Gems! Saved to data/hidden_gems_cache.csv
üì