In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# --- BILLBOARD SCRAPING FUNCTION ---
def scrape_billboard_hot_100():
    """Scrape the current Billboard Hot 100 chart"""
    print("Scraping Billboard Hot 100...")
    
    url = 'https://www.billboard.com/charts/hot-100/'
    
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        billboard_data = []
        
        # Find all chart items - this selector might need updating if Billboard changes their layout
        chart_items = soup.find_all('div', class_='o-chart-results-list-row-container')
        
        if not chart_items:
            print("Warning: Could not find chart items. Billboard site structure may have changed.")
            print("Please inspect the page and update the CSS selectors.")
            return None
        
        for item in chart_items:
            try:
                # Try to find rank
                rank_elem = item.find('span', class_='c-label')
                rank = rank_elem.get_text(strip=True) if rank_elem else 'N/A'
                
                # Try to find title
                title_elem = item.find('h3', class_='c-title')
                title = title_elem.get_text(strip=True) if title_elem else 'N/A'
                
                # Try to find artist
                artist_elem = item.find('span', class_='c-label')
                artist = artist_elem.get_text(strip=True) if artist_elem else 'N/A'
                
                # Clean up the data
                if rank.isdigit() and title != 'N/A' and artist != 'N/A':
                    billboard_data.append({
                        'billboard_rank': int(rank),
                        'title': title,
                        'artist': artist,
                        'source': 'Billboard'
                    })
                    
            except Exception as e:
                print(f"Error parsing chart item: {e}")
                continue
        
        print(f"Successfully scraped {len(billboard_data)} Billboard Hot 100 tracks")
        return pd.DataFrame(billboard_data)
        
    except Exception as e:
        print(f"Error scraping Billboard: {e}")
        return None

# --- COMPARISON VISUALIZATION ---
def create_comparison_visualization(spotify_df, billboard_df):
    """Create visualizations comparing Spotify and Billboard data"""
    
    if spotify_df is None or billboard_df is None:
        print("Cannot create comparison - missing data")
        return
    
    # Merge the datasets on title and artist (this is approximate)
    # In a real scenario, you'd need fuzzy matching
    merged_df = pd.merge(
        spotify_df, 
        billboard_df, 
        on=['title', 'artist'], 
        how='inner', 
        suffixes=('_spotify', '_billboard')
    )
    
    if merged_df.empty:
        print("No common tracks found between Spotify and Billboard")
        return
    
    print(f"Found {len(merged_df)} common tracks between Spotify and Billboard")
    
    # 1. Ranking Comparison Scatter Plot
    plt.figure(figsize=(12, 8))
    plt.scatter(merged_df['spotify_rank'], merged_df['billboard_rank'], alpha=0.7)
    
    # Add trend line
    z = np.polyfit(merged_df['spotify_rank'], merged_df['billboard_rank'], 1)
    p = np.poly1d(z)
    plt.plot(merged_df['spotify_rank'], p(merged_df['spotify_rank']), "r--")
    
    plt.xlabel('Spotify Global Rank')
    plt.ylabel('Billboard Hot 100 Rank')
    plt.title('Spotify vs Billboard Rankings Comparison\n(Lower numbers = higher ranking)', fontweight='bold')
    plt.gca().invert_xaxis()
    plt.gca().invert_yaxis()
    
    # Add labels for some points
    for i, row in merged_df.head(10).iterrows():
        plt.annotate(f"{row['title'][:15]}...", 
                    (row['spotify_rank'], row['billboard_rank']),
                    xytext=(5, 5), textcoords='offset points', fontsize=8)
    
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('spotify_vs_billboard_rankings.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # 2. Audio Features Comparison
    if all(col in merged_df.columns for col in ['danceability', 'energy', 'valence']):
        # Calculate averages for both datasets
        spotify_avg = spotify_df[['danceability', 'energy', 'valence']].mean()
        billboard_spotify_avg = merged_df[['danceability', 'energy', 'valence']].mean()
        
        comparison_data = pd.DataFrame({
            'All Spotify Tracks': spotify_avg,
            'Billboard Hot 100 on Spotify': billboard_spotify_avg
        })
        
        plt.figure(figsize=(10, 6))
        comparison_data.plot(kind='bar', ax=plt.gca())
        plt.title('Audio Features: All Spotify vs Billboard Hot 100 Tracks', fontweight='bold')
        plt.ylabel('Average Value')
        plt.xticks(rotation=0)
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig('audio_features_comparison.png', dpi=300, bbox_inches='tight')
        plt.show()

# --- UPDATED MAIN FUNCTION ---
def main_complete():
    """Complete main function with both Spotify and Billboard"""
    print("="*60)
    print("MUSIC DATA ANALYSIS: SPOTIFY API + BILLBOARD SCRAPING")
    print("="*60)
    
    # 1. Get Billboard data (scraping)
    print("\n1. Scraping Billboard Hot 100...")
    billboard_df = scrape_billboard_hot_100()
    
    # 2. Get Spotify data (API)
    print("\n2. Getting Spotify data...")
    # [Insert all the Spotify API code from previous working version here]
    # This would include get_spotify_token(), get_playlist_tracks(), etc.
    
    # For now, let's assume we have a function get_spotify_data() that returns a DataFrame
    spotify_df = get_spotify_data()  # You'd replace this with your actual Spotify code
    
    if spotify_df is not None and billboard_df is not None:
        # 3. Save both datasets
        spotify_df.to_csv('spotify_data.csv', index=False)
        billboard_df.to_csv('billboard_data.csv', index=False)
        print("\n3. Data saved to spotify_data.csv and billboard_data.csv")
        
        # 4. Create comparison visualizations
        print("\n4. Creating comparison visualizations...")
        create_comparison_visualization(spotify_df, billboard_df)
        
        # 5. Create individual visualizations
        print("\n5. Creating individual visualizations...")
        create_all_visualizations()  # From previous visualization code
        
    elif spotify_df is not None:
        # Only Spotify data available
        spotify_df.to_csv('spotify_data.csv', index=False)
        print("\n3. Spotify data saved to spotify_data.csv")
        print("4. Creating Spotify visualizations...")
        create_all_visualizations()
        
    elif billboard_df is not None:
        # Only Billboard data available
        billboard_df.to_csv('billboard_data.csv', index=False)
        print("\n3. Billboard data saved to billboard_data.csv")
        print("Billboard chart preview:")
        print(billboard_df.head())
        
    else:
        print("No data was collected from either source.")

# --- QUICK BILLBOARD TEST ---
def test_billboard_scraping():
    """Test the Billboard scraping function"""
    print("Testing Billboard scraping...")
    df = scrape_billboard_hot_100()
    
    if df is not None:
        print("Billboard data preview:")
        print(df.head(10))
        print(f"\nTotal tracks scraped: {len(df)}")
        
        # Save the test data
        df.to_csv('billboard_test.csv', index=False)
        print("Data saved to billboard_test.csv")
    else:
        print("Billboard scraping failed")

# Run the test
if __name__ == "__main__":
    # First test just the Billboard scraping
    test_billboard_scraping()
    
    # If Billboard works, you can then run the complete analysis:
    # main_complete()

Testing Billboard scraping...
Scraping Billboard Hot 100...
Successfully scraped 100 Billboard Hot 100 tracks
Billboard data preview:
   billboard_rank          title artist     source
0               1         Golden      1  Billboard
1               2       Ordinary      2  Billboard
2               3          Tears      3  Billboard
3               4       Manchild      4  Billboard
4               5      Your Idol      5  Billboard
5               6       Soda Pop      6  Billboard
6               7    What I Want      7  Billboard
7               8    Love Me Not      8  Billboard
8               9  How It's Done      9  Billboard
9              10   Lose Control     10  Billboard

Total tracks scraped: 100
Data saved to billboard_test.csv


In [4]:
import requests
import base64
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from bs4 import BeautifulSoup

# --- CONFIGURATION ---
CLIENT_ID = '5f857178968840c6ac260b716ccccb4e'
CLIENT_SECRET = '95e7fed22a1849b7b2d9c1880877a2f6'
SPOTIFY_PLAYLIST_ID = '0oD5aLzUQiIomu805F3xDH'  # Your working playlist
BILLBOARD_URL = 'https://www.billboard.com/charts/hot-100/'

# --- SPOTIFY API FUNCTIONS ---
def get_spotify_token():
    """Get Spotify access token"""
    auth_url = 'https://accounts.spotify.com/api/token'
    auth_header = base64.b64encode(f"{CLIENT_ID}:{CLIENT_SECRET}".encode()).decode()
    auth_data = {'grant_type': 'client_credentials'}
    auth_headers = {
        'Authorization': f'Basic {auth_header}',
        'Content-Type': 'application/x-www-form-urlencoded'
    }

    response = requests.post(auth_url, data=auth_data, headers=auth_headers)
    response.raise_for_status()
    return response.json()['access_token']

def get_spotify_global_top_50(access_token):
    """Get Spotify Global Top 50 playlist data"""
    url = f'https://api.spotify.com/v1/playlists/{SPOTIFY_PLAYLIST_ID}/tracks'
    headers = {'Authorization': f'Bearer {access_token}'}
    params = {'limit': 50, 'offset': 0}
    all_tracks = []
    
    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()
        
        for idx, item in enumerate(data['items']):
            track_info = item['track']
            if track_info and track_info.get('id'):
                track_data = {
                    'spotify_rank': idx + 1,
                    'title': track_info['name'],
                    'artist': ', '.join([artist['name'] for artist in track_info['artists']]),
                    'track_id': track_info['id'],
                    'popularity': track_info['popularity'],
                    'duration_ms': track_info['duration_ms'],
                    'explicit': track_info['explicit']
                }
                all_tracks.append(track_data)
        
        return pd.DataFrame(all_tracks)
        
    except Exception as e:
        print(f"Error fetching Spotify data: {e}")
        return None

def get_audio_features(access_token, track_ids):
    """Get audio features for tracks"""
    url = 'https://api.spotify.com/v1/audio-features'
    headers = {'Authorization': f'Bearer {access_token}'}
    all_audio_features = []
    
    valid_track_ids = [tid for tid in track_ids if tid]
    
    for i in range(0, len(valid_track_ids), 100):
        chunk = valid_track_ids[i:i+100]
        ids_param = ','.join(chunk)
        
        try:
            response = requests.get(f'{url}?ids={ids_param}', headers=headers)
            response.raise_for_status()
            features_data = response.json()['audio_features']
            
            for feature in features_data:
                if feature is not None:
                    all_audio_features.append(feature)
            
            time.sleep(0.1)
            
        except Exception as e:
            print(f"Warning: Failed to get audio features for chunk: {e}")
            continue
    
    return all_audio_features

# --- BILLBOARD SCRAPING ---
def scrape_billboard_hot_100():
    """Scrape Billboard Hot 100"""
    print("Scraping Billboard Hot 100...")
    
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(BILLBOARD_URL, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        billboard_data = []
        
        # Updated CSS selectors for Billboard
        chart_items = soup.find_all('li', class_='lrv-u-width-100p')
        
        for item in chart_items:
            try:
                # Try to find rank
                rank_elem = item.find('span', class_='c-label')
                rank = rank_elem.get_text(strip=True) if rank_elem else None
                
                # Try to find title and artist
                title_elem = item.find('h3', class_='c-title')
                artist_elem = item.find('span', class_='c-label.a-no-trucate')
                
                if rank and title_elem and artist_elem:
                    title = title_elem.get_text(strip=True)
                    artist = artist_elem.get_text(strip=True)
                    
                    billboard_data.append({
                        'billboard_rank': int(rank),
                        'title': title,
                        'artist': artist,
                        'source': 'Billboard'
                    })
                    
            except Exception as e:
                continue
        
        print(f"Successfully scraped {len(billboard_data)} Billboard tracks")
        return pd.DataFrame(billboard_data)
        
    except Exception as e:
        print(f"Error scraping Billboard: {e}")
        return None

# --- COMPARISON ANALYSIS FUNCTIONS ---
def clean_text(text):
    """Clean text for better matching"""
    if pd.isna(text):
        return ""
    return str(text).lower().replace('&', 'and').strip()

def find_common_tracks(spotify_df, billboard_df):
    """Find tracks that appear on both charts"""
    # Create cleaned versions for matching
    spotify_df['title_clean'] = spotify_df['title'].apply(clean_text)
    spotify_df['artist_clean'] = spotify_df['artist'].apply(clean_text)
    
    billboard_df['title_clean'] = billboard_df['title'].apply(clean_text)
    billboard_df['artist_clean'] = billboard_df['artist'].apply(clean_text)
    
    # Find matches
    common_tracks = []
    for _, billboard_row in billboard_df.iterrows():
        matches = spotify_df[
            (spotify_df['title_clean'] == billboard_row['title_clean']) &
            (spotify_df['artist_clean'] == billboard_row['artist_clean'])
        ]
        
        if not matches.empty:
            spotify_row = matches.iloc[0]
            common_tracks.append({
                'title': billboard_row['title'],
                'artist': billboard_row['artist'],
                'spotify_rank': spotify_row['spotify_rank'],
                'billboard_rank': billboard_row['billboard_rank'],
                'popularity': spotify_row.get('popularity', None)
            })
    
    return pd.DataFrame(common_tracks)

def create_comparison_visualizations(spotify_df, billboard_df, common_df):
    """Create visualizations comparing the two charts"""
    
    print("\n" + "="*50)
    print("CREATING COMPARISON VISUALIZATIONS")
    print("="*50)
    
    # 1. Common tracks count
    plt.figure(figsize=(10, 6))
    labels = ['Only Spotify', 'Only Billboard', 'Both Charts']
    values = [
        len(spotify_df) - len(common_df),
        len(billboard_df) - len(common_df),
        len(common_df)
    ]
    
    plt.bar(labels, values, color=['blue', 'red', 'green'])
    plt.title('Track Overlap: Spotify Global vs Billboard Hot 100', fontweight='bold')
    plt.ylabel('Number of Tracks')
    
    # Add value labels
    for i, v in enumerate(values):
        plt.text(i, v + 0.5, str(v), ha='center', va='bottom')
    
    plt.tight_layout()
    plt.savefig('chart_overlap.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # 2. Ranking comparison scatter plot
    if not common_df.empty:
        plt.figure(figsize=(12, 8))
        plt.scatter(common_df['spotify_rank'], common_df['billboard_rank'], alpha=0.7, s=100)
        
        # Add trend line
        z = np.polyfit(common_df['spotify_rank'], common_df['billboard_rank'], 1)
        p = np.poly1d(z)
        plt.plot(common_df['spotify_rank'], p(common_df['spotify_rank']), "r--", alpha=0.8)
        
        # Add labels for top tracks
        for _, row in common_df.head(10).iterrows():
            plt.annotate(f"{row['title'][:15]}...", 
                        (row['spotify_rank'], row['billboard_rank']),
                        xytext=(5, 5), textcoords='offset points', fontsize=8)
        
        plt.xlabel('Spotify Global Rank (Lower = Better)')
        plt.ylabel('Billboard Hot 100 Rank (Lower = Better)')
        plt.title('Ranking Comparison: Common Tracks on Both Charts', fontweight='bold')
        plt.gca().invert_xaxis()
        plt.gca().invert_yaxis()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig('ranking_comparison.png', dpi=300, bbox_inches='tight')
        plt.show()
        
        # 3. Correlation analysis
        correlation = common_df['spotify_rank'].corr(common_df['billboard_rank'])
        print(f"Rank correlation coefficient: {correlation:.3f}")
        
        if correlation > 0.5:
            print("Strong positive correlation: Tracks popular on one chart tend to be popular on the other")
        elif correlation < -0.5:
            print("Strong negative correlation: Tracks popular on one chart tend to be less popular on the other")
        else:
            print("Weak correlation: Popularity on one chart doesn't strongly predict popularity on the other")
    
    # 4. Top overlapping artists
    if not common_df.empty:
        artist_counts = common_df['artist'].value_counts().head(10)
        plt.figure(figsize=(12, 6))
        artist_counts.plot(kind='bar', color='purple', alpha=0.7)
        plt.title('Artists with Most Tracks on Both Charts', fontweight='bold')
        plt.xlabel('Artist')
        plt.ylabel('Number of Tracks')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig('top_overlapping_artists.png', dpi=300, bbox_inches='tight')
        plt.show()

# --- MAIN COMPARISON ANALYSIS ---
def run_complete_analysis():
    """Run the complete Spotify vs Billboard analysis"""
    
    print("="*60)
    print("GLOBAL VS US MUSIC TRENDS: SPOTIFY vs BILLBOARD")
    print("="*60)
    
    # 1. Get Spotify data
    print("\n1. Fetching Spotify Global Top 50...")
    try:
        token = get_spotify_token()
        spotify_df = get_spotify_global_top_50(token)
        
        if spotify_df is not None:
            # Get audio features
            audio_features = get_audio_features(token, spotify_df['track_id'].tolist())
            if audio_features:
                audio_df = pd.DataFrame(audio_features)
                audio_df.rename(columns={'id': 'track_id'}, inplace=True)
                spotify_df = spotify_df.merge(audio_df, on='track_id', how='left')
            
            print(f"✓ Spotify data: {len(spotify_df)} tracks")
            spotify_df.to_csv('spotify_global_top_50.csv', index=False)
        else:
            print("✗ Failed to get Spotify data")
            return
            
    except Exception as e:
        print(f"✗ Spotify API error: {e}")
        return
    
    # 2. Get Billboard data
    print("\n2. Scraping Billboard Hot 100...")
    billboard_df = scrape_billboard_hot_100()
    
    if billboard_df is not None:
        print(f"✓ Billboard data: {len(billboard_df)} tracks")
        billboard_df.to_csv('billboard_hot_100.csv', index=False)
    else:
        print("✗ Failed to get Billboard data")
        # Continue with just Spotify data for some visualizations
        billboard_df = pd.DataFrame()
    
    # 3. Find common tracks
    print("\n3. Analyzing chart overlap...")
    if not spotify_df.empty and not billboard_df.empty:
        common_df = find_common_tracks(spotify_df, billboard_df)
        print(f"✓ Found {len(common_df)} tracks on both charts")
        
        if not common_df.empty:
            common_df.to_csv('common_tracks.csv', index=False)
            
            # 4. Create comparison visualizations
            create_comparison_visualizations(spotify_df, billboard_df, common_df)
            
            # 5. Show insights
            print("\n" + "="*50)
            print("KEY INSIGHTS")
            print("="*50)
            print(f"Spotify Global Top 50 tracks: {len(spotify_df)}")
            print(f"Billboard Hot 100 tracks: {len(billboard_df)}")
            print(f"Tracks appearing on both charts: {len(common_df)}")
            print(f"Overlap percentage: {(len(common_df)/min(len(spotify_df), len(billboard_df)))*100:.1f}%")
            
            # Show top common tracks
            print("\nTop tracks on both charts:")
            top_common = common_df.sort_values('spotify_rank').head(5)
            for _, row in top_common.iterrows():
                print(f"#{row['spotify_rank']} Spotify / #{row['billboard_rank']} Billboard - {row['title']} by {row['artist']}")
        
        else:
            print("No common tracks found between the charts")
    
    else:
        print("Insufficient data for comparison analysis")
    
    print("\n" + "="*60)
    print("ANALYSIS COMPLETE!")
    print("Check the generated CSV files and PNG visualizations")
    print("="*60)

# Run the complete analysis
if __name__ == "__main__":
    run_complete_analysis()

GLOBAL VS US MUSIC TRENDS: SPOTIFY vs BILLBOARD

1. Fetching Spotify Global Top 50...
✓ Spotify data: 50 tracks

2. Scraping Billboard Hot 100...
Scraping Billboard Hot 100...
Successfully scraped 0 Billboard tracks
✓ Billboard data: 0 tracks

3. Analyzing chart overlap...
Insufficient data for comparison analysis

ANALYSIS COMPLETE!
Check the generated CSV files and PNG visualizations
