In [None]:
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv

# --- Clean Initialization ---
load_dotenv()
client_id = os.getenv("SPOTIPY_CLIENT_ID")
client_secret = os.getenv("SPOTIPY_CLIENT_SECRET")

# Initialize the spotipy object
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

# --- Test the connection ---
try:
    track = sp.track('3n3Ppam7vgaVa1iaRUc9Lp')
    print("🚀 SUCCESS! Spotify connection is working inside the notebook.")
    print(f"   -> Fetched: {track['name']}")
except Exception as e:
    print(f"❌ FAILED: Still an issue within the notebook environment.")
    print(f"   -> Error: {e}")

# You can now use the 'sp' object in the rest of your notebook.

In [None]:
import os

# This will list all files and folders in the directory your notebook is running from
print(os.listdir())

In [None]:
import pandas as pd

def filter_famous_tracks(input_csv_path, output_csv_path, popularity_threshold=43):
    """
    Reads a CSV file containing Spotify track data, filters it to keep only
    tracks with popularity above a certain threshold, and saves the result.

    Args:
        input_csv_path (str): Path to the input CSV file.
        output_csv_path (str): Path where the filtered CSV file will be saved.
        popularity_threshold (int): The minimum popularity score (0-100) 
                                     for a track to be considered "famous". 
                                     Defaults to 70.
    """
    print(f"Loading data from '{input_csv_path}'...")
    try:
        df = pd.read_csv(input_csv_path)
    except FileNotFoundError:
        print(f"❌ ERROR: Input file not found at '{input_csv_path}'.")
        return
    except Exception as e:
        print(f"❌ ERROR: Could not read the CSV file. {e}")
        return

    # Check if the 'popularity' column exists
    if 'popularity' not in df.columns:
        print("❌ ERROR: The input CSV file does not contain a 'popularity' column.")
        return
        
    # Ensure popularity is numeric, handling potential errors
    df['popularity'] = pd.to_numeric(df['popularity'], errors='coerce')
    original_count = len(df)
    df = df.dropna(subset=['popularity']) # Remove rows where popularity couldn't be converted
    
    print(f"Filtering tracks with popularity >= {popularity_threshold}...")
    famous_tracks_df = df[df['popularity'] >= popularity_threshold]
    
    filtered_count = len(famous_tracks_df)
    
    if filtered_count > 0:
        print(f"Found {filtered_count} famous tracks (out of {original_count} total).")
        try:
            famous_tracks_df.to_csv(output_csv_path, index=False)
            print(f"✅ Famous tracks saved successfully to '{output_csv_path}'.")
        except Exception as e:
            print(f"❌ ERROR: Could not save the filtered CSV file. {e}")
    else:
        print("ℹ️ No tracks met the popularity threshold.")

# --- Define file paths and threshold ---
input_file = 'spotify_tracks_distinct_2021_2025.csv'
output_file = 'famous_spotify_tracks_2021_2025.csv'
popularity_level = 43 # Adjust this threshold if you want (e.g., 60 for more songs, 80 for very famous)

# --- Run the filtering function ---
filter_famous_tracks(input_file, output_file, popularity_level)


In [None]:
import pandas as pd

def filter_favorite_genre_and_popularity(input_csv_path, output_csv_path, popularity_threshold=70):
    """
    Reads a CSV file containing Spotify track data, finds the genre with the
    highest average popularity, filters tracks by that genre and a popularity 
    threshold, removes the genre column, and saves the result.

    Args:
        input_csv_path (str): Path to the input CSV file.
        output_csv_path (str): Path where the filtered CSV file will be saved.
        popularity_threshold (int): The minimum popularity score (0-100) 
                                     for a track to be kept after genre filtering. 
                                     Defaults to 70.
    """
    print(f"Loading data from '{input_csv_path}'...")
    try:
        df = pd.read_csv(input_csv_path)
    except FileNotFoundError:
        print(f"❌ ERROR: Input file not found at '{input_csv_path}'.")
        return
    except Exception as e:
        print(f"❌ ERROR: Could not read the CSV file. {e}")
        return

    # Check required columns exist
    required_cols = ['popularity', 'genre']
    if not all(col in df.columns for col in required_cols):
        print(f"❌ ERROR: The input CSV file must contain 'popularity' and 'genre' columns.")
        return
        
    # Ensure popularity is numeric, handling potential errors
    df['popularity'] = pd.to_numeric(df['popularity'], errors='coerce')
    original_count = len(df)
    # Remove rows where popularity couldn't be converted or genre is missing
    df = df.dropna(subset=['popularity', 'genre']) 
    valid_count = len(df)
    print(f"Processing {valid_count} valid rows (out of {original_count} total).")

    # --- Find the most favorite genre based on average popularity ---
    print("Calculating average popularity per genre...")
    genre_popularity = df.groupby('genre')['popularity'].mean()
    
    if genre_popularity.empty:
        print("❌ ERROR: Could not calculate genre popularity. No valid genre data found.")
        return
        
    favorite_genre = genre_popularity.idxmax()
    highest_avg_popularity = genre_popularity.max()
    print(f"Most favorite genre (highest avg popularity): '{favorite_genre}' (Avg Pop: {highest_avg_popularity:.2f})")
    
    # --- Filter by the favorite genre ---
    print(f"Filtering tracks belonging to genre '{favorite_genre}'...")
    genre_filtered_df = df[df['genre'] == favorite_genre].copy() # Use .copy() to avoid SettingWithCopyWarning
    
    genre_filtered_count = len(genre_filtered_df)
    if genre_filtered_count == 0:
        print(f"ℹ️ No tracks found for the favorite genre '{favorite_genre}'.")
        return
    print(f"Found {genre_filtered_count} tracks in the favorite genre.")

    # --- Filter by popularity threshold ---
    print(f"Filtering these tracks with popularity >= {popularity_threshold}...")
    final_filtered_df = genre_filtered_df[genre_filtered_df['popularity'] >= popularity_threshold]
    
    filtered_count = len(final_filtered_df)
    
    if filtered_count > 0:
        print(f"Found {filtered_count} tracks meeting both criteria.")
        
        # --- Remove the genre column ---
        print("Removing the 'genre' column...")
        final_filtered_df = final_filtered_df.drop(columns=['genre'])
        
        try:
            final_filtered_df.to_csv(output_csv_path, index=False)
            print(f"✅ Filtered tracks saved successfully to '{output_csv_path}'.")
        except Exception as e:
            print(f"❌ ERROR: Could not save the filtered CSV file. {e}")
    else:
        print(f"ℹ️ No tracks in the favorite genre ('{favorite_genre}') met the popularity threshold >= {popularity_threshold}.")

# --- Define file paths and threshold ---
input_file = 'spotify_tracks_distinct_2021_2025.csv'
# Updated output file name to reflect the new logic
output_file = 'favorite_genre_popular_tracks_no_genre.csv' 
# You can adjust this threshold for the popularity filter applied *after* genre selection
popularity_level = 43 

# --- Run the filtering function ---
filter_favorite_genre_and_popularity(input_file, output_file, popularity_level)



In [None]:
import pandas as pd

# Define the path to your CSV file
input_file = 'spotify_tracks_distinct_2021_2025.csv'

try:
    # Load the dataset
    df = pd.read_csv(input_file)

    # Check if the 'genre' column exists
    if 'genre' in df.columns:
        # Get the unique genres and convert to a list
        unique_genres = df['genre'].dropna().unique().tolist()
        
        # Sort the list alphabetically for easier reading
        unique_genres.sort()
        
        print("Unique genres found in the dataset:")
        # Print each genre for better readability
        for genre in unique_genres:
            print(f"- {genre}")
            
        print(f"\nTotal unique genres: {len(unique_genres)}")
        
    else:
        print("❌ ERROR: The CSV file does not contain a 'genre' column.")

except FileNotFoundError:
    print(f"❌ ERROR: Input file not found at '{input_file}'.")
except Exception as e:
    print(f"❌ ERROR: Could not read or process the CSV file. {e}")

In [None]:
import pandas as pd

def filter_by_genres_and_popularity(input_csv_path, output_csv_path, genres_to_keep, popularity_threshold=70):
    """
    Reads a CSV file containing Spotify track data, filters tracks belonging 
    to a specified list of genres and meeting a popularity threshold, 
    and saves the result including the genre column.

    Args:
        input_csv_path (str): Path to the input CSV file.
        output_csv_path (str): Path where the filtered CSV file will be saved.
        genres_to_keep (list): A list of genre strings to keep.
        popularity_threshold (int): The minimum popularity score (0-100) 
                                     for a track to be kept after genre filtering. 
                                     Defaults to 70.
    """
    print(f"Loading data from '{input_csv_path}'...")
    try:
        df = pd.read_csv(input_csv_path)
    except FileNotFoundError:
        print(f"❌ ERROR: Input file not found at '{input_csv_path}'.")
        return
    except Exception as e:
        print(f"❌ ERROR: Could not read the CSV file. {e}")
        return

    # Check required columns exist
    required_cols = ['popularity', 'genre']
    if not all(col in df.columns for col in required_cols):
        print(f"❌ ERROR: The input CSV file must contain 'popularity' and 'genre' columns.")
        return
        
    # Ensure popularity is numeric, handling potential errors
    df['popularity'] = pd.to_numeric(df['popularity'], errors='coerce')
    original_count = len(df)
    # Remove rows where popularity couldn't be converted or genre is missing
    df = df.dropna(subset=['popularity', 'genre']) 
    valid_count = len(df)
    print(f"Processing {valid_count} valid rows (out of {original_count} total).")

    # --- Filter by the specified list of genres ---
    print(f"Filtering tracks belonging to genres: {', '.join(genres_to_keep)}...")
    # Keep rows where the 'genre' is in the provided list
    genre_filtered_df = df[df['genre'].isin(genres_to_keep)].copy() # Use .copy()
    
    genre_filtered_count = len(genre_filtered_df)
    if genre_filtered_count == 0:
        print(f"ℹ️ No tracks found for the specified genres.")
        return
    print(f"Found {genre_filtered_count} tracks in the specified genres.")

    # --- Filter by popularity threshold ---
    print(f"Filtering these tracks with popularity >= {popularity_threshold}...")
    final_filtered_df = genre_filtered_df[genre_filtered_df['popularity'] >= popularity_threshold]
    
    filtered_count = len(final_filtered_df)
    
    if filtered_count > 0:
        print(f"Found {filtered_count} tracks meeting both criteria.")
        
        # --- Keep the genre column (removed the .drop() line) ---
        print("Keeping the 'genre' column...")
        # final_filtered_df = final_filtered_df.drop(columns=['genre']) # This line was removed/commented out
        
        try:
            final_filtered_df.to_csv(output_csv_path, index=False)
            print(f"✅ Filtered tracks saved successfully to '{output_csv_path}'.")
        except Exception as e:
            print(f"❌ ERROR: Could not save the filtered CSV file. {e}")
    else:
        print(f"ℹ️ No tracks in the specified genres met the popularity threshold >= {popularity_threshold}.")

# --- Define file paths, genres, and threshold ---
input_file = 'spotify_tracks_distinct_2021_2025.csv'
# Updated output file name for clarity (reflecting genre is kept)
output_file = 'selected_genres_popular_tracks_with_genre.csv' 
# List of genres to keep (Removed 'dance' and 'rock')
selected_genres = ['pop', 'k-pop', 'electronic', 'indie']
# Popularity threshold applied after genre selection
popularity_level = 43 

# --- Run the filtering function ---
filter_by_genres_and_popularity(input_file, output_file, selected_genres, popularity_level)



In [1]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('spotify_tracks.csv')

# Define the languages to keep
languages_to_keep = ['English', 'Mandarin', 'Indonesia']

# Filter the dataframe to keep only songs in the specified languages
filtered_df = df[df['language'].isin(languages_to_keep)]

# Save the filtered dataframe to a new CSV file
filtered_df.to_csv('spotify_tracks_filtered.csv', index=False)

print(f"Original dataset had {len(df)} songs")
print(f"Filtered dataset has {len(filtered_df)} songs")
print(f"Removed {len(df) - len(filtered_df)} songs")

# Show language distribution in the filtered dataset
print("\nLanguage distribution in filtered dataset:")
print(filtered_df['language'].value_counts())

FileNotFoundError: [Errno 2] No such file or directory: 'spotify_tracks.csv'

In [2]:
file1_path = 'high_popularity_spotify_data.csv'
file2_path = 'spotifydataset.csv'

# Load the two files into dataframes
df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)

# Combine the two dataframes into one
combined_df = pd.concat([df1, df2], ignore_index=True)

# Show some basic information about the combined dataframe
combined_df_info = combined_df.info()

# Display the first few rows of the combined dataset
combined_df_head = combined_df.head()

combined_df_info, combined_df_head

FileNotFoundError: [Errno 2] No such file or directory: 'high_popularity_spotify_data.csv'

In [None]:
file3_path = 'spotify_tracks_filtered.csv'

# Load the file into a dataframe
df3 = pd.read_csv(file3_path)

# Combine the previously combined dataframe with this new dataframe
final_combined_df = pd.concat([combined_df, df3], ignore_index=True)

# Show some basic information about the final combined dataframe
final_combined_df_info = final_combined_df.info()

# Display the first few rows of the final combined dataset
final_combined_df_head = final_combined_df.head()

final_combined_df_info, final_combined_df_head

In [None]:
final_combined_df_path = 'final_combined_spotify_data.csv'
final_combined_df.to_csv(final_combined_df_path, index=False)

final_combined_df_path

In [3]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier

In [4]:
model = joblib.load('random_forest_mood_model_update.joblib')  


data = pd.read_csv('final_combined_spotify_data.csv')

  data = pd.read_csv('final_combined_spotify_data.csv')


In [5]:
from sklearn.preprocessing import StandardScaler
# Select the relevant features (same ones used during training)
features_to_include = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
                       'instrumentalness', 'liveness', 'valence', 'tempo']

# Prepare the feature data (X)
X_new = data[features_to_include]

# Scale the features (same scaling as done in training)
scaler = StandardScaler()
X_new_scaled = scaler.fit_transform(X_new)

# Make predictions on the new dataset
predictions = model.predict(X_new_scaled)

# Add the predictions (mood labels) to the original dataset
data['Predicted_Mood'] = predictions

# Save the updated dataset with predicted mood labels
data.to_csv('predicted_mood_output_new_data.csv', index=False)

# Display the first few rows of the dataset with the predicted mood
print(data.head())

   energy    tempo  danceability playlist_genre  loudness  liveness  valence  \
0   0.592  157.969         0.521            pop    -7.777     0.122    0.535   
1   0.507  104.978         0.747            pop   -10.171     0.117    0.438   
2   0.808  108.548         0.554            pop    -4.169     0.159    0.372   
3   0.910  112.966         0.670            pop    -4.070     0.304    0.786   
4   0.783  149.027         0.777            pop    -4.477     0.355    0.939   

            track_artist  time_signature  speechiness  ...  artist_url  \
0  Lady Gaga, Bruno Mars             3.0       0.0304  ...         NaN   
1          Billie Eilish             4.0       0.0358  ...         NaN   
2          Gracie Abrams             4.0       0.0368  ...         NaN   
3      Sabrina Carpenter             4.0       0.0634  ...         NaN   
4       ROSÉ, Bruno Mars             4.0       0.2600  ...         NaN   

  album_name release_date explicit year popularity artwork_url track_url  

In [6]:
import pandas as pd

# Load the dataset with predictions (adjust the path if needed)
new_data = pd.read_csv('predicted_mood_output_new_data.csv')

# Mapping of numeric labels to mood labels
mood_mapping = {
    0: 'Calm',
    1: 'Happy',
    2: 'Sad',
    3: 'Energetic'
}

# Convert the 'Predicted_Mood' column from numeric to mood labels
new_data['Predicted_Mood'] = new_data['Predicted_Mood'].map(mood_mapping)

# Save the updated dataset with mood labels
new_data.to_csv('predicted_mood_with_labels.csv', index=False)

# Display the first few rows to check the update
print(new_data.head())


  new_data = pd.read_csv('predicted_mood_output_new_data.csv')


   energy    tempo  danceability playlist_genre  loudness  liveness  valence  \
0   0.592  157.969         0.521            pop    -7.777     0.122    0.535   
1   0.507  104.978         0.747            pop   -10.171     0.117    0.438   
2   0.808  108.548         0.554            pop    -4.169     0.159    0.372   
3   0.910  112.966         0.670            pop    -4.070     0.304    0.786   
4   0.783  149.027         0.777            pop    -4.477     0.355    0.939   

            track_artist  time_signature  speechiness  ...  artist_url  \
0  Lady Gaga, Bruno Mars             3.0       0.0304  ...         NaN   
1          Billie Eilish             4.0       0.0358  ...         NaN   
2          Gracie Abrams             4.0       0.0368  ...         NaN   
3      Sabrina Carpenter             4.0       0.0634  ...         NaN   
4       ROSÉ, Bruno Mars             4.0       0.2600  ...         NaN   

  album_name release_date explicit year popularity artwork_url track_url  

In [8]:
file4_path = 'MusicMoodFinal.csv'
file5_path = 'predicted_mood_with_labels.csv'
# Load the file into a dataframe
df4 = pd.read_csv(file4_path)
df5 = pd.read_csv(file5_path)
# Combine the previously combined dataframe with this new dataframe
song_df = pd.concat([df4, df5], ignore_index=True)

# Show some basic information about the final combined dataframe
song_df_info = song_df.info()

# Display the first few rows of the final combined dataset
song_df_head = song_df.head()

song_df_info, song_df_head

  df5 = pd.read_csv(file5_path)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612750 entries, 0 to 612749
Data columns (total 48 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   id                        588358 non-null  object 
 1   name                      586601 non-null  object 
 2   popularity                610064 non-null  float64
 3   duration_ms               612750 non-null  float64
 4   explicit                  587672 non-null  object 
 5   artists                   586672 non-null  object 
 6   id_artists                586672 non-null  object 
 7   release_date              587672 non-null  object 
 8   danceability              612750 non-null  float64
 9   energy                    612750 non-null  float64
 10  key                       612750 non-null  float64
 11  loudness                  612750 non-null  float64
 12  mode                      612750 non-null  float64
 13  speechiness               612750 non-null  f

(None,
                        id                                 name  popularity  \
 0  35iwgR4jXetI318WEWsa1Q                                Carve         6.0   
 1  021ht4sdgPcrDgSk7JTbKY  Capítulo 2.16 - Banquero Anarquista         0.0   
 2  07A5yehtSnoedViJAZkNnc   Vivo para Quererte - Remasterizado         0.0   
 3  08FmqUhxtyLTn6pAh6bk45        El Prisionero - Remasterizado         0.0   
 4  08y9GfoqCWfOGsKdwojr5e                  Lady of the Evening         0.0   
 
    duration_ms explicit              artists                  id_artists  \
 0     126903.0        0              ['Uli']  ['45tIt06XoI0Iio4LBEVpls']   
 1      98200.0        0  ['Fernando Pessoa']  ['14jtPCOoNZwquk5wd9DxrY']   
 2     181640.0        0  ['Ignacio Corsini']  ['5LiOoJbxVSAMkBS2fUm3X2']   
 3     176907.0        0  ['Ignacio Corsini']  ['5LiOoJbxVSAMkBS2fUm3X2']   
 4     163080.0        0      ['Dick Haymes']  ['3BiJGZsyX9sJchTqcSA7Su']   
 
   release_date  danceability  energy  ...  genres  f

In [11]:
song_df_path = 'full_song_list.csv'
song_df.to_csv(song_df_path, index=False)

song_df_path




'full_song_list.csv'