In [None]:
!pip install -q pandas scikit-learn
import numpy as np
import pandas as pd
# Load the track table from the CSV file that sits next to the notebook
spotify_data = pd.read_csv(filepath_or_buffer='spotify_music_data.csv')

# Preview the first five rows as the cell output
spotify_data.head(n=5)

In [None]:
# Quick data-quality audit of the loaded frame.

# Per-column count of null entries
missing_values = spotify_data.isna().sum()

# Number of rows that exactly repeat an earlier row
duplicates = spotify_data.duplicated(keep='first').sum()

# dtype of every column
data_types = spotify_data.dtypes

# Display all three results together as the cell output
(missing_values, duplicates, data_types)

In [None]:
# Remove every row that has at least one missing value.
# The frame is modified in place, so `spotify_data` itself shrinks.
spotify_data.dropna(axis=0, how='any', inplace=True)

# Cast the 'explicit' column to integers
spotify_data['explicit'] = spotify_data['explicit'].astype(int)

# Per-column null counts after cleaning, to confirm none are left
# (the integer cast above does not affect these counts)
missing_values_after = spotify_data.isna().sum()

# Display the null counts together with the new dtype of 'explicit'
(missing_values_after, spotify_data['explicit'].dtype)

In [None]:
# Print a concise summary of the cleaned frame (columns, non-null counts, dtypes)
spotify_data.info()

In [None]:
# Selecting a random subset of at most 5000 rows.
# DataFrame.sample draws without replacement by default and raises ValueError
# when n exceeds the number of rows, so the request is capped at the number of
# rows that remain after cleaning. With 5000 or more rows available the result
# is the same as asking for n=5000 directly.
subset_size = min(5000, len(spotify_data))
subset_songs = spotify_data.sample(n=subset_size, random_state=42)
subset_songs.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
# Numeric audio attributes used to describe each track
features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]
scaler = StandardScaler()

# Pull the feature columns out of the sampled subset
subset_songs_features = subset_songs.loc[:, features]

# Fit the scaler on the subset and standardize every feature column in one step
subset_songs_features_scaled = scaler.fit_transform(subset_songs_features)

# Pairwise cosine similarity between all songs of the subset:
# entry [i, j] compares the i-th and j-th rows of `subset_songs`
subset_similarity_matrix = cosine_similarity(subset_songs_features_scaled)
subset_similarity_matrix

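This matrix holds the cosine similarity between every pair of songs in our subset, computed on the standardized audio features. A value close to 1 means two songs have very similar feature profiles, a value near 0 means their profiles are largely unrelated, and a value close to -1 means their profiles are opposite (very low similarity). Each song has a similarity of 1 with itself, so the diagonal of the matrix is 1.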

In [None]:
# Selecting a random song from the subset by its index label
random_song_index = np.random.choice(subset_songs.index, 1)[0]
random_song = subset_songs.loc[random_song_index]

# Translate the index label into the row position used by the similarity matrix
subset_index = subset_songs.index.get_loc(random_song_index)
similarities = subset_similarity_matrix[subset_index]

# Rank candidates on a copy so the similarity matrix itself stays untouched.
# The chosen song is excluded explicitly instead of assuming it sorts last:
# if another row ties with it at the maximum (e.g. a track with identical
# features) or rounding leaves its self-similarity marginally below a
# neighbour's, slicing off "the last element" would drop a real neighbour and
# recommend the chosen song to itself.
candidate_scores = similarities.copy()
candidate_scores[subset_index] = -np.inf

# At most five neighbours, fewer when the subset itself is tiny
num_neighbors = min(5, len(candidate_scores) - 1)
top_similar_indices = np.argsort(candidate_scores)[::-1][:num_neighbors]

# Display the chosen song followed by its most similar songs (best match first)
random_song[['artists', 'track_name']], subset_songs.iloc[top_similar_indices][['artists', 'track_name']]

In [None]:
import numpy as np
# Dimensions of the simulated user-song interaction matrix
num_users = 100
num_songs = len(subset_songs)

# Fix the global NumPy seed so the simulated data is identical on every run
np.random.seed(42)

# One row per user and one column per song; every entry is a play count drawn
# from the integers 0..10 (the upper bound 11 is exclusive)
interaction_matrix = np.random.randint(low=0, high=11, size=(num_users, num_songs))

# Wrap the array in a DataFrame whose columns are labelled with the track names
interaction_df = pd.DataFrame(data=interaction_matrix, columns=subset_songs['track_name'])
interaction_df.head()