# Importing the libraries

In [115]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import yaml
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [116]:
# Read the Spotify API credentials from a local YAML file. The context
# manager guarantees the file handle is closed once parsing is finished
# (the previous bare open() left it open for the life of the kernel).
with open("spotify.yaml") as stream:
    spotify_details = yaml.safe_load(stream)

# NOTE(review): the two keys use different casing ('Client_id' vs
# 'client_secret'); they must match spotify.yaml exactly — confirm.
auth_manager = SpotifyClientCredentials(
    client_id=spotify_details['Client_id'],
    client_secret=spotify_details['client_secret'],
)
sp = spotipy.client.Spotify(auth_manager=auth_manager)

# Importing the dataset

In [117]:
# Load the main table plus the audio, track and artist feature tables
# from CSV files in the working directory.
df = pd.read_csv('1m.csv')
audio_features, track_features, artist_features = (
    pd.read_csv(csv_name)
    for csv_name in ('audio_features.csv', 'track_features.csv', 'artist_features.csv')
)

  audio_features=pd.read_csv('audio_features.csv')
  track_features=pd.read_csv('track_features.csv')


In [118]:
# Remove the audio-feature columns that are not used further on.
# NOTE(review): 'duration_ms' is presumably dropped because `df` already
# carries a column of that name — confirm.
audio_features = audio_features.drop(
    columns=['type', 'id', 'track_href', 'analysis_url', 'duration_ms']
)

In [119]:
# Give the track-level popularity a distinct name so it cannot be confused
# with the artist-level popularity once the tables are merged.
track_features = track_features.rename(columns={"popularity": "track_popularity"})

In [120]:
# Give the artist-level popularity a distinct name so it cannot be confused
# with the track-level popularity once the tables are merged.
artist_features = artist_features.rename(columns={"popularity": "artist_popularity"})

In [121]:
# Prepend the URI scheme to the id columns; the merges below join these
# columns against df's track_uri / artist_uri.
# Caution: the prefix is added unconditionally, so running this cell a second
# time on the same frames would prepend it twice.
track_features['id'] = track_features['id'].radd('spotify:track:')
artist_features['Artist_uri'] = artist_features['Artist_uri'].radd('spotify:artist:')

# Merging all dataframes

In [122]:
# Attach the audio features; the inner join keeps only rows whose track_uri
# has a matching uri in audio_features.
df = df.merge(audio_features, how='inner', left_on="track_uri", right_on="uri")

In [123]:
# Row count after the audio-features merge.
print(df.shape[0])

2249255


In [124]:
# Attach the track features; the inner join keeps only rows whose track_uri
# has a matching id in track_features.
df = df.merge(track_features, how='inner', left_on="track_uri", right_on="id")

In [125]:
# Row count after the track-features merge.
print(df.shape[0])

1541771


In [126]:
# Attach the artist features; the inner join keeps only rows whose artist_uri
# has a matching Artist_uri in artist_features.
df = df.merge(artist_features, how='inner', left_on="artist_uri", right_on="Artist_uri")

In [127]:
# Row count after the artist-features merge.
print(df.shape[0])

1533971


# Dropping Unwanted Columns to Save Space

There were still 101 entries from the audio_features extraction and 576 from the track_features extraction that were missing from the Spotify API, so I had to drop them.

In [128]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis='index')

In [129]:
# Sanity check: total number of missing values left in the frame
# (per-column counts, then summed across columns).
df.isnull().sum(axis=0).sum()

0

In [130]:
# Drop columns that are not needed downstream, including the right-hand join
# keys ('Artist_uri', 'id', 'uri') left behind by the merges.
df = df.drop(columns=['Unnamed: 0', 'pos', 'Artist_uri', 'id', 'uri'])

In [131]:
# Display the column labels that remain in df.
df.columns

Index(['artist_name', 'track_uri', 'artist_uri', 'track_name', 'album_uri',
       'duration_ms', 'album_name', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acoustic', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature', 'release_date',
       'track_popularity', 'artist_popularity', 'genres'],
      dtype='object')

In [132]:
# Preview the first row of df.
df.head(n=1)

Unnamed: 0,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,danceability,energy,key,...,acoustic,instrumentalness,liveness,valence,tempo,time_signature,release_date,track_popularity,artist_popularity,genres
0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0.904,0.813,4.0,...,0.0311,0.00697,0.0471,0.81,125.461,4.0,2005-07-04,70,70,dance_pop hip_hop hip_pop neo_soul pop_rap r&b...


## Data Preprocessing

Create five-point buckets for track and artist popularity,

and 50-year buckets for the track release year.

In [133]:
# Make sure track_popularity is stored as an integer column.
df = df.astype({'track_popularity': int})

In [134]:
# Collapse both popularity scores into five-point buckets (e.g. 70 -> 14).
# Vectorized replacement for the former row-wise apply(lambda x: int(x/5)):
# true division followed by a float -> int64 cast truncates toward zero,
# which is exactly what int(x / 5) did, without a Python call per row.
df['track_popularity'] = (df['track_popularity'] / 5).astype('int64')
df['artist_popularity'] = (df['artist_popularity'] / 5).astype('int64')

In [135]:
# Reduce release_date to a coarse bucket: take the text before the first '-'
# (the year), then divide it by 50 and truncate (e.g. '2005-07-04' -> 40).
# Vectorized replacement for the two row-wise apply() passes; the int16
# intermediate and the truncating float -> int64 cast mirror the original
# astype('int16') followed by int(x / 50).
df['release_date'] = (
    df['release_date']
    .str.split('-', n=1).str[0]  # everything before the first '-'
    .astype('int16')
    .div(50)
    .astype('int64')
)

In [136]:
# Preview the first row again to check the bucketed columns.
df.head(n=1)

Unnamed: 0,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,danceability,energy,key,...,acoustic,instrumentalness,liveness,valence,tempo,time_signature,release_date,track_popularity,artist_popularity,genres
0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0.904,0.813,4.0,...,0.0311,0.00697,0.0471,0.81,125.461,4.0,40,14,14,dance_pop hip_hop hip_pop neo_soul pop_rap r&b...


In [137]:
# Write the processed table to disk, leaving out the row index.
df.to_csv(path_or_buf='1M_unique_processed_data.csv', index=False)