# Importing the libraries

In [2]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import yaml
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [3]:
# Read the Spotify API credentials from the YAML file and build the client
# used by the extraction cells further down.
# NOTE(review): the two keys are cased differently ('Client_id' vs
# 'client_secret'); they must match the YAML file exactly — confirm.
with open("../../spotify.yaml") as stream:  # closes the handle once parsed
    spotify_details = yaml.safe_load(stream)
auth_manager = SpotifyClientCredentials(client_id=spotify_details['Client_id'],
                                        client_secret=spotify_details['client_secret'])
sp = spotipy.client.Spotify(auth_manager=auth_manager)

# Importing the dataset

In [None]:
# These datasets come from manual downloads; they are read in the order
# playlist data, artist features, audio features, track features.
df, artist_features, audio_features, track_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../1mV3.csv',
        '../../artist_features.csv',
        '../../audio_features.csv',
        '../../track_features.csv',
    )
)

# Merging all dataframes

In [None]:
# Attach the audio features to each track; the outer join keeps rows that
# have no match on either side (their missing columns become NaN).
df = df.merge(
    audio_features,
    how='outer',
    left_on="track_uri",
    right_on="id",
)

In [None]:
# Attach the track features to each track; the outer join keeps rows that
# have no match on either side (their missing columns become NaN).
df = df.merge(
    track_features,
    how='outer',
    left_on="track_uri",
    right_on="Track_uri",
)

In [None]:
# Attach the artist features via the artist URI; the outer join keeps rows
# that have no match on either side (their missing columns become NaN).
df = df.merge(
    artist_features,
    how='outer',
    left_on="artist_uri",
    right_on="Artist_uri",
)

# Handling missing data 

In [None]:
# Show, for every column of the merged frame, how many values are missing.
df.isna().sum()

## Handling audio_features missing From extraction

In [None]:
# Unique track URIs that found no audio-features match in the merge
# ('id' is the audio_features join key, so NaN there means "no match").
# Rows that exist only on the right-hand side of an outer merge have a NaN
# track_uri; drop those so NaN is never sent to the API as a track id.
missing_t_uri = df.loc[df['id'].isna(), 'track_uri'].dropna().unique()
# Randomise, in place, the order in which the tracks will be requested.
random.shuffle(missing_t_uri)

In [None]:
# Re-request audio features for every URI in `missing_t_uri` and append the
# results to the audio-features CSV. Failures are logged and skipped so one
# bad track does not abort the run.
# NOTE(review): rows are appended without a header, so this assumes the API's
# field order matches the columns already in the CSV — confirm.

# The error log is opened in append mode inside the handler; make sure its
# directory exists so the handler itself cannot fail with FileNotFoundError.
os.makedirs("error-logs", exist_ok=True)

# `with` guarantees the CSV handle is closed even on an unexpected error.
with open('../../audio_features.csv', 'a') as f:
    for uri in tqdm(missing_t_uri):
        try:
            # One URI per request, as in the original retry pass.
            track_feature = sp.audio_features([uri])
            # The API yields None for a track it has no features for; writing
            # that through a DataFrame would append an empty junk row.
            found = [entry for entry in (track_feature or []) if entry is not None]
            if not found:
                continue
            track_df = pd.DataFrame(found)
            csv_data = track_df.to_csv(header=False, index=False)
            f.write(csv_data)
        except Exception as e:
            # Best-effort extraction: record the failure, pause, move on.
            with open("error-logs/extract_log0.txt", "a") as r:
                r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(e)+'\n')
            time.sleep(1)

## Handling track_features missing From extraction

In [None]:
# Unique track URIs that found no track-features match in the merge
# ('Track_uri' is the track_features join key, so NaN there means "no match").
# Rows that exist only on the right-hand side of an outer merge have a NaN
# track_uri; drop those so NaN is never sent to the API as a track id.
missing_t_uri = df.loc[df['Track_uri'].isna(), 'track_uri'].dropna().unique()
# Randomise, in place, the order in which the tracks will be requested.
random.shuffle(missing_t_uri)

In [None]:
# Re-request release date and popularity for every URI in `missing_t_uri` and
# append one "uri,release_date,popularity" row per track to the
# track-features CSV. Failures are logged and skipped.

# The error log is opened in append mode inside the handler; make sure its
# directory exists so the handler itself cannot fail with FileNotFoundError.
os.makedirs("error-logs", exist_ok=True)

# `with` guarantees the CSV handle is closed even on an unexpected error.
with open('../../track_features.csv', 'a') as f:
    for uri in tqdm(missing_t_uri):
        try:
            # Keep the API response in its own name: the original reused
            # `track_features`, which overwrote the DataFrame of that name
            # that the merge cell relies on.
            response = sp.tracks([uri])
            # One URI was requested, so the single result is at index 0.
            # If the entry is None the subscripting below raises and the
            # track is logged by the handler.
            track = response['tracks'][0]
            track_pop = pd.DataFrame([uri])
            track_pop['release_date'] = track['album']['release_date']
            track_pop['pop'] = track["popularity"]
            csv_data = track_pop.to_csv(header=False, index=False)
            f.write(csv_data)
        except Exception as e:
            # Best-effort extraction: record the failure, pause, move on.
            with open("error-logs/extract_log.txt", "a") as r:
                r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(e)+'\n')
            time.sleep(1)

# Dropping Unwanted Columns to Save Space

There were still 101 audio_features rows and 576 track_features rows that could not be retrieved from the Spotify API, so I had to drop them.

In [None]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis='index', how='any', inplace=True)

In [None]:
# Show the total number of missing cells across the whole frame.
df.isna().sum().sum()

In [None]:
# Show the column labels of the merged frame.
df.columns

In [None]:
# Remove, in place, the right-hand join keys duplicated by the merges
# ('Track_uri', 'Artist_uri', 'id') together with the other unwanted columns.
unwanted_columns = [
    'Track_uri',
    'Artist_uri',
    'type',
    'id',
    'uri',
    'track_href',
    'analysis_url',
]
df.drop(columns=unwanted_columns, inplace=True)

In [None]:
# Show the first row of the frame.
df.head(1)

## Data Preprocessing

Bucket track and artist popularity into five-point bins.

Bucket the track release year into 50-year bins.

In [None]:
def _five_point_bucket(score):
    """Return the index of the five-point bucket that *score* falls in."""
    return int(score / 5)


# Replace both popularity columns with their bucket index, track first.
for popularity_column in ('Track_pop', 'Artist_pop'):
    df[popularity_column] = df[popularity_column].apply(_five_point_bucket)

In [None]:
# Take the text before the first '-' of each release date as the year and
# store it as int16.
release_year = (
    df['Track_release_date']
    .apply(lambda stamp: stamp.split('-')[0])
    .astype('int16')
)
# Replace the column with the index of the 50-year bucket the year falls in.
df['Track_release_date'] = release_year.apply(lambda year: int(year / 50))

In [None]:
# Show the first row of the frame.
df.head(1)

In [None]:
# Write the processed frame to disk as CSV, leaving out the index column.
df.to_csv(
    path_or_buf='../../1M_unique_processed_data.csv',
    index=False,
)