In [130]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib import pyplot
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier  # For classification
from sklearn.ensemble import RandomForestRegressor  # For regression
from sklearn.metrics import accuracy_score 
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## Process Data and Feature Extraction

In [131]:
#read in dataset, drop the duplicate rows where a song appears twice for same date with multiple artists
df = pd.read_csv("Spotify_Dataset_V3.csv", delimiter = ";")
df = df.drop_duplicates(subset = ['Title', 'Artists', 'Date'], keep = 'first')
df = df.reset_index(drop = True)

#convert date to datetime and sort in nondescending order
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date')

#find a songs highest rank every in dataset (min value)
df['Highest Rank'] = df.groupby(['Title', 'Artists'])['Rank'].transform('min')

  df['Date'] = pd.to_datetime(df['Date'])


In [132]:
#create new columns number of songs previous and previous popularity. 
#Track if an artist is ever considered top ten (i.e. any of an artist's song rank reached <= 10)
#Track the number of songs an artist previously has, since we don't have an artist id, we will assume the artist name is unique
#At each row, update the new column based on the data we have to that point. This way we do not use future information to
#explain past results
artist_top_ten = {}
artist_songs_previous = {}
for index, row in df.iterrows():
    artist = row['Artists']
    song = row["Title"]
    if artist not in artist_top_ten:
        artist_top_ten[artist] = False
    if row["Rank"] <= 10 and (song in artist_songs_previous.get(artist, [])):
        artist_top_ten[artist] = True
    if artist_top_ten[artist] == True: 
        df.at[index, "Previous Popularity"] = 1
    else:
        df.at[index, "Previous Popularity"] = 0
        
    
    if artist in artist_songs_previous:
        if song not in artist_songs_previous[artist]:
            number = len(artist_songs_previous[artist])
            df.at[index, "Number Songs Previous"] = number 
            artist_songs_previous[artist].append(song)
        else:
            number = len(artist_songs_previous[artist])
            df.at[index, "Number Songs Previous"] = number
    else:
            # If artist is not in the dictionary, create a new entry for the artist
            artist_songs_previous[artist] = [song]
            df.at[index, "Number Songs Previous"] = 0 


In [133]:
#number of artists for a song
for index, row in df.iterrows():
    artists = row['Artists'].split(", ")
    df.at[index, "Number of Artists"] = len(artists)

In [134]:
#set top 1 true or false if highest rank is <= 1
df['Top 10'] = df['Highest Rank'].apply(lambda x: True if x <= 10 else False)

#only keep one song in df as grouped by title and artists keeping the first
df = df.drop_duplicates(subset = ['Title', 'Artists'], keep = 'first')
df = df.reset_index(drop = True)

#drop first month
df = df[df['Date'] >= "2017-02-01"]

In [135]:
#get numerical features and apply standard scaler to them
numerical_features = df.columns[4:11]
scaler = StandardScaler()
df_scaled_numerical = scaler.fit_transform(df[numerical_features])

In [136]:
#add in the scaled data to df and reset index
df.loc[:, numerical_features] = df_scaled_numerical
df = df.reset_index(drop = True)

In [137]:
#Track an artists previous average top rank of all their songs. We append the top rank of each song to the artist in dictionary
#and calculate the mean of the top ranks in the artists list to that point so we are still in respect to time
previous_song_average = {}
for index, row in df.iterrows():
    if row["Artists"] not in previous_song_average:
        previous_song_average[row["Artists"]] = []
        
    previous_song_average[row["Artists"]].append(row["Highest Rank"])
    df.at[index, "Previous Top Rank Avg"] = np.mean(previous_song_average[row["Artists"]])
    
#rename df to the cleaned version
df_clean = df


In [138]:
df_clean.head(3)

Unnamed: 0,Rank,Title,Artists,Date,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,Points (Total),Points (Ind for each Artist/Nat),id,Song URL,Highest Rank,Previous Popularity,Number Songs Previous,Number of Artists,Top 10,Previous Top Rank Avg
0,197,I Miss You,"Grey, Bahari",2017-02-01,-0.167946,0.749917,0.197372,-0.314832,0.523079,-0.167422,...,4,2.0,1sl5tSzfK1JrcQIcA8rYDy,https://open.spotify.com/track/1sl5tSzfK1JrcQI...,175,0.0,0.0,2.0,False,175.0
1,188,Location,Khalid,2017-02-01,0.408735,-1.086565,-1.768777,2.427089,0.371378,-0.167422,...,13,13.0,152lZdxL1OR0ZMW6KquMif,https://open.spotify.com/track/152lZdxL1OR0ZMW...,33,0.0,0.0,1.0,False,33.0
2,199,Magic,"Thomas Gold, Jillian Edwards",2017-02-01,-0.308764,1.437869,0.193593,-0.720736,-0.746421,-0.155459,...,2,1.0,5abwHwUhywLGNli5NDcEN7,https://open.spotify.com/track/5abwHwUhywLGNli...,168,0.0,0.0,2.0,False,168.0


In [None]:
#save clean dataset to a csv file
df_clean.to_csv("df_clean_top10.csv", index=False)