In [1]:
import graphviz
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

%matplotlib inline

# Read and clean dataset

In [2]:
# Load the raw Spotify dataset.
df = pd.read_csv("spotify-dataset.csv")

# Normalise the genre labels (trim surrounding whitespace, lower-case) so
# that spelling variants of the same genre are counted together.
df['Top Genre'] = df['Top Genre'].str.strip().str.lower()

# Drop genres that have fewer than 20 instances. The counts are computed once
# and all rare genres are removed with a single mask, rather than recounting
# and re-filtering the frame for every genre.
genre_counts = df['Top Genre'].value_counts()
rare_genres = genre_counts[genre_counts < 20].index
# .copy() detaches the filtered frame from the one it was sliced from, so the
# column assignments below do not trigger SettingWithCopyWarning.
df = df[~df['Top Genre'].isin(rare_genres)].copy()

# Store loudness as a non-negative magnitude.
# NOTE(review): this assumes all loudness values share the same sign; if the
# column mixes signs, abs() would merge distinct levels -- confirm on the data.
df['Loudness (dB)'] = df['Loudness (dB)'].abs()

# Remove the commas from the duration strings and convert them to numbers.
# regex=False makes the literal replacement explicit across pandas versions.
df['Length (Duration)'] = pd.to_numeric(
    df['Length (Duration)'].str.replace(',', '', regex=False)
)

# Drop the columns that are not used by the analysis (returns a new frame
# instead of mutating in place, so re-binding `df` is explicit).
df = df.drop(columns=['Index', 'Title', 'Artist', 'Year'])

In [3]:
# Show the cleaned frame; the rendered table is truncated in the middle.
df

Unnamed: 0,Top Genre,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Length (Duration),Acousticness,Speechiness,Popularity
0,adult standards,157,30,53,14,11,68,201,94,3,71
1,album rock,135,79,50,11,17,81,207,17,7,39
3,alternative metal,173,96,43,4,3,37,269,0,4,76
4,classic rock,106,82,58,5,10,87,256,1,3,59
6,pop,102,71,71,6,13,54,257,6,3,74
...,...,...,...,...,...,...,...,...,...,...,...
1987,adult standards,119,24,75,15,9,43,216,83,12,68
1988,adult standards,168,7,17,21,14,10,298,92,3,66
1989,adult standards,94,21,70,12,11,72,128,84,7,63
1990,adult standards,175,76,36,8,76,95,136,73,6,69


# Perform analysis

The analysis consists of a classification with a random forest that predicts each song's top genre from its remaining attributes.

### Prepare data

In [4]:
# Column to predict, and the columns used as predictors.
targets = ['Top Genre']
features = [
    'Beats Per Minute (BPM)',
    'Energy',
    'Danceability',
    'Loudness (dB)',
    'Liveness',
    'Valence',
    'Length (Duration)',
    'Acousticness',
    'Speechiness',
    'Popularity',
]

# Predictor frame and target frame, both taken from the cleaned dataset.
X = df.loc[:, features]
Y = df.loc[:, targets]

# Hold-out split with a fixed seed; the split proportions are left at the
# train_test_split defaults.
train_X, val_X, train_y, val_y = train_test_split(X, Y, random_state=1)

### Train model

In [5]:
# Random forest with 10 trees and a fixed seed so the fit is reproducible.
model = RandomForestClassifier(random_state=1, n_estimators=10, warm_start=True)

# train_y is a single-column frame; flatten it to a 1-D array because fit()
# expects y of shape (n_samples,) and warns when given a column vector.
# The fitted estimator is the cell's last expression, so its repr is displayed.
model.fit(train_X, train_y.to_numpy().ravel())

  


RandomForestClassifier(n_estimators=10, random_state=1, warm_start=True)

### Perform validation predictions

In [6]:
# Predict a label for every row of the held-out validation features using
# the fitted model.
validation_predictions = model.predict(val_X)

# Plot model statistics