In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from itertools import product
import time
%matplotlib inline

In [None]:
# Load the Spotify genres dataset from the Kaggle input directory.
DATA_PATH = '../input/dataset-of-songs-in-spotify/genres_v2.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Summarize the raw frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Dataset dimensions, followed by the number of missing values in each column.
print(f'There are {df.shape[0]} rows and {df.shape[1]} columns in dataset.\n')
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Drop columns the analysis does not use.
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so a second
# execution would otherwise raise KeyError for the already-removed columns.
unused_columns = ['title', 'Unnamed: 0', 'id', 'uri', 'track_href', 'analysis_url']
df = df.drop(columns=unused_columns, errors='ignore')

# Drop rows that are exact duplicates once those columns are gone.
df = df.drop_duplicates()

print(f'There are {df.shape[0]} rows and {df.shape[1]} columns in dataset.\n')
df.isnull().sum()

## EDA

In [None]:
# How many distinct genre labels exist, followed by the labels themselves.
print(f"Number of genres in given dataset: {len(df['genre'].unique())}\n")

genre_labels = df.loc[:, "genre"].unique()
genre_labels

In [None]:
# Number of rows per genre, most frequent first.
df['genre'].value_counts()

In [None]:
# Interactive histogram: number of rows per genre.
import plotly.express as px
genre_column = df['genre']
px.histogram(genre_column)

In [None]:
# Top 10 genre pie chart
df_genre = df['genre'].value_counts().head(10)

# `labels` in plotly express is a {column name: display label} mapping, not a
# sequence of slice names. The slice names already come from `names`, so the
# mis-typed `labels` argument is left out.
fig = px.pie(
    df_genre,
    names=df_genre.index,
    values=df_genre.values,
    title='Distribution of popular genre',
)
fig.show()

## Preprocess the Data

In [None]:
# Keep the first 11 columns of the cleaned frame as the base feature set.
# NOTE(review): the selection is positional, so it depends on the CSV column order.
base_columns = df.columns[:11]
df_x = df[base_columns]
df_x.head()

In [None]:
# New frame: the base features plus the target and two further columns from df.
# assign() returns a fresh frame, so df_x itself is left untouched.
df_new = df_x.assign(
    genre=df['genre'],
    time_signature=df['time_signature'],
    duration_ms=df['duration_ms'],
)

df_new.head()

In [None]:
# Express track length in minutes instead of milliseconds.
# The source column is read from `df` (same index as df_new) and the drop uses
# errors='ignore', so re-running the cell after duration_ms is gone does not fail.
df_new['duration_min'] = df['duration_ms'] / 60000
df_new = df_new.drop(columns='duration_ms', errors='ignore')

In [None]:
# Display the feature frame after the duration conversion.
df_new

### Correlation analysis

In [None]:
# Author's observation: no pair of features is highly correlated, so none are dropped.
from sklearn.preprocessing import LabelEncoder
# Work on a separate frame in which genre is replaced by integer codes so it can
# take part in the correlation matrix.
genre_codes = LabelEncoder().fit_transform(df_new['genre'])
df2Corr = df_new.assign(genre=genre_codes)
corrMx = df2Corr.corr()
corrMx.style.background_gradient(cmap="RdBu_r")

In [None]:
fig, ax = plt.subplots(figsize=(12, 12))
corr = df2Corr.corr()
# Mask the upper triangle (diagonal included) so only the lower half is drawn.
upper_mask = np.triu(np.ones(corr.shape))
sns.heatmap(
    corr,
    mask=upper_mask,
    annot=True,
    fmt='.1f',
    cmap="Blues",
    linewidths=0.4,
    linecolor="white",
    ax=ax,
)
plt.show()

### Feature Scaling and Normalization

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the genre labels into a new target column, then show class sizes.
genre_encoder = LabelEncoder()
df_new['genre_enco'] = genre_encoder.fit_transform(df_new['genre'])
df_new['genre_enco'].value_counts()

In [None]:
# Features: every column except the raw and encoded genre. Target: the encoded genre.
X1 = df_new.drop(['genre', 'genre_enco'], axis=1)
Y1 = df_new.loc[:, "genre_enco"]
print(X1.shape)

In [None]:
# Standardize the features with StandardScaler.
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# NOTE(review): the scaler is fitted on all rows, before any train/test split.
scaler = StandardScaler()
X1_std = scaler.fit_transform(X1)

### SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic samples (and every score computed from them) are
# reproducible across runs.
smote = SMOTE(random_state=1)
# Resample from the untouched label column instead of Y1: Y1 is overwritten by
# this very statement, so feeding it back in would fail with a length mismatch
# when the cell is executed a second time.
# NOTE(review): oversampling happens before the train/test split, so synthetic
# points interpolated from future test rows can end up in the training set.
# Consider splitting first and resampling the training split only.
X1, Y1 = smote.fit_resample(X1_std, df_new["genre_enco"])

print(X1.shape)

## Model

In [None]:
# Shuffle and hold out 20% of the oversampled data for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    Y1,
    test_size=.2,
    shuffle=True,
    random_state=1,
)

In [None]:
# Shapes of the training and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep="\n")

In [None]:
# Results table: the first row is the header; model cells further down append
# [model name, accuracy] rows to it.
scoring = [['model','accuracy']]

### Building models

In [None]:
def buildModel(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    X_train, y_train : data passed to ``model.fit``.
    X_test, y_test : data used for prediction and for the accuracy score.

    Returns
    -------
    dict with two keys: ``"predict"`` (the predictions for ``X_test``) and
    ``"accuracy"`` (``accuracy_score`` of those predictions against ``y_test``).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    result = {"predict": predictions}
    result["accuracy"] = accuracy_score(y_test, predictions)
    return result

In [None]:
def modelCrossValidation(X, Y, algo):
    """Cross-validate ``algo`` on (X, Y) and return the mean of several metrics.

    Uses 5-fold shuffled ``KFold`` with a fixed seed. All metrics are scored in
    one ``cross_validate`` pass, so every fold is fitted once rather than once
    per metric.

    Parameters
    ----------
    X, Y : features and target, passed through to scikit-learn unchanged.
    algo : estimator to evaluate.

    Returns
    -------
    dict mapping each metric name ('accuracy', 'balanced_accuracy',
    'f1_weighted', 'f1_macro') to its mean test score rounded to 4 decimals.
    """
    # Local import: the notebook's import cell does not bring in cross_validate.
    from sklearn.model_selection import cross_validate

    validation = KFold(n_splits=5, shuffle=True, random_state=1)
    statsNames = ['accuracy', 'balanced_accuracy', 'f1_weighted', 'f1_macro']

    # With a list of scorers the result holds one 'test_<name>' array per metric.
    cv_results = cross_validate(algo, X, Y, cv=validation, scoring=statsNames, n_jobs=-1)
    return {sname: round(cv_results[f'test_{sname}'].mean(), 4) for sname in statsNames}

### Feature Importance

In [None]:
# Feature importance from an XGBoost classifier, plotted by feature position.
from numpy import loadtxt
from xgboost import XGBClassifier
from matplotlib import pyplot

# Fit a default XGBoost model on the training data.
model = XGBClassifier()
model.fit(X_train, y_train)

# One importance score per input feature.
xgb_importance = model.feature_importances_
print(xgb_importance)

# Bar chart: x is the feature position, y is its importance.
positions = range(len(xgb_importance))
pyplot.bar(positions, xgb_importance)
pyplot.show()

In [None]:
# Horizontal bar chart of the XGBoost importances, sorted ascending and labelled
# with the feature names (all df_new columns except the two genre columns).
importance = model.feature_importances_
indices = np.argsort(importance)
feature_names = np.array(df_new.columns.drop(['genre', 'genre_enco']))

fig, ax = plt.subplots()
bar_positions = range(len(importance))
ax.barh(bar_positions, importance[indices])
ax.set_yticks(bar_positions)
_ = ax.set_yticklabels(feature_names[indices])

### Model Selection

#### LogisticRegression

In [None]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression
# max_iter is raised to 1000 to give the solver more iterations to converge.
model_LR = LogisticRegression(max_iter=1000)
res_LR = buildModel(model_LR, X_train, X_test, y_train, y_test)

# Record the score so this model shows up in the final comparison table,
# the same way the KNN / RandomForest / SVC / Bagging cells do.
scoring.append(['LogisticRegression', res_LR["accuracy"]])
print(res_LR["accuracy"])

#### Naive Bayes

In [None]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
model_NB = GaussianNB()
res_NB = buildModel(model_NB, X_train, X_test, y_train, y_test)

# Record the score so this model shows up in the final comparison table,
# the same way the KNN / RandomForest / SVC / Bagging cells do.
scoring.append(['NaiveBayes', res_NB["accuracy"]])
print(res_NB["accuracy"])

#### K-Nearest Neighbors

In [None]:
# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
model_KNN = KNeighborsClassifier(n_neighbors=4)
res_KNN = buildModel(model_KNN, X_train, X_test, y_train, y_test)

scoring.append(['KNeighbors', res_KNN["accuracy"]])
# Show the row that was just appended. A fixed index would display a different
# model's row whenever another result precedes this one in `scoring`.
scoring[-1]

In [None]:
# Confusion matrix of the KNN predictions on the test split.
cm = confusion_matrix(y_test, res_KNN['predict'])
# plt.subplots returns a (figure, axes) pair: unpack it and draw on that axes
# explicitly instead of relying on the implicit "current axes".
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label");

#### Decision Tree

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
model_DT = DecisionTreeClassifier(max_depth=10, min_samples_split=10, random_state=42)
res_DT = buildModel(model_DT, X_train, X_test, y_train, y_test)

# Record the score so this model shows up in the final comparison table,
# the same way the KNN / RandomForest / SVC / Bagging cells do.
scoring.append(['DecisionTree', res_DT["accuracy"]])
print(res_DT["accuracy"])

In [None]:
# Importance scores learned by the decision tree, one per input feature.
importance = model_DT.feature_importances_
# Print each feature position together with its score.
for position, weight in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, weight))

In [None]:
# plot feature importance (decision tree), sorted ascending
indices = np.argsort(importance)

# The model was trained on df_new without 'genre' and 'genre_enco', so the tick
# labels must come from that same column set. Indexing df_new.columns directly
# shifts the names and labels bars with the target columns.
feature_names = np.array(df_new.drop(columns=['genre', 'genre_enco']).columns)

fig, ax = plt.subplots()
ax.barh(range(len(importance)), importance[indices])
ax.set_yticks(range(len(importance)))
_ = ax.set_yticklabels(feature_names[indices])

#### Random Forest

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

k = 5
model_RF = RandomForestClassifier(n_estimators=200, max_depth=30, random_state=3)

# k-fold cross-validation on the training split. These are scores on held-out
# folds, not training accuracy, so they are labelled accordingly.
cv_score = cross_val_score(model_RF, X_train, y_train, cv=k)
print('Cross_val Scores: ', cv_score)
print("Cross-validation Accuracy(average):", cv_score.mean())

# Test accuracy: refit on the whole training split, score on the test split.
clf_RF = model_RF.fit(X_train, y_train)
y_pred = clf_RF.predict(X_test)
score_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", score_accuracy)
scoring.append(['RandomForest', score_accuracy])

In [None]:
# Importance scores learned by the random forest, one per input feature.
importance = model_RF.feature_importances_
# Print each feature position together with its score.
for position, weight in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, weight))
# Bar chart of the same scores, indexed by feature position.
pyplot.bar(list(range(len(importance))), importance)
pyplot.show()

In [None]:
# Horizontal bar chart of the current `importance` array, sorted ascending and
# labelled with the feature names (df_new columns minus the two genre columns).
indices = np.argsort(importance)
feature_names = np.array(df_new.columns.drop(['genre', 'genre_enco']))

fig, ax = plt.subplots()
bar_positions = range(len(importance))
ax.barh(bar_positions, importance[indices])
ax.set_yticks(bar_positions)
_ = ax.set_yticklabels(feature_names[indices])

In [None]:
# Per-class precision / recall / F1 for the current y_pred.
# NOTE(review): y_pred is a shared global overwritten by each model cell; run this
# right after the Random Forest cell so it reports that model.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the current y_pred.
# NOTE(review): y_pred is a shared global overwritten by each model cell; run this
# right after the Random Forest cell so it shows that model.
cm = confusion_matrix(y_test, y_pred)
# plt.subplots returns a (figure, axes) pair: unpack it and draw on that axes
# explicitly instead of relying on the implicit "current axes".
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label");

#### SVM

In [None]:
from sklearn.svm import SVC

k = 3
model_svc = SVC(kernel="rbf", C=1000, gamma="scale")

# k-fold cross-validation on the training split. These are scores on held-out
# folds, not training accuracy, so they are labelled accordingly.
# cross_val_score comes from the notebook's import cell.
cv_score = cross_val_score(model_svc, X_train, y_train, cv=k)
print('Cross_val Scores: ', cv_score)
print("Cross-validation Accuracy(average):", cv_score.mean())

# Test accuracy: refit on the whole training split, score on the test split.
clf_svc = model_svc.fit(X_train, y_train)
y_pred = clf_svc.predict(X_test)
score_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", score_accuracy)
scoring.append(['SVC', score_accuracy])

In [None]:
# Per-class precision / recall / F1 for the current y_pred.
# NOTE(review): y_pred is a shared global overwritten by each model cell; run this
# right after the SVM cell so it reports that model.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the current y_pred.
# NOTE(review): y_pred is a shared global overwritten by each model cell; run this
# right after the SVM cell so it shows that model.
cm = confusion_matrix(y_test, y_pred)
# plt.subplots returns a (figure, axes) pair: unpack it and draw on that axes
# explicitly instead of relying on the implicit "current axes".
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label");

#### Bagging

In [None]:
# Bagging
from sklearn.ensemble import BaggingClassifier
k = 5

# Bagging draws random bootstrap samples; a fixed seed makes the reported scores
# reproducible, in line with the other seeded models in this notebook.
model_Bag = BaggingClassifier(random_state=1)

# k-fold cross-validation on the training split. These are scores on held-out
# folds, not training accuracy, so they are labelled accordingly.
cv_score = cross_val_score(model_Bag, X_train, y_train, cv=k)
print('Cross_val Scores: ', cv_score)
print("Cross-validation Accuracy(average):", cv_score.mean())

# Test accuracy: refit on the whole training split, score on the test split.
clf_bag = model_Bag.fit(X_train, y_train)
y_pred = clf_bag.predict(X_test)
score_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", score_accuracy)
scoring.append(['Bagging', score_accuracy])

In [None]:
# Per-class precision / recall / F1 for the current y_pred.
# NOTE(review): y_pred is a shared global overwritten by each model cell; run this
# right after the Bagging cell so it reports that model.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the current y_pred.
# NOTE(review): y_pred is a shared global overwritten by each model cell; run this
# right after the Bagging cell so it shows that model.
cm = confusion_matrix(y_test, y_pred)
# plt.subplots returns a (figure, axes) pair: unpack it and draw on that axes
# explicitly instead of relying on the implicit "current axes".
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label");

### Compare Model Results

In [None]:
from prettytable import PrettyTable
# Render `scoring` as a text table: its first entry supplies the column names,
# every later entry becomes one row.
table = PrettyTable()
table.field_names = scoring[0]

for result_row in scoring[1:]:
    table.add_row(result_row)
print(table)

In [None]:
# draw chart to compare algorithms

# scoring[0] is the header row. Build the frame from the result rows only, so
# the accuracy column is numeric; loading the header string as data and dropping
# it afterwards leaves the column as object dtype.
toChart = pd.DataFrame(scoring[1:], columns=['algorithm', 'accuracy'])

px.bar(toChart, x="algorithm", y="accuracy")