In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Download the four per-year "*data_cleaned.csv" files and bind one DataFrame
# per year, in chronological order.
data1990, data2000, data2010, data2019 = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://hyd123.s3.us-east-2.amazonaws.com/updated_data/1990data_cleaned.csv",
        "https://hyd123.s3.us-east-2.amazonaws.com/updated_data/2000data_cleaned.csv",
        "https://hyd123.s3.us-east-2.amazonaws.com/updated_data/2010data_cleaned.csv",
        "https://hyd123.s3.us-east-2.amazonaws.com/updated_data/2019data_cleaned.csv",
    )
)

In [None]:
# Display the column labels of the 1990 frame.
data1990.columns

In [3]:
# Tag every frame with an ordinal code for its year:
# 0 -> 1990, 1 -> 2000, 2 -> 2010, 3 -> 2019.
for year_code, year_frame in enumerate((data1990, data2000, data2010, data2019)):
    year_frame['year'] = year_code
data1990.head()

Unnamed: 0.1,Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence,year
0,0,0.736,https://api.spotify.com/v1/audio-analysis/1NCu...,0.78,322133,0.565,1NCuYqMc8hKMb4cpNTcJbD,0.00269,3,0.063,-7.32,0,0.0306,101.967,4,https://api.spotify.com/v1/tracks/1NCuYqMc8hKM...,audio_features,spotify:track:1NCuYqMc8hKMb4cpNTcJbD,0.365,0
1,1,0.0561,https://api.spotify.com/v1/audio-analysis/51q3...,0.631,330933,0.935,51q3nDYWAqPY4hlgKEbP6j,0.152,1,0.634,-8.34,0,0.0587,109.122,4,https://api.spotify.com/v1/tracks/51q3nDYWAqPY...,audio_features,spotify:track:51q3nDYWAqPY4hlgKEbP6j,0.678,0
2,2,0.657,https://api.spotify.com/v1/audio-analysis/5z3Z...,0.401,211973,0.285,5z3ZDMP02xF33yCvPFnct3,1.1e-05,11,0.158,-11.005,1,0.0327,130.465,4,https://api.spotify.com/v1/tracks/5z3ZDMP02xF3...,audio_features,spotify:track:5z3ZDMP02xF33yCvPFnct3,0.267,0
3,3,0.0287,https://api.spotify.com/v1/audio-analysis/0UAE...,0.862,215040,0.918,0UAEHlFR79k9CJvknSGUNf,1e-06,10,0.0492,-7.325,0,0.108,124.811,4,https://api.spotify.com/v1/tracks/0UAEHlFR79k9...,audio_features,spotify:track:0UAEHlFR79k9CJvknSGUNf,0.651,0
4,4,0.334,https://api.spotify.com/v1/audio-analysis/1s12...,0.713,274093,0.432,1s12gYsPeTNQ6znow46nmp,0.000299,11,0.0674,-12.639,0,0.03,118.955,4,https://api.spotify.com/v1/tracks/1s12gYsPeTNQ...,audio_features,spotify:track:1s12gYsPeTNQ6znow46nmp,0.445,0


In [4]:
# Stack the four per-year frames into a single frame with a fresh 0..n-1 index.
yearly_frames = [data1990, data2000, data2010, data2019]
df = pd.concat(yearly_frames, ignore_index=True, sort=False)

In [5]:
# Keep the year label plus the audio-feature columns, then discard every row
# that has a missing value in any of them.
model_columns = [
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
]
df = df.loc[:, model_columns].dropna(how='any')
df.tail()

Unnamed: 0,year,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
1564,3,0.152,0.754,242573,0.646,1.8e-05,7,0.108,-5.795,1,0.317,176.089,4,0.429
1565,3,0.427,0.679,179973,0.443,0.0,10,0.106,-8.528,0,0.153,104.049,4,0.363
1566,3,0.0794,0.816,136366,0.582,6e-06,8,0.0916,-4.141,1,0.341,160.004,4,0.542
1567,3,0.363,0.63,174333,0.671,0.0,6,0.113,-6.659,0,0.256,89.921,4,0.333
1568,3,0.582,0.819,268893,0.458,0.173,4,0.128,-10.223,0,0.167,122.981,4,0.232


In [6]:
# Feature matrix X (the 13 audio-feature columns) and target vector y (year codes).
predictor_columns = [
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
]
X = df[predictor_columns]
y = df["year"].values
print("Shape: ", X.shape, y.shape)

Shape:  (1569, 13) (1569,)


In [7]:
from sklearn.model_selection import train_test_split

# Split features and labels into train/test parts using the function's default
# sizing; the fixed random_state keeps the split identical between runs.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y, random_state=42)

In [8]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the test rows.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Decision Tree Model

In [None]:
from sklearn import tree

# Fix random_state so the fitted tree (and therefore its score, feature
# importances and exported graph) is the same on every Restart & Run All.
clf = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Train the tree on the scaled training split; the cell output is its
# accuracy on the scaled test split.
clf.fit(X_train_scaled, y_train)
clf.score(X_test_scaled, y_test)


In [None]:
# Predicted year codes for the scaled test split, from the fitted decision tree.
predictions = clf.predict(X_test_scaled)

In [None]:
# Side-by-side view of the model output and the ground truth.
# "Predicted" must hold the classifier output and "Actual" the true labels
# (y_test); the two were previously assigned the wrong way round.
test_result_df = pd.DataFrame({"Predicted": predictions, "Actual": y_test})[["Predicted", "Actual"]]
test_result_df.head()

In [None]:
# Pair each feature name with the tree's importance for it and list the pairs
# from most to least important.
feature_names = X.columns
importance_pairs = zip(clf.feature_importances_, feature_names)
sorted(importance_pairs, reverse=True)

In [None]:
import graphviz
import pydotplus  # NOTE(review): imported but not used in this cell

# Render the fitted decision tree as a Graphviz graph.
# feature_names is a pandas Index (X.columns); it is converted to a plain list
# because some scikit-learn releases validate this parameter strictly and
# reject non-list inputs.
# class_names are listed in the order of the year codes 0, 1, 2, 3.
dot_data = tree.export_graphviz(clf, out_file=None,
                     feature_names=list(feature_names),
                     class_names=["1990","2000","2010","2019"],
                     filled=True, rounded=True,
                     special_characters=True)
graph = graphviz.Source(dot_data)
graph


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 200-tree forest; random_state is fixed so the fitted forest and the
# importances below are reproducible between runs.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train_scaled, y_train)

# The test accuracy is reported by the following cell. The score call that
# used to sit here was never displayed (it was not the cell's last expression).
feature_names = X.columns
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

In [None]:
# Accuracy of the fitted random forest on the scaled test split.
rf.score(X_test_scaled, y_test)

### Neural Network

In [None]:
from numpy.random import seed

# Seed NumPy's global random generator before building the network.
# NOTE(review): only NumPy is seeded here; any separate generator used by the
# deep-learning backend is not — confirm whether that is intended.
RANDOM_SEED = 42
seed(RANDOM_SEED)

In [None]:
# Use the tensorflow.keras namespace, matching the Sequential/Dense imports
# elsewhere in this notebook, instead of the standalone keras package.
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer year codes (0-3). num_classes is passed explicitly
# so both arrays always have 4 columns — the width of the softmax output layer —
# even if a split happens to contain no sample of the highest code.
y_train_cate = to_categorical(y_train, num_classes=4)
y_test_cate = to_categorical(y_test, num_classes=4)

In [None]:
from tensorflow.keras.models import Sequential
# Start with an empty Sequential model; layers are appended in the next cell.
model_neural = Sequential()


In [None]:
from tensorflow.keras.layers import Dense

# Input width is taken from the scaled training matrix instead of a hard-coded
# 13, so the network stays in sync if the feature list changes.
number_inputs = X_train_scaled.shape[1]
number_hidden_nodes = 100

# NOTE: this cell appends to the existing model_neural; re-run the cell that
# creates the Sequential model first, or the layers below are stacked again.

# First hidden layer (declares the input size).
model_neural.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))

# Eleven further hidden layers of the same width.
for _ in range(11):
    model_neural.add(Dense(units=number_hidden_nodes, activation='relu'))

# Softmax output with one unit per year code.
number_classes = 4
model_neural.add(Dense(units=number_classes, activation='softmax'))
model_neural.summary()


In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss (the
# targets passed to fit are the one-hot arrays), and accuracy as the metric.
model_neural.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 20 epochs on the scaled training split, then measure loss and
# accuracy on the held-out split.
model_neural.fit(
    X_train_scaled,
    y_train_cate,
    epochs=20,
    shuffle=True,
    verbose=2,
)
evaluation = model_neural.evaluate(X_test_scaled, y_test_cate, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


### K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep odd neighbor counts 1, 3, ..., 19 and record the train/test accuracy
# of each fitted model for the plot in the next cell.
train_scores, test_scores = [], []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores += [train_score]
    test_scores += [test_score]
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")


In [None]:
# Plot train and test accuracy against k (same odd k values as the sweep).
k_values = range(1, 20, 2)
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors")
# Both curves are shown, so the axis is labelled generically and a legend
# tells them apart.
plt.ylabel("Accuracy score")
plt.legend()
# Save BEFORE show: with the inline backend plt.show() finalizes and closes the
# current figure, so saving afterwards writes an empty image.
plt.savefig("K_Neighbors.png")
plt.show()

In [None]:
# Refit the k-nearest-neighbors model with k fixed at 11 and report its
# accuracy on the scaled test split.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train_scaled, y_train)
knn_test_accuracy = knn.score(X_test_scaled, y_test)
print('k=11 Test Acc: %.3f' % knn_test_accuracy)


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the k=11 model: true labels first, predictions second.
knn_predictions = knn.predict(X_test_scaled)
cm = confusion_matrix(y_true=y_test, y_pred=knn_predictions)
cm


### Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the scaled training split; the cell output is
# its accuracy on the scaled test split.
gnb = GaussianNB().fit(X_train_scaled, y_train)
gnb.score(X_test_scaled, y_test)

In [None]:
# Confusion matrix of the naive Bayes model: true labels first, predictions second.
gnb_predictions = gnb.predict(X_test_scaled)
cm = confusion_matrix(y_true=y_test, y_pred=gnb_predictions)
cm


### Support Vector Machine

In [9]:
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Linear-kernel support vector classifier trained on the scaled training split.
model_svc = SVC(kernel='linear')
model_svc.fit(X_train_scaled, y_train)

# Per-class precision/recall/F1 on the test split; target_names follow the
# order of the year codes 0, 1, 2, 3.
predictions = model_svc.predict(X_test_scaled)
svm_report = classification_report(
    y_test,
    predictions,
    target_names=["1990", "2000","2010","2019"],
)
print(svm_report)


              precision    recall  f1-score   support

        1990       0.41      0.41      0.41       109
        2000       0.32      0.26      0.29        88
        2010       0.39      0.40      0.39       101
        2019       0.48      0.55      0.51        95

    accuracy                           0.41       393
   macro avg       0.40      0.40      0.40       393
weighted avg       0.40      0.41      0.40       393



In [10]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the linear SVM: true labels first, predictions second.
svm_predictions = model_svc.predict(X_test_scaled)
cm = confusion_matrix(y_true=y_test, y_pred=svm_predictions)
cm

array([[45, 22, 20, 22],
       [27, 23, 24, 14],
       [23, 18, 40, 20],
       [14, 10, 19, 52]], dtype=int64)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, scored on both splits.
classifier = LogisticRegression()
classifier.fit(X_train_scaled, y_train)

for split_name, split_X, split_y in (
    ("Training", X_train_scaled, y_train),
    ("Testing", X_test_scaled, y_test),
):
    print(f"{split_name} Data Score: {classifier.score(split_X, split_y)}")


In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Cross-validated logistic regression in multinomial mode, scored on both splits.
classifier = LogisticRegressionCV(multi_class="multinomial")
classifier.fit(X_train_scaled, y_train)

train_accuracy = classifier.score(X_train_scaled, y_train)
print(f"Training Data Score: {train_accuracy}")
test_accuracy = classifier.score(X_test_scaled, y_test)
print(f"Testing Data Score: {test_accuracy}")

In [11]:
from sklearn.linear_model import RidgeClassifierCV

# Ridge classifier with built-in cross-validation (default settings),
# scored on both splits.
classifier = RidgeClassifierCV().fit(X_train_scaled, y_train)

ridge_train_accuracy = classifier.score(X_train_scaled, y_train)
print(f"Training Data Score: {ridge_train_accuracy}")
ridge_test_accuracy = classifier.score(X_test_scaled, y_test)
print(f"Testing Data Score: {ridge_test_accuracy}")

Training Data Score: 0.4549319727891156
Testing Data Score: 0.40966921119592875


In [12]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current `classifier`: true labels first, predictions second.
classifier_predictions = classifier.predict(X_test_scaled)
cm = confusion_matrix(y_true=y_test, y_pred=classifier_predictions)
cm

array([[50, 19, 14, 26],
       [31, 17, 19, 21],
       [29, 16, 32, 24],
       [15,  6, 12, 62]], dtype=int64)