In [1]:
import numpy as np
import pandas as pd

In [2]:
DATA = pd.read_csv('clean_data/even_more_tracks.csv', skiprows=0, header=0)
DATA = pd.DataFrame(DATA).drop(columns=['id'])
column_names = list(DATA.columns[:-1])
DATA = np.array(DATA)


In [3]:
X = DATA [:,:-1]
y = DATA[:,-1]

## Logistic Regression Model

In [4]:
# Split data into 80% training and 20% testing
from sklearn.model_selection import train_test_split
np.random.seed(20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)

In [46]:
from sklearn.linear_model import LogisticRegression

regrClf =  LogisticRegression(random_state=0)
regrClf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
train_predictions = regrClf.predict(X_train)
print("Training Data Score:", regrClf.score(X_train, y_train))
test_predictions = regrClf.predict(X_test)
print("Testing Data Score:", regrClf.score(X_test, y_test))


Training Data Score: 0.8529067499024581
Testing Data Score: 0.85020479812756


In [48]:
feature_weights = {}
for i in range(len(column_names)):
    feat = column_names[i]
    feature_weights[feat] = regrClf.coef_[0][i]
max(regrClf.coef_[0])

0.09143079134265615

In [49]:
from sklearn.metrics import classification_report
print(classification_report(y_test, test_predictions))

              precision    recall  f1-score   support

         0.0       0.90      0.62      0.73       565
         1.0       0.84      0.96      0.90      1144

    accuracy                           0.85      1709
   macro avg       0.87      0.79      0.81      1709
weighted avg       0.86      0.85      0.84      1709



In [50]:
import operator
sorted_x = sorted(feature_weights.items(), key=operator.itemgetter(1))

In [51]:
sorted_x

[('year', -0.0030959087732987185),
 ('mode', -0.0006078006129145711),
 ('acousticness', -0.0005666278704953948),
 ('instrumentalness', -0.0002523318338487842),
 ('track_popularity', -1.2725493945578784e-05),
 ('duration_ms', -1.7586345422580072e-06),
 ('key', 1.3839557384331474e-05),
 ('liveness', 3.0234927096803396e-05),
 ('valence', 0.0001715207398166119),
 ('time_signature', 0.0002465774090625765),
 ('speechiness', 0.0002768305849364607),
 ('energy', 0.0003317572388672221),
 ('danceability', 0.0003754026494655891),
 ('tempo', 0.0029348771423836563),
 ('loudness', 0.00621825198497792),
 ('artist_popularity', 0.09143079134265615)]

## Decision Trees

In [52]:
from sklearn.tree import DecisionTreeClassifier

treeClf = DecisionTreeClassifier(random_state=0)
treeClf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [53]:
print("Decision Tree Scores:", treeClf.score(X_test, y_test))

Decision Tree Scores: 0.8466939730836747


## Support Vector Machine 

In [None]:
import sklearn.svm as svm

C = [0.001, 0.01, 1.0, 10.0, 100.0, 1000.0]
G = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0 ]


for c in C:
    for g in G:
        clf = svm.SVC(kernel='linear', gamma=g, C=c)
        clf.fit(X_train, y_train)
        print('c:', c, 'g:', g, clf.score(X_test, y_test))

c: 0.001 g: 0.001 0.8970701619121049
c: 0.001 g: 0.01 0.8970701619121049
c: 0.001 g: 0.1 0.8970701619121049
c: 0.001 g: 1.0 0.8970701619121049
c: 0.001 g: 10.0 0.8970701619121049
c: 0.001 g: 100.0 0.8970701619121049
c: 0.001 g: 1000.0 0.8970701619121049
c: 0.01 g: 0.001 0.8959136468774094
c: 0.01 g: 0.01 0.8959136468774094
c: 0.01 g: 0.1 0.8959136468774094
c: 0.01 g: 1.0 0.8959136468774094
c: 0.01 g: 10.0 0.8959136468774094
c: 0.01 g: 100.0 0.8959136468774094
c: 0.01 g: 1000.0 0.8959136468774094
c: 1.0 g: 0.001 0.8978411719352352
c: 1.0 g: 0.01 0.8978411719352352
c: 1.0 g: 0.1 0.8978411719352352
c: 1.0 g: 1.0 0.8978411719352352
c: 1.0 g: 10.0 0.8978411719352352
c: 1.0 g: 100.0 0.8978411719352352
c: 1.0 g: 1000.0 0.8978411719352352
c: 10.0 g: 0.001 0.8943716268311488
c: 10.0 g: 0.01 0.8943716268311488
c: 10.0 g: 0.1 0.8943716268311488
c: 10.0 g: 1.0 0.8943716268311488
c: 10.0 g: 10.0 0.8943716268311488
c: 10.0 g: 100.0 0.8943716268311488
c: 10.0 g: 1000.0 0.8943716268311488


## Neural Networks

In [None]:
from sklearn.neural_network import MLPClassifier

