In [69]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [70]:
# Load the pre-processed lyrics dataset (path is relative to the working directory).
testCSV = pd.read_csv('Dataset(Analysis)(processed lyrics).csv')
# `df` is a second DataFrame handle built from the loaded frame; later cells
# read `testCSV`, so both names are kept.
df = pd.DataFrame(testCSV)

# Target balance: number of rows with Hit == 1, then with Hit == 0.
print((df["Hit"] == 1).sum())
print((df["Hit"] == 0).sum())
# Number of rows whose 'Artist Popularity' is above 75.
print((df['Artist Popularity'] > 75).sum())

351
352
385


# Bag of Words

In [71]:
# Bag-of-words vocabulary over the lyrics: lowercased word-level unigrams and
# bigrams, limited to 1000 features.
bow = CountVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    lowercase=True,
    max_features=1000,
)
# astype(str) coerces every entry to text, so a missing lyric would be
# vectorized as the literal string "nan" -- TODO confirm the column has no nulls.
bow.fit(testCSV['Lyrics'].values.astype(str))
print(len(bow.vocabulary_))

# Encode every row of the dataset as n-gram counts using the fitted vocabulary.
lyrics_bow = bow.transform(testCSV['Lyrics'].values.astype(str))
print('Shape of Sparse Matrix: ', lyrics_bow.shape)

1000
Shape of Sparse Matrix:  (703, 1000)


# TF-IDF Vectorization

In [72]:
# Fit the TF-IDF transformer on the count matrix and re-weight that same matrix.
# NOTE(review): the train/test split below is taken from lyrics_bow, so
# lyrics_tfidf is not consumed by any cell visible here -- confirm intended.
tfidf_transformer = TfidfTransformer()
lyrics_tfidf = tfidf_transformer.fit_transform(lyrics_bow)

# Train/Test Split

In [73]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
# NOTE(review): the features are the raw counts (lyrics_bow), not lyrics_tfidf,
# and the vectorizer was fitted on all rows before this split -- confirm intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    lyrics_bow,
    testCSV['Hit'],
    test_size=0.2,
    random_state=938,
)

# Naive Bayes Classifier

In [74]:
# Multinomial Naive Bayes: fit on the training split, predict the hold-out split.
modelNBC = MultinomialNB().fit(X_train, Y_train)
predictionsNBC = modelNBC.predict(X_test)

cnf = confusion_matrix(Y_test, predictionsNBC)
# Accuracy = correct predictions (matrix diagonal) / all test samples.
# For the 2x2 matrix of the binary Hit label this is the same four-cell ratio.
print('Accuracy: ', cnf.trace() / cnf.sum())
print(cnf)

Accuracy:  0.6950354609929078
[[34 35]
 [ 8 64]]


In [75]:
# Per-class precision, recall, F1 and support for the Naive Bayes predictions.
print(classification_report(y_true=Y_test, y_pred=predictionsNBC))

             precision    recall  f1-score   support

          0       0.81      0.49      0.61        69
          1       0.65      0.89      0.75        72

avg / total       0.73      0.70      0.68       141



# Logistic Regression

In [76]:
# Logistic regression with library-default settings: fit on the training split,
# predict the hold-out split.
modelLog = LogisticRegression().fit(X_train, Y_train)
predictionsLog = modelLog.predict(X_test)

cnf = confusion_matrix(Y_test, predictionsLog)
# Accuracy = correct predictions (matrix diagonal) / all test samples.
# For the 2x2 matrix of the binary Hit label this is the same four-cell ratio.
print('Accuracy: ', cnf.trace() / cnf.sum())
print(cnf)

Accuracy:  0.7021276595744681
[[43 26]
 [16 56]]


In [77]:
# Per-class precision, recall, F1 and support for the logistic-regression predictions.
print(classification_report(y_true=Y_test, y_pred=predictionsLog))

             precision    recall  f1-score   support

          0       0.73      0.62      0.67        69
          1       0.68      0.78      0.73        72

avg / total       0.71      0.70      0.70       141



# Support Vector Classifier

In [78]:
# Support vector classifier with library-default settings: fit on the training
# split, predict the hold-out split.
modelSVC = SVC().fit(X_train, Y_train)
predictionsSVC = modelSVC.predict(X_test)

cnf = confusion_matrix(Y_test, predictionsSVC)
# Accuracy = correct predictions (matrix diagonal) / all test samples.
# For the 2x2 matrix of the binary Hit label this is the same four-cell ratio.
print('Accuracy: ', cnf.trace() / cnf.sum())
print(cnf)

Accuracy:  0.5531914893617021
[[57 12]
 [51 21]]


In [79]:
# Per-class precision, recall, F1 and support for the SVC predictions.
print(classification_report(y_true=Y_test, y_pred=predictionsSVC))

             precision    recall  f1-score   support

          0       0.53      0.83      0.64        69
          1       0.64      0.29      0.40        72

avg / total       0.58      0.55      0.52       141



# Random Forest Classifier

In [80]:
# Random forest of 200 trees; random_state is fixed so the fitted forest is repeatable.
modelRFC = RandomForestClassifier(
    n_estimators=200,
    random_state=200,
).fit(X_train, Y_train)
predictionsRFC = modelRFC.predict(X_test)

cnf = confusion_matrix(Y_test, predictionsRFC)
# Accuracy = correct predictions (matrix diagonal) / all test samples.
# For the 2x2 matrix of the binary Hit label this is the same four-cell ratio.
print('Accuracy: ', cnf.trace() / cnf.sum())
print(cnf)

Accuracy:  0.6666666666666666
[[35 34]
 [13 59]]


In [81]:
# Per-class precision, recall, F1 and support for the random-forest predictions.
print(classification_report(y_true=Y_test, y_pred=predictionsRFC))

             precision    recall  f1-score   support

          0       0.73      0.51      0.60        69
          1       0.63      0.82      0.72        72

avg / total       0.68      0.67      0.66       141



In [14]:
# NOTE(review): disabled voting-ensemble experiment, kept for reference only.
# `model` (labelled 'AB') is not defined in any cell visible here, so this
# would need that estimator restored before it can be uncommented.
# est_Ensemble = VotingClassifier(estimators=[('AB', model), ('Log', modelLog)],
#                         weights=[1, 1])