In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Load the processed-lyrics dataset and print a few row counts as a sanity check.
csv = pd.read_csv('jugaad(processed lyrics).csv')
# read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is needed.
df = csv
# The vectorizing and splitting cells read the frame under the name `testCSV`,
# which no visible cell defines; bind it here so a fresh-kernel
# "Restart & Run All" does not fail with a NameError.
# NOTE(review): assumes `testCSV` was this same file -- the 703-row matrix
# reported by the bag-of-words cell equals the 351 + 352 rows counted here.
testCSV = df

print((df["Hit"] == 1).sum())                # rows with Hit == 1
print((df["Hit"] == 0).sum())                # rows with Hit == 0
print((df['Artist Popularity'] > 75).sum())  # rows with Artist Popularity above 75

351
352
385


# Bag of Words

In [3]:
# Turn every lyric into a vector of raw word counts over a capped vocabulary.
# NOTE(review): the vocabulary is fitted on all rows, including those that the
# split cell later assigns to the test set -- confirm this is acceptable.
lyrics_text = testCSV['Lyrics'].values.astype(str)  # cast every entry to str before vectorizing

bow = CountVectorizer(
    analyzer="word",
    ngram_range=(1, 1),   # unigrams only
    lowercase=True,
    max_features=1000,    # cap on vocabulary size
)
bow.fit(lyrics_text)
print(len(bow.vocabulary_))

lyrics_bow = bow.transform(lyrics_text)
print('Shape of Sparse Matrix: ', lyrics_bow.shape)

1000
Shape of Sparse Matrix:  (703, 1000)
<class 'scipy.sparse.csr.csr_matrix'>


# TF-IDF Vectorization

In [4]:
# Re-weight the bag-of-words counts with TF-IDF.
# NOTE(review): the split cell below passes `lyrics_bow`, not `lyrics_tfidf`,
# to train_test_split, so this matrix is not used by any cell shown here.
tfidf_transformer = TfidfTransformer()
lyrics_tfidf = tfidf_transformer.fit_transform(lyrics_bow)

# Splitting

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    lyrics_bow,        # features: raw word counts
    testCSV['Hit'],    # target column
    test_size=0.2,
    random_state=938,
)

# Naive Bayes Classifier

In [6]:
# Fit a multinomial Naive Bayes model on the training counts and score it on the test split.
modelNBC = MultinomialNB()
modelNBC.fit(X_train, Y_train)
predictionsNBC = modelNBC.predict(X_test)

cnf = confusion_matrix(y_true=Y_test, y_pred=predictionsNBC)
# Accuracy = diagonal (correct predictions) over all four cells of the 2x2 matrix.
n_correct = cnf[0][0] + cnf[1][1]
n_total = n_correct + cnf[0][1] + cnf[1][0]
print('Accuracy: ', n_correct / n_total)
print(cnf)

Accuracy:  0.6595744680851063
[[35 34]
 [14 58]]


In [7]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
print(classification_report(y_true=Y_test, y_pred=predictionsNBC))

             precision    recall  f1-score   support

          0       0.71      0.51      0.59        69
          1       0.63      0.81      0.71        72

avg / total       0.67      0.66      0.65       141



# Logistic Regression

In [8]:
# Fit a logistic-regression model with default settings and score it on the test split.
modelLog = LogisticRegression()
modelLog.fit(X_train, Y_train)
predictionsLog = modelLog.predict(X_test)

cnf = confusion_matrix(y_true=Y_test, y_pred=predictionsLog)
# Accuracy = diagonal (correct predictions) over all four cells of the 2x2 matrix.
n_correct = cnf[0][0] + cnf[1][1]
n_total = n_correct + cnf[0][1] + cnf[1][0]
print('Accuracy: ', n_correct / n_total)
print(cnf)

Accuracy:  0.6879432624113475
[[41 28]
 [16 56]]


In [9]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
print(classification_report(y_true=Y_test, y_pred=predictionsLog))

             precision    recall  f1-score   support

          0       0.72      0.59      0.65        69
          1       0.67      0.78      0.72        72

avg / total       0.69      0.69      0.69       141



# Support Vector Classifier

In [10]:
# Fit a support-vector classifier with default settings and score it on the test split.
modelSVC = SVC()
modelSVC.fit(X_train, Y_train)
predictionsSVC = modelSVC.predict(X_test)

cnf = confusion_matrix(y_true=Y_test, y_pred=predictionsSVC)
# Accuracy = diagonal (correct predictions) over all four cells of the 2x2 matrix.
n_correct = cnf[0][0] + cnf[1][1]
n_total = n_correct + cnf[0][1] + cnf[1][0]
print('Accuracy: ', n_correct / n_total)
print(cnf)

Accuracy:  0.5531914893617021
[[58 11]
 [52 20]]


In [11]:
# Per-class precision / recall / F1 for the support-vector classifier predictions.
print(classification_report(y_true=Y_test, y_pred=predictionsSVC))

             precision    recall  f1-score   support

          0       0.53      0.84      0.65        69
          1       0.65      0.28      0.39        72

avg / total       0.59      0.55      0.52       141



# Random Forest Classifier

In [12]:
# Fit a random forest on the training counts and score it on the test split.
# The forest is randomised, so it is seeded here (same seed value as the
# train/test split cell); without a seed the accuracy and confusion matrix
# printed below change on every run and cannot be reproduced.
modelRFC = RandomForestClassifier(random_state=938)
modelRFC.fit(X_train, Y_train)
predictionsRFC = modelRFC.predict(X_test)

cnf = confusion_matrix(Y_test, predictionsRFC)
# Accuracy = diagonal (correct predictions) over all four cells of the 2x2 matrix.
print('Accuracy: ', (cnf[0][0]+cnf[1][1])/(cnf[0][0]+cnf[0][1]+cnf[1][0]+cnf[1][1]))
print(cnf)

Accuracy:  0.6453900709219859
[[50 19]
 [31 41]]


In [13]:
# Per-class precision / recall / F1 for the random-forest predictions.
print(classification_report(y_true=Y_test, y_pred=predictionsRFC))

             precision    recall  f1-score   support

          0       0.62      0.72      0.67        69
          1       0.68      0.57      0.62        72

avg / total       0.65      0.65      0.64       141



In [14]:
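# TODO: unfinished ensemble experiment, kept for reference. It cannot run as
# written: `model` (labelled 'AB') is not defined in any cell shown here.
# est_Ensemble = VotingClassifier(estimators=[('AB', model), ('Log', modelLog)],
#                         weights=[1, 1])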