In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from MultiTrain import MultiClassifier
import nltk
import pickle

In [2]:
df = pd.read_csv('train_music.csv')

In [3]:
df

Unnamed: 0,lyric,class
0,Can't drink without thinkin' about you,1
1,Now Lil Pump flyin' private jet (Yuh),0
2,"No, matter fact, you ain't help me when I had ...",0
3,"And you could find me, I ain't hidin'",0
4,From the way you talk to the way you move,1
...,...,...
51049,"I told her pour me some more, then she went ri...",0
51050,Hit the ground and crawl to the dresser,0
51051,Just keep breathin' and breathin' and breathin...,1
51052,"Down go the system, long live the king (King)",0


In [4]:
lem = WordNetLemmatizer()

df['new_lyrics'] = df['lyric'].apply(lambda x: ' '.join(lem.lemmatize(word) for word in x.split()))

In [5]:
df

Unnamed: 0,lyric,class,new_lyrics
0,Can't drink without thinkin' about you,1,Can't drink without thinkin' about you
1,Now Lil Pump flyin' private jet (Yuh),0,Now Lil Pump flyin' private jet (Yuh)
2,"No, matter fact, you ain't help me when I had ...",0,"No, matter fact, you ain't help me when I had ..."
3,"And you could find me, I ain't hidin'",0,"And you could find me, I ain't hidin'"
4,From the way you talk to the way you move,1,From the way you talk to the way you move
...,...,...,...
51049,"I told her pour me some more, then she went ri...",0,"I told her pour me some more, then she went ri..."
51050,Hit the ground and crawl to the dresser,0,Hit the ground and crawl to the dresser
51051,Just keep breathin' and breathin' and breathin...,1,Just keep breathin' and breathin' and breathin...
51052,"Down go the system, long live the king (King)",0,"Down go the system, long live the king (King)"


In [6]:
from numpy import arange
train = MultiClassifier(random_state=42, verbose=True, cores=-1)
model = LogisticRegression(verbose=6, n_jobs=-1, C=0.5, solver='sag')

#params = {'solver': ['sag'],
#          'C': arange(0, 0.2, 0.1),
#          }
#model = train.tune_parameters(model=lr,
#                              parameters=params,
#                              tune='half-random',
#                              use_cpu=-1,
#                              verbose=4
#                              )

In [7]:
print(model)

LogisticRegression(C=0.5, n_jobs=-1, solver='sag', verbose=6)


In [8]:



X_train, X_test, y_train, y_test = train_test_split(df['new_lyrics'], df['class'], test_size=0.1, random_state=42)

In [9]:
count = CountVectorizer(ngram_range=(2, 5))

In [10]:
count.fit(X_train)

In [11]:
new = count.transform(X_train)

In [12]:
model.fit(new, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


max_iter reached after 7 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    7.0s finished


In [13]:
test = count.transform(X_test)
pred = model.predict(test)

In [14]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, f1_score, r2_score

print("Accuracy: ", accuracy_score(y_test, pred))
print("ROC AUC:", roc_auc_score(y_test, pred))
print("f1:", f1_score(y_test, pred))
print("r2:", r2_score(y_test, pred))



Accuracy:  0.8644731688209949
ROC AUC: 0.8559292664227435
f1: 0.835707502374169
r2: 0.4494038009636422


In [15]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.85      0.92      0.88      2870
           1       0.89      0.79      0.84      2236

    accuracy                           0.86      5106
   macro avg       0.87      0.86      0.86      5106
weighted avg       0.87      0.86      0.86      5106



In [16]:
print(confusion_matrix(y_test, pred))

[[2654  216]
 [ 476 1760]]


In [19]:
import lzma

with lzma.open('vector.xz', 'wb') as f:
    pickle.dump(count, f)
    
with lzma.open('logreg.xz', 'wb') as g:
    pickle.dump(model, g)