In [1]:
import argparse
import pickle

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import svm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.metrics import classification_report

In [2]:
tdifd_new_X = pickle.load(open('vector_lyrics_1.pkl', 'rb'))

In [3]:
tdifd_new_X_test = pickle.load(open('vector_lyrics_test_1.pkl', 'rb'))

In [4]:
tdifd_X = pickle.load(open('vector_lyrics_2.pkl', 'rb'))

In [5]:
tdifd_X_test = pickle.load(open('vector_lyrics_test_2.pkl', 'rb'))

In [6]:
y_train = pickle.load(open('y_train.pkl', 'rb'))

In [7]:
y_test = pickle.load(open('y_test.pkl', 'rb'))

#### Apply different models and meta estimator classifier on the data:

1. Logistic Regression.
2. Random Forest Classifier
3. Naive Bayes. 

<b> Logistic Regression </b>

In [8]:
lr_model = LogisticRegression(C=0.1)
#lr_model = LogisticRegression()#this gave 0.9333333
lr_model.fit(tdifd_new_X, y_train)

LogisticRegression(C=0.1)

In [9]:
print("Train score with LogReg:",lr_model.score(tdifd_new_X,y_train))

Train score with LogReg: 0.88


In [10]:
print("Test score with LogReg:",lr_model.score(tdifd_new_X_test,y_test))

Test score with LogReg: 0.9


#### Observation.

The previous linear model -without C=0.1- would gave as a test score 0.84. Eventhought 
0.9 seems a high score, the previous one had a big difference between train and test score , so probably is prone to
be high biased.

In [11]:
lr_model.predict_proba(tdifd_new_X_test)

array([[0.0064763 , 0.3867877 , 0.00643822, 0.60029778],
       [0.0067253 , 0.54818175, 0.00682005, 0.43827289],
       [0.0068627 , 0.49829364, 0.00682489, 0.48801878],
       [0.00652318, 0.39506276, 0.00654947, 0.59186459],
       [0.00662121, 0.60022685, 0.0065603 , 0.38659164],
       [0.00673759, 0.52240611, 0.00667314, 0.46418316],
       [0.00669878, 0.45408743, 0.00692364, 0.53229015],
       [0.00681862, 0.56338892, 0.00663937, 0.42315309],
       [0.00700043, 0.48167353, 0.00683709, 0.50448895],
       [0.00662713, 0.53951421, 0.0066663 , 0.44719237],
       [0.00700043, 0.48167353, 0.00683709, 0.50448895],
       [0.00678157, 0.53175765, 0.00686441, 0.45459637],
       [0.00682165, 0.54496921, 0.00674759, 0.44146155],
       [0.0065163 , 0.60254924, 0.00652916, 0.3844053 ],
       [0.00700043, 0.48167353, 0.00683709, 0.50448895],
       [0.00645595, 0.6153072 , 0.00643213, 0.37180471],
       [0.00662672, 0.46767073, 0.0065986 , 0.51910395],
       [0.00643954, 0.63417455,

#### Logistic Regression with the other data: X_train data.

In [12]:
lr_model = LogisticRegression(C=0.1)
lr_model.fit(tdifd_X, y_train)

LogisticRegression(C=0.1)

In [13]:
print("Train score with LogReg:",lr_model.score(tdifd_X,y_train))

Train score with LogReg: 0.96


In [14]:
print("Test score with LogReg:",lr_model.score(tdifd_X_test,y_test))

Test score with LogReg: 0.72


#### Observation.
Big difference between train and test score.

#### Perform cross validation to see which performs better.

In [15]:
cvs_X_new = cross_val_score(lr_model, tdifd_new_X, y_train)
cvs_X_new



array([0.8       , 0.76666667, 0.76666667, 0.86666667, 0.7       ])

In [16]:
cvs_X = cross_val_score(lr_model, tdifd_X, y_train)
cvs_X



array([0.83333333, 0.83333333, 0.83333333, 0.83333333, 0.86666667])

#### Observation.

The model with the X_train data seems to perform better, thereforE i will continue with 'tdifd_new_X' and 'tdifd_new_X_test' for the next models.

<b> Random Forest Classifier.</b>

In [17]:
clf = RandomForestClassifier()
clf.fit(tdifd_X, y_train)

RandomForestClassifier()

In [18]:
param_grid = {
    'max_depth': [80, 90, 100, 110],'max_features': [2, 3],'n_estimators' : [100,200]
}
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 2, verbose = True)
grid_search.fit(tdifd_X, y_train)
grid_search.best_params_, grid_search.best_score_

Fitting 2 folds for each of 16 candidates, totalling 32 fits




({'max_depth': 80, 'max_features': 3, 'n_estimators': 200}, 0.86)

In [28]:
clf = RandomForestClassifier(max_depth = 80, max_features = 3, n_estimators = 200)

In [29]:
clf.fit(tdifd_X, y_train)

RandomForestClassifier(max_depth=80, max_features=3, n_estimators=200)

In [30]:
print("Train score with Random Forest Classifier.:",clf.score(tdifd_X,y_train))

Train score with Random Forest Cl.: 1.0


In [31]:
print("Test score with Random Forest Classifier.:",clf.score(tdifd_X_test,y_test))

Test score with Random Forest Cl.: 0.9


In [66]:
clf.feature_importances_

array([0., 0., 0., ..., 0., 0., 0.])

<b> Naive Bayes </b>

In [38]:
nb = MultinomialNB()
parameters = {'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)}  
grid_nb_search= GridSearchCV(nb, parameters)
grid_nb_search.fit(tdifd_X, y_train)
grid_nb_search.best_params_, grid_search.best_score_



({'alpha': 0.1}, 0.86)

In [40]:
nb = MultinomialNB(alpha=0.1)
nb.fit(tdifd_X, y_train)

MultinomialNB(alpha=0.1)

In [41]:
print("Train score with Multinomial NB:",nb.score(tdifd_X, y_train))

Train score with Multinomial NB: 0.9866666666666667


In [42]:
print("Test score with Random Forest Classifier.:",nb.score(tdifd_X_test,y_test))

Test score with Random Forest Classifier.: 0.82


In [75]:
ypred = lr_model.predict(tdifd_X_test)
acc = round(accuracy_score(ypred,y_test), 3)
acc

0.72

In [86]:
tf_2 = pickle.load(open('tf_2.pkl', 'rb'))

<b> Command line interface program with the module args parse. </b>

In [None]:
parser = argparse.ArgumentParser(description='find the artist')
parser.add_argument('-l', '--lyrics', help='the lyrics to identify the artist', type=str, 
                    default=None, required=True)
parser.add_argument'-u','--url', help='URL of lyrics site',action='store', dest='url',default=None, required=True)
parser.add_argument('-v', '--verbose', action='count', default=0)
args = parser.parse_args()

lyrics = args.lyrics

In [143]:
def call_script_in_cli(lyrics)->str:
    #lyrics = args.lyrics
    
#convert the string taken from the command line to a list, so as to be accepted from the
#transform function
    lyric = [lyrics]
    string=tf_2.transform(lyric)
    artist = lr_model.predict(string)
    print(artist[0])
    
    if artist[0] == 1:
        print('Queen')
    elif artist.any() == 0:
        print('Pixies')
    else:
        print('Try again')

In [144]:
call_script_in_cli('oh oh you don t fool me you don t fool me you')

1  
Try again


#### Test if it works!

In [146]:
lyric = ['oh oh you don t fool me you don t fool me you']
string=tf_2.transform(lyric)
artist = lr_model.predict(string)
artist

array(['1  '], dtype=object)