In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

from sklearn.model_selection import cross_validate, train_test_split
from skopt import BayesSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, plot_confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

some code adapted from GA lessons

In [2]:
np.random.seed(42)

# for reproducibility

## Data Acquisition

In [3]:
ladies = pd.read_csv('lyrics_and_artist.csv', encoding='latin-1')
ladies.head()

Unnamed: 0,lyrics,artist
0,"I've been drinkin', I've been drinkin'",beyonce
1,I get filthy when that liquor get into me,beyonce
2,"I've been thinkin', I've been thinkin'",beyonce
3,Why can't I keep my fingers off it?,beyonce
4,"Baby, I want you, now-now",beyonce


In [4]:
ladies['lyrics'].replace({'<|endoftext|>': None}, inplace=True)
ladies.dropna(inplace=True)

In [5]:
ladies.shape

(9488, 2)

In [6]:
ladies['artist'] = ladies['artist'].map({'beyonce': 0, 'rihanna':1})
ladies.head()
# conversion of beyonce/rihanna into binary labels

Unnamed: 0,lyrics,artist
0,"I've been drinkin', I've been drinkin'",0
1,I get filthy when that liquor get into me,0
2,"I've been thinkin', I've been thinkin'",0
3,Why can't I keep my fingers off it?,0
4,"Baby, I want you, now-now",0


In [7]:
ladies = ladies[['artist','lyrics']]
ladies.head()

Unnamed: 0,artist,lyrics
0,0,"I've been drinkin', I've been drinkin'"
1,0,I get filthy when that liquor get into me
2,0,"I've been thinkin', I've been thinkin'"
3,0,Why can't I keep my fingers off it?
4,0,"Baby, I want you, now-now"


In [8]:
blanks = []

for ind, artist, lyrics in ladies.itertuples():
    if lyrics.isspace():
        blanks.append(ind)

blanks

[]

## Baseline Accuracy

In [9]:
ladies['artist'].value_counts(normalize=True)
# baseline

0    0.520341
1    0.479659
Name: artist, dtype: float64

## Modeling Setup

In [10]:
X = ladies['lyrics']
y = ladies['artist']

--------------------------------------------------------------------------------------------------------------------

## Modeling

In [11]:
tvec = TfidfVectorizer()

In [12]:
X_tvec = tvec.fit_transform(X)

In [13]:
X_df = pd.DataFrame(X_tvec.toarray()) # makes non sparse
X_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4080,4081,4082,4083,4084,4085,4086,4087,4088,4089
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
models = {'LR': LogisticRegression(),
         'NB': MultinomialNB(),
         'SVC': SVC(),
         'RF': RandomForestClassifier()}

In [15]:
results_df = pd.DataFrame()

counter=0

for tag,estimator in models.items():
    cv_results = cross_validate(estimator,X_df,y,scoring=['accuracy','f1','roc_auc']) # have every single score from all folds. compare
    
    for score in ['accuracy','f1','roc_auc']:
        counter +=1
        mean_cv_score = np.mean(cv_results[f'test_{score}']) 
        std_cv_score = np.std(cv_results[f'test_{score}'])
        results_df.loc[counter, 'model'] = tag
        results_df.loc[counter, 'score'] = score
        results_df.loc[counter, 'mean'] = mean_cv_score
        results_df.loc[counter, 'std'] = std_cv_score

In [16]:
results_df

Unnamed: 0,model,score,mean,std
1,LR,accuracy,0.55111,0.033226
2,LR,f1,0.495092,0.038647
3,LR,roc_auc,0.571344,0.047325
4,NB,accuracy,0.548373,0.020824
5,NB,f1,0.526952,0.039228
6,NB,roc_auc,0.579078,0.020886
7,SVC,accuracy,0.566814,0.032782
8,SVC,f1,0.47836,0.021005
9,SVC,roc_auc,0.587891,0.035312
10,RF,accuracy,0.564285,0.031909


In [18]:
px.bar(results_df,y='mean',error_y='std',color='score',x='model',barmode='group')

### ROC Curve

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=.30)

In [31]:
nb = MultinomialNB()
rf = RandomForestClassifier()

In [33]:
nb.fit(X_train, y_train)
rf.fit(X_train, y_train)

RandomForestClassifier()

In [27]:
# true positive rate is recall
def calculate_roc_data(y_test_prob, y_test):
    fpr, precision, recall = [], [], []
    prob_thresholds = np.linspace(0.01,1, 2000)
    for i in prob_thresholds:
        # slcie out the prob
        yhat = y_test_prob[:,1] > i 
        correct = yhat.astype(int) == y_test.astype(int)
        y_test = y_test.astype(int)
        tp = correct & (y_test == 1)
        tn = correct & (y_test == 0)
        fp = ~correct & (y_test == 0)
        fn = ~correct & (y_test == 1)
        if sum(tp) ==0:
            precision.append(0)
            recall.append(0)
        else:
            precision.append(sum(tp)/(sum(tp)+sum(fp)))
            recall.append(sum(tp)/(sum(tp)+sum(fn)))

        if sum(fp) ==0:
            fpr.append(0)
        else:
            fpr.append(sum(fp)/(sum(fp)+sum(tn)))
            
    return fpr, recall, prob_thresholds, precision

In [34]:
list_roc_dfs = []
for model_name, model in {'RF': rf,'NB': nb}.items():
    y_test_prob = model.predict_proba(X_test)
    fpr, recall, prob_thresholds, precision = calculate_roc_data(y_test_prob, y_test)
    roc_df = pd.DataFrame(data ={'threshold':prob_thresholds,
                                 'true_positive_rate':recall, 
                                 'false_positive_rate':fpr, 
                                 'precision': precision, 
                                 'model':model_name})
    print(model_name, roc_df.shape)
    list_roc_dfs.append(roc_df)
    # todo print AUC_ROC
    
master_roc_df = pd.concat(list_roc_dfs)

px.scatter(master_roc_df, 
           x='true_positive_rate', 
           y='precision', 
           range_x=(0,1), 
           range_y=(0,1.05), 
           facet_col='model', 
           color='threshold')

RF (2000, 5)
NB (2000, 5)


#### Random Forest optimized

In [42]:
rf_opt = BayesSearchCV(RandomForestClassifier(), # what object are we optimizing?
                        {'n_estimators': Integer(100,1000), 
                        'min_samples_leaf':Integer(1,10)}, 
                        cv=5,
                        scoring=roc_auc_score)

# if this doesn't work, try string 'roc_auc' for scoring paramater

In [43]:
rf_opt.fit(X_train, y_train)


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluated at this point before.


The objective has been evaluat

BayesSearchCV(cv=5, estimator=RandomForestClassifier(),
              search_spaces={'min_samples_leaf': Integer(low=1, high=10, prior='uniform', transform='identity'),
                             'n_estimators': Integer(low=100, high=1000, prior='uniform', transform='identity')})

In [44]:
rf_opt.best_params_

OrderedDict([('min_samples_leaf', 1), ('n_estimators', 1000)])

In [45]:
rf_opt.best_score_

0.8274356271645836

In [46]:
rf_opt.score(X_test, y_test)

0.8426413768879523