In [7]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [5]:
np.random.seed(42)

## Data Curation

In [8]:
ladies = pd.read_csv('lyrics_and_artist.csv', encoding='latin-1')
ladies.head()

Unnamed: 0,lyrics,artist
0,"I've been drinkin', I've been drinkin'",beyonce
1,I get filthy when that liquor get into me,beyonce
2,"I've been thinkin', I've been thinkin'",beyonce
3,Why can't I keep my fingers off it?,beyonce
4,"Baby, I want you, now-now",beyonce


In [11]:
# Our dictionary will contain only of the top 7000 words appearing most frequently
top_words = 2500
# Now we split our data-set into training and test data
(X_train, y_train), (X_test, y_test) = ladies
# Looking at the nature of training data
print(X_train[0])
print(y_train[0])
print('Shape of training data: ')
print(X_train.shape)
print(y_train.shape)
print('Shape of test data: ')
print(X_test.shape)
print(y_test.shape)

ValueError: too many values to unpack (expected 2)

In [None]:
Shape of training data: 
(25000,)
(25000,)
Shape of test data: 
(25000,)
(25000,)

In [None]:
# Padding the data samples to a maximum review length in words
max_words = 450
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
# Building the CNN Model
model = Sequential()      # initilaizing the Sequential nature for CNN model
# Adding the embedding layer which will take in maximum of 450 words as input and provide a 32 dimensional output of those words which belong in the top_words dictionary
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Fitting the data onto model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=2)
# Getting score metrics from our model
scores = model.evaluate(X_test, y_test, verbose=0)
# Displays the accuracy of correct sentiment prediction over test data
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
ladies.shape

In [None]:
ladies['artist'] = ladies['artist'].map({'beyonce': 0, 'rihanna':1})
ladies.head()
# conversion of beyonce/rihanna into binary labels

## Baseline Accuracy

In [None]:
ladies['artist'].value_counts(normalize=True)
# baseline

## Modeling Setup

In [None]:
X = ladies['lyrics']
y = ladies['artist']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    stratify=y,
                                                    random_state=42)

## CV Pre-Processing w/ stopwords REMOVED

In [None]:
cvec = CountVectorizer(stop_words='english')
cvec.fit(X_train)

# fit cv on corpus

In [None]:
X_train = cvec.transform(X_train)
 # transform corpus

In [None]:
X_train.shape

In [None]:
# transform test
X_test = cvec.transform(X_test)

In [None]:
X_test.shape

In [None]:
# convert traning data to dataframe

X_train_ladies = pd.DataFrame(X_train.todense(), 
                          columns=cvec.get_feature_names())

# plot top occuring words
X_train_ladies.sum().sort_values(ascending=False).head(10).plot(kind='barh');

## Modeling Setup

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    stratify=y,
                                                    random_state=42)

## Baseline Accuracy

In [None]:
y_test.value_counts(normalize=True)

## CV-NB Pipeline Setup

In [None]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('logreg', LogisticRegression())
])

## GridSearchCV

In [None]:
pipe_params = {
    'cvec__max_features': [2_000, 3_000, 4_000, 5_000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)]
}

In [None]:
gs = GridSearchCV(pipe, # what object are we optimizing?
                  param_grid=pipe_params, # what parameters values are we searching?
                  cv=5) # 5-fold cross-validation.

In [None]:
gs.fit(X_train, y_train)

In [None]:
print(gs.best_score_)

In [None]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

In [None]:
# Get predictions
preds = gs.predict(X_test)

# Save confusion matrix values
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

In [None]:
plot_confusion_matrix(gs, X_test, y_test, cmap='Blues', values_format='d');

In [None]:
spec = tn / (tn + fp)

print('Specificity:', spec)

## CV Pre-Processing w/ stopwords 

In [None]:
cvec = CountVectorizer()
cvec.fit(X_train)

In [None]:
X_train = cvec.transform(X_train)

In [None]:
X_train.shape

In [None]:
X_test = cvec.transform(X_test)

In [None]:
X_test.shape

In [None]:
X_train_ladies = pd.DataFrame(X_train.todense(), 
                          columns=cvec.get_feature_names())

# plot top occuring words
X_train_ladies.sum().sort_values(ascending=False).head(10).plot(kind='barh');

## Modeling

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    stratify=y,
                                                    random_state=42)

In [None]:
y_test.value_counts(normalize=True)

In [None]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('logreg', LogisticRegression())
])

In [None]:
pipe_params = {
    'cvec__max_features': [2_000, 3_000, 4_000, 5_000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)]
}

In [None]:
gs = GridSearchCV(pipe, # what object are we optimizing?
                  param_grid=pipe_params, # what parameters values are we searching?
                  cv=5) # 5-fold cross-validation.

In [None]:
gs.fit(X_train, y_train)

In [None]:
print(gs.best_score_)

In [None]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

In [None]:
# Get predictions
preds = gs.predict(X_test)

# Save confusion matrix values
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

In [None]:
plot_confusion_matrix(gs, X_test, y_test, cmap='Blues', values_format='d');

In [None]:
spec = tn / (tn + fp)

print('Specificity:', spec)

------------------------------------------------------------------------------------------------------------------

## Pre-Processing w/ TFID + stopwords REMOVED

In [None]:
tvec = TfidfVectorizer(stop_words='english')

In [None]:
X_train_df = pd.DataFrame(tvec.fit_transform(X_train).todense(), 
                          columns=tvec.get_feature_names())

# plot top occuring words
X_train_df.sum().sort_values(ascending=False).head(10).plot(kind='barh');

In [None]:
pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('logreg', LogisticRegression())
])

In [None]:
pipe_tvec_params = {
    'tvec__max_features': [2_000, 3_000, 4_000, 5_000],
    'tvec__stop_words': [None, 'english'],
    'tvec__ngram_range': [(1,1), (1,2)]
}

In [None]:
# Instantiate GridSearchCV.

gs_tvec = GridSearchCV(pipe_tvec, # what object are we optimizing?
                        param_grid = pipe_tvec_params, # what parameters values are we searching?
                        cv=5) # 5-fold cross-validation.

In [None]:
gs_tvec.fit(X_train, y_train)

In [None]:
gs_tvec.score(X_train, y_train), gs_tvec.score(X_test, y_test)

In [None]:
preds_tvec = gs_tvec.predict(X_test)

# Save confusion matrix values
tn, fp, fn, tp = confusion_matrix(y_test, preds_tvec).ravel()

# Calculate the specificity

spec = tn / (tn + fp)

print('Specificity:', spec)

In [None]:
plot_confusion_matrix(gs_tvec, X_test, y_test, cmap='Blues', values_format='d');

## Pre-Processing w/ TFID + NO stopwords removed

In [None]:
tvec = TfidfVectorizer()

In [None]:
X_train_df = pd.DataFrame(tvec.fit_transform(X_train).todense(), 
                          columns=tvec.get_feature_names())

# plot top occuring words
X_train_df.sum().sort_values(ascending=False).head(10).plot(kind='barh');

In [None]:
pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('logreg', LogisticRegression())
])

In [None]:
pipe_tvec_params = {
    'tvec__max_features': [2_000, 3_000, 4_000, 5_000],
    'tvec__stop_words': [None, 'english'],
    'tvec__ngram_range': [(1,1), (1,2)]
}

In [None]:
gs_tvec = GridSearchCV(pipe_tvec, # what object are we optimizing?
                        param_grid = pipe_tvec_params, # what parameters values are we searching?
                        cv=5) # 5-fold cross-validation.

In [None]:
gs_tvec.fit(X_train, y_train)

In [None]:
# Score model on training + test sets
gs_tvec.score(X_train, y_train), gs_tvec.score(X_test, y_test)

In [None]:
# Get predictions
preds_tvec = gs_tvec.predict(X_test)

# Save confusion matrix values
tn, fp, fn, tp = confusion_matrix(y_test, preds_tvec).ravel()

# Calculate the specificity

spec = tn / (tn + fp)

print('Specificity:', spec)

In [None]:
# visualize this

plot_confusion_matrix(gs_tvec, X_test, y_test, cmap='Blues', values_format='d');