In [88]:
# Nathan Azoulay
# Logistic Regression
# CS581
# Note: The assignment answers are all the way at the bottom starting at the bold header labeled assignment.


import numpy as np
from scipy.special import expit

# Using a corpus of movie review data
# 2000 positive and negative reviews, evenly balanced.

from nltk.corpus import movie_reviews as mr

# If you want to read the corpus collectors' introduction to
# this corpus, uncomment the next line.
#print mr.readme()

def add_data_from_files (file_list,data_list):
    for f in file_list:
        with open(f,'r') as fh:
            data_list.append(fh.read())

def get_data (clses, data_dirs, training_proportion = .9):
    # We're going to compile 4 lists: training data and labels, test data and labels
    train_data, train_labels = [], []
    test_data, test_labels = [], []

    for i,cls  in enumerate(clses):
        d_dir = data_dirs[i]
        os.chdir(d_dir)
        cls_files = os.listdir(d_dir)
        num_cls_files = len(cls_files)
        training_index = int(training_proportion * num_cls_files)
        train_labels.extend(cls for f in cls_files[:training_index])
        test_labels.extend(cls for f in cls_files[training_index:])
        add_data_from_files (cls_files[:training_index],train_data)
        add_data_from_files (cls_files[training_index:],test_data)
    return train_data, train_labels, test_data, test_labels

data = dict(pos = mr.fileids('pos'),
            neg = mr.fileids('neg'))

def get_review_text (cls,file_id,start=0,end=None):
    words = list(mr.words(data[cls][file_id]))
    return ' '.join(words[start:end])


In [89]:
#Evaluating classifier output
#Precision: percentage of true positives out all positive guesses the system made
#Accuracy: percentage of correct answers out of total corpus
#Recall: percentage of true positives out of all good reviews

from sklearn.metrics import precision_score, recall_score,accuracy_score
from collections import defaultdict

flip = dict(pos = 'neg', neg = 'pos')
test_dict = defaultdict(list)
def do_evaluation (predicted, actual, pos_label='pos', verbose=True, 
                   test_dict= None, system_id = '',
                   two_way = False, omit_accuracy = False, override = True):
    #predicted, actual = zip(*pairs)
    #nb_pairs = list(zip(nb_predicted_labels, test_labels))
    (precision, recall,accuracy) = (precision_score(actual,predicted,pos_label=pos_label), 
                                    recall_score(actual,predicted,pos_label=pos_label),
                                    accuracy_score(actual,predicted))
    if verbose:
        print_results(precision, recall, accuracy, pos_label, omit_accuracy=omit_accuracy)
    if test_dict is not None:
        if override:
            test_dict[system_id] = []
        test_dict[system_id].append((pos_label, (precision, recall, accuracy)))
    if two_way:
        return ((precision, recall,accuracy),
                 do_evaluation (predicted, actual, pos_label=flip[pos_label], 
                                verbose=verbose, test_dict= test_dict, 
                                system_id = system_id,
                                two_way = False, omit_accuracy = True,
                                override = False))
    else:
        return (precision, recall, accuracy)

def print_results (precision, recall, accuracy, pos_label, omit_accuracy = False):
    banner =  f'P/R Evaluation with pos label = {pos_label}'
    Precision, Recall, Accuracy = 'Precision', 'Recall', 'Accuracy'
    print ()
    if not omit_accuracy:
           print (f'{Accuracy:10s} {accuracy:.1%}')
           print()
    print (banner)
    print ('=' * len(banner))
    print (f'{Precision:10s} {precision:.1%}')
    print (f'{Recall:10s} {recall:.1%}')
    
def print_test_dict (test_dict):
    for k in test_dict.keys():
        line0 = f'{k[0]:<10} {k[1]:<22}'
        indent = len(line0)
        print(line0, end = ' ')
        print(f'{test_dict[k][0][0]:<10}', end='')
        print(f'{test_dict[k][0][1][0]:.3f}')
        print(f'{"":<{indent}} {test_dict[k][1][0]:<10}', end='')
        print(f'{test_dict[k][1][1][0]:.3f}')

In [191]:
# Logistic Regression Classifier
# Load and split data
import sklearn
module_name = sklearn.__name__
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as lr
class_name = lr.__name__
import os.path

system_id = (module_name, class_name)
home = os.getenv('HOME')  
# This is where MY NLTK data is.  Yours should be in a similar place relative
# to what your machine thinks is HOME.
data_dir = os.path.join(home,'nltk_data/corpora/movie_reviews/')

clses = ['pos','neg']

#  The data is in the data_dir, sorted into subdirectories, one for each class.
data_dirs = [os.path.join(data_dir,cls) for cls in clses]

train_data, train_labels, test_data, test_labels = get_data(clses, data_dirs)

# Train and test classifier

#  We use a somewhat more traditional feature weights, called TfidF weights
#  max_df = 0.5; "df" is document frequency.  
#  Omit any word that occurs in more than half of the training data documents
#. Try higher values for max_df, reducing the number of features
max_df = .5
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True,sublinear_tf=True, 
                             binary = False, max_df = max_df
                             )
#vectorizer = TfidfVectorizer()
# Now with data set represented as a list of strings (one from each file),
# extract the TfidF features
train_features = vectorizer.fit_transform(train_data)

#  We extract features from the test data using the same vectorizer
#  trained on training data. The TFIDF feature model has been fit to 
#  (depends only on) the training data.
test_features = vectorizer.transform(test_data)

# Create an Logistic Regression Classifier instance
# using default settings
max_iter = 100. # The default
solver = 'lbfgs' # The default, I think
lr_clf = lr(solver = solver, max_iter = max_iter)

# Train (or "fit") the model to the training data.
lr_clf.fit(train_features, train_labels)

# Test the model on the test data.  Evaluation below.
lr_predicted_labels = lr_clf.predict(test_features)

In [91]:
# Classifying individual examples
vec1 = vectorizer.transform(['Inception is the greatest movie ever'])
print(lr_clf.predict(vec1))

#Under the hood
w, b  = lr_clf.coef_, lr_clf.intercept_  
# Usparsify, flatten, use the feature weights
logit = (vec1.toarray().ravel().dot(w.T) + b)
logit

['pos']


array([0.05421534])

In [92]:
import numpy as np
from scipy.special import expit
loss = expit(logit)
print(loss)

[0.51355052]


In [93]:
# Function classify with test example at bottom
def classify (text,  clf, vectorizer, verbose = False):
    """ 
    Text is a string.
    """
    vec = vectorizer.transform([text])
    w, b  = clf.coef_, clf.intercept_  
    # Usparsify, flatten, use the feature weights/bias, make it a prob
    loss = expit(vec.toarray().ravel().dot(w.T) + b)[0]
    if verbose:
        # vec . w^T (dot product)
        logit_nb = vec.toarray().ravel().dot(w.T).ravel()[0]
        #print('logit (no bias): ', logit_nb)
        print(f'{"logit (no bias):":<20} {logit_nb: .4f}')
        print(f"{'bias value: ':<20} {b[0]:.4f}")              
        print(f"{'logit (w/bias): ':<20} {logit_nb + b[0]: .4f}")
        print(f"{'prob: ':<20} {loss: .4f}")
        print()
    if loss > .5:
       return 'pos'
    else:
       return 'neg'
              
text = "I don't know how anyone could sit through Inception"
print(classify(text, lr_clf, vectorizer) +'\n')
              
text = "I don't know how anyone could sit through Inception"
print(classify(text, lr_clf, vectorizer, verbose = True))

neg

logit (no bias):     -0.2793
bias value:          -0.1654
logit (w/bias):      -0.4446
prob:                 0.3906

neg


In [94]:
# Evaluate the classifier
# Guessing 51% positive on a 50/50 dataset


do_evaluation (lr_predicted_labels, test_labels, pos_label='pos', verbose=True, 
               system_id = system_id, 
               test_dict = test_dict, two_way = True)

test_size = len(test_labels)
pos_cnt = lambda lbls: len([cl for cl in lbls if cl == "pos"])

print()
print (f'{pos_cnt(lr_predicted_labels)/test_size:.1%} classifier predictions positive')

#  Was the test data biased?  Were most of the test reviews positive, for example?
print (f'{pos_cnt(test_labels)/test_size:.1%} reviews positive')


Accuracy   88.0%

P/R Evaluation with pos label = pos
Precision  87.3%
Recall     89.0%

P/R Evaluation with pos label = neg
Precision  88.8%
Recall     87.0%

51.0% classifier predictions positive
50.0% reviews positive


In [95]:
# Evaluation results stored in dict
test_dict

defaultdict(list,
            {('sklearn',
              'LogisticRegression'): [('pos',
               (0.8725490196078431, 0.89, 0.88)), ('neg',
               (0.8877551020408163, 0.87, 0.88))]})

In [96]:
# Features for the first document from our train data
x0sparse = train_features[0]
x0 = x0sparse.toarray()             #document vector = x, y is class assigned to it

# model stores w and b
w, b = lr_clf.coef_, lr_clf.intercept_

#Both are 3D vectors
x0.shape, w.shape

#37834 is size of vocabulary

((1, 37834), (1, 37834))

In [97]:
# Sort indices of weight vector from lowest valued index to highest
max_indices = w.argsort()
max_indices

array([[ 2805, 37406,  4210, 25235, 32828, 32459, 36679,  2705, 35378,
        29502, ..., 24601, 33636, 26681, 24608, 34727, 15687, 11661,
        24596, 14530, 19518]])

In [192]:
#dir(vectorizer)
feats = vectorizer.get_feature_names()
print(len(feats)) # Vocab size
# Most highly weighted word!
feats[19518]

37834


'life'

In [99]:
# Lookup highest/lowest weight features

n = 10
# top_n_feats contains the feature indices of the n highest-weighted features
# ravel flattens a 2D array into a 1D array (vector)
top_n_feats = max_indices.ravel()[-n:]
bottom_n_feats = max_indices.ravel()[:n]
# name_array[i]  returns word with feature index i
name_array = np.array(vectorizer.get_feature_names())

# threshold: number of items that triggers summarization of an array.  [default 1000 (?)]
# edgeitems: num items to print from each end when summarizing [default 3]
# print(np.set_printoptions.__doc__)  
np.set_printoptions(threshold=20)
print('Max: ', name_array[top_n_feats])
print('Min: ', name_array[bottom_n_feats])

Max:  ['perfectly' 'terrific' 'quite' 'performance' 'true' 'hilarious'
 'excellent' 'perfect' 'great' 'life']
Min:  ['bad' 'worst' 'boring' 'plot' 'supposed' 'stupid' 'waste' 'awful'
 'unfortunately' 'script']


In [100]:
# Wrap above into a function

def find_max_min_feats (vec, vectorizer, n=10, positive_vals_only = False,
                        verbose = False, prune = True, w = None,
                        get_vals = False):
    """
    Print features containing top n and bottom n values in a feature vector,
    where feature names are defined by vectorizer.
    
    Use with weight vector:
       >>> w = lr_clf.coef_
       >>> find_max_min_feats (w, vectorizer, n=10)
       ['perfectly' 'terrific' 'quite' 'performance' 'true' 'hilarious'
        'excellent' 'perfect' 'great' 'life']
       ['bad' 'worst' 'boring' 'plot' 'supposed' 'stupid' 'waste' 'awful'
        'unfortunately' 'script']
    
    positive_vals_only should be set to true when the vector only has 
    positive values.  In that case, we assume we are still only interested in the 
    lowest nonzero values.
    """
    vec = vec.ravel()
    max_indices = vec.argsort()
    top_n_feats = max_indices[-n:]
    if verbose:
        print('Max feats:')
        print_feat_info (top_n_feats, vec, w, verbose = verbose)
    if prune:
        # restrict top_n to truly positive feats, even if less than n feats
        top_n_feats = [f for f in top_n_feats if vec[f] > 0]
    if positive_vals_only:
       vec_cp = vec.copy()
       # Assign the max value to all zero-valued positions
       # removing 0-values from the competition for min value
       vec_cp[vec==0] = vec[top_n_feats[-1]]
       # Now find min valued feats
       max_indices2 = vec_cp.argsort().ravel()
       bottom_n_feats = max_indices2[:n]
    else:
        bottom_n_feats = max_indices[:n]
        if prune:
            bottom_n_feats = [f for f in bottom_n_feats if vec[f] < 0]
    if verbose:
       print()
       print('Min feats:')
       print_feat_info (bottom_n_feats, vec, w, verbose = verbose)
       print()
    name_array = np.array(vectorizer.get_feature_names())
    # Reset numpy print options to print n feats. first get old settings
    opts = np.get_printoptions()
    # Reset.
    np.set_printoptions(threshold=2 * n)
    print('Max: ', name_array[top_n_feats])
    print('Min: ', name_array[bottom_n_feats])
    # Put old settings back
    np.set_printoptions(**opts)
    if get_vals:
       return (top_n_feats, name_array[top_n_feats],
               bottom_n_feats, name_array[bottum_n_feats])

def print_feat_info (feat_index_vector, vec, w = None, verbose = False):
    if (verbose >= 2): 
        print('Feat indices: ', feat_index_vector) 
    print('Feat values: ', [vec[f] for f in feat_index_vector])
    if w is not None:
       print('Feat weights: ', [w[f] for f in feat_index_vector])

   
def find_max_min_feats_classifier (clf, vectorizer, n = 10, verbose = False,
                                      weight_attr = 'coef_'):
    w = getattr(clf, weight_attr).ravel()
    find_max_min_feats (w, vectorizer, n=n, verbose = verbose,positive_vals_only=True)


In [101]:
find_max_min_feats_classifier (lr_clf, vectorizer)

Max:  ['perfectly' 'terrific' 'quite' 'performance' 'true' 'hilarious'
 'excellent' 'perfect' 'great' 'life']
Min:  ['bad' 'worst' 'boring' 'plot' 'supposed' 'stupid' 'waste' 'awful'
 'unfortunately' 'script']


In [102]:
# numpy version of sigmoid is expit, applied to document vector x0
x0sparse = train_features[0]
x0 = x0sparse.toarray().ravel()
loss = expit(x0.dot(w.T) + b)

loss # probability that the sample belongs to the positive class

array([0.66501437])

In [103]:
# Returns probability of the sample for each class in the model, classes are ordered as they are in lr_clf.classes_
print(lr_clf.classes_)
prob_array = lr_clf.predict_proba(x0sparse).ravel()
print(prob_array)
# Too few decimal places: High prob looks liek 1
print(f"{prob_array[1]:.4f}")
# Enough decimal places.
print(f"{prob_array[1]:.5f}")
print(lr_clf.predict_proba(x0sparse).sum())

['neg' 'pos']
[0.33498563 0.66501437]
0.6650
0.66501
1.0


In [104]:
# predict method chooses a class
lr_clf.predict(x0sparse)

array(['pos'], dtype='<U3')

In [105]:
w.dot(x0.T) + b

array([0.68572103])

In [106]:
# Finding active features in the document

def find_max_min_feats_docvec (vec, clf, vectorizer, n = 10, prune = True,
                               verbose = False, weight_attr = 'coef_',
                               get_vals = False):
    """
    Vec is a document vector.  Classifier a trained classifier.  Vectorizer
    is the vectorizer that produced vec.
    
    Return highest- and lowest- weighted features per this trained classifier,
    """
    w = getattr(clf, weight_attr).ravel()
    dw = (vec.toarray().ravel() * w)
    find_max_min_feats (dw, vectorizer, n=n, prune = prune, verbose = verbose, w = w)
    if get_vals:
       return dw
    
def find_max_min_feats_text (text, clf, vectorizer, n = 10, prune=True,
                             verbose = False, weight_attr = 'coef_',
                             get_vals = False):
    """
    Text is a string.  Classifier a trained classifier.  Vectorizer
    a trained vectorizer for turning strings into doc vectors.
    
    Return most and least siugnificant features per classifier.
    """
    # Need to pass vectorizer a list of strings
    vec = vectorizer.transform([text])
    # Make sparse array an array,  flatten, get weighted feats [modified equation (a)]
    # dw = (vec.toarray().ravel() * w) + b
    dw = find_max_min_feats_docvec (vec, clf, vectorizer, n=10, prune = prune,
                                    verbose = verbose, weight_attr = weight_attr)
    if get_vals:
        return vec, dw

In [107]:
# Finding the text of the first positive review
get_review_text ('pos',0)[:1000]

'films adapted from comic books have had plenty of success , whether they \' re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there \' s never really been a comic book like from hell before . for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid \' 80s with a 12 - part series called the watchmen . to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . in other words , don \' t dismiss this film because of its source . if you can get past the whole comic book thing , you might find another stumbling block in from hell \' s directors , albert and allen hughes . getting the hughes brothers to direct this seems

In [108]:
vec11 = vectorizer.transform([get_review_text ('pos',0)]).toarray().ravel()
vec11.shape

(37834,)

In [109]:
find_max_min_feats (vec11, vectorizer, n=10, positive_vals_only = True) #weighting of words

Max:  ['hughes' 'hell' 'graham' 'depp' 'moore' 'campbell' 'ghetto' 'whitechapel'
 'abberline' 'ripper']
Min:  ['films' 'bad' 'scenes' 'new' 'great' 'better' 'love' 'big' 'long' 'isn']


In [110]:
#TFIDF value
vec11.max()

0.16435001314847014

In [111]:
# Transform document vector by applying weights to each feature -> weighted document vector
w, b  = lr_clf.coef_, lr_clf.intercept_  
dw = (vec11 * w)
dw.shape

(1, 37834)

In [112]:
# Constructed weighted document vectors out of logits without bias
ind = vectorizer.vocabulary_['captures']

print('bias: ', b, 'tfidf:', vec11[ind], 'weight:', w[0][ind])
print('logit with bias: ',(vec11[ind] * w[0][ind]) + b, 'Negative value!')
print('Easier to understand logit w/o bias')
print((vec11[ind] * w[0][ind]))

bias:  [-0.16537522] tfidf: 0.05359015649564609 weight: 0.4413422719369864
logit with bias:  [-0.14172362] Negative value!
Easier to understand logit w/o bias
0.023651601421247094


In [113]:
# Find max/min features of dw
# Finding the top features in vec
w = lr_clf.coef_
# sparse => array, flatten, compute weighted doc vector
dw = (vec11 * w[0])
# Same function that we use for finding max min feats in the classifier weights.
find_max_min_feats (dw, vectorizer, n=10)

Max:  ['job' 'captures' 'oscar' 'comic' 'great' 'campbell' 'era' 'graham'
 'world' 'strong']
Min:  ['bad' 'wasn' 'don' 'attempt' 'ludicrous' 'looks' 'batman' 'tries' 'blame'
 'point']


In [114]:
# Wrap into function for future use
find_max_min_feats_text (get_review_text ('pos',0), lr_clf, vectorizer, n = 10)

Max:  ['job' 'captures' 'oscar' 'comic' 'great' 'campbell' 'era' 'graham'
 'world' 'strong']
Min:  ['bad' 'wasn' 'don' 'attempt' 'ludicrous' 'looks' 'batman' 'tries' 'blame'
 'point']


In [115]:
# Document vector, a sparse matrix
d0sparse = train_features[0]
# To make th evector suitablke for operations with w, b, we do this:
# Unsparsify and flatten.  This will be done inside find_max_min_feats_docvec 
d0 = d0sparse.toarray().ravel()
# Add in feat weights to get weighted doc vec
dw = (d0 * w[0])
find_max_min_feats (dw, vectorizer, n=10)

Max:  ['light' 'perfect' 'unlike' 'delightful' 'damon' 'great' 'job' 'wonderful'
 'hilarious' 'oscar']
Min:  ['script' 'boring' 'plot' 'supposed' 'dull' 'perry' 'dialogue'
 'whatsoever' 'flat' 'material']


In [116]:
# Wrapped into a function
weighted_doc_vec = find_max_min_feats_docvec (d0sparse, lr_clf, vectorizer, n=10)

Max:  ['light' 'perfect' 'unlike' 'delightful' 'damon' 'great' 'job' 'wonderful'
 'hilarious' 'oscar']
Min:  ['script' 'boring' 'plot' 'supposed' 'dull' 'perry' 'dialogue'
 'whatsoever' 'flat' 'material']


In [117]:
# Classifying individual examples with analysis
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
 ]
vectorizer0 = TfidfVectorizer()
X = vectorizer0.fit_transform(corpus)
vectorizer.transform(['document first and']).toarray().ravel()

vec11 = vectorizer.transform([get_review_text ('pos',0)])
vec11.shape

(1, 37834)

In [118]:
# Use function to classify made up examples
text = 'Inception is the greatest movie ever'
classify(text, lr_clf, vectorizer)

'pos'

In [119]:
# This means two words carry positive weights as features and none carry negative
# only inception and greatest have been learning in the training data
find_max_min_feats_text (text, lr_clf, vectorizer, n = 10)

Max:  ['inception' 'greatest']
Min:  []


In [120]:
# Sanity check
vec = vectorizer.transform(['Inception is the worst movie ever'])
lr_clf.predict(vec)

array(['neg'], dtype='<U3')

In [121]:
# analysis 
text = 'Inception is the worst movie ever'
print(classify(text, lr_clf, vectorizer, verbose = False))
find_max_min_feats_text (text, lr_clf, vectorizer, n = 10)

neg
Max:  ['inception']
Min:  ['worst']


In [122]:
text = "I don't know how anyone could sit through Inception"
print(classify(text, lr_clf, vectorizer))
find_max_min_feats_text (text, lr_clf, vectorizer, n = 10)

neg
Max:  ['inception']
Min:  ['don' 'sit' 'know']


In [123]:
ind = vectorizer.vocabulary_['sit']
w[0][ind]

-0.2621686592991448

In [124]:
# Word repetition
# Look at both examples and how it makes no difference

text = "I loved this movie"
print(classify(text, lr_clf, vectorizer, verbose = True))
find_max_min_feats_text (text, lr_clf, vectorizer, n = 10)

logit (no bias):      0.5136
bias value:          -0.1654
logit (w/bias):       0.3482
prob:                 0.5862

pos
Max:  ['loved']
Min:  []


In [125]:
text = "I loved loved loved this movie"
print(classify(text, lr_clf, vectorizer, verbose = True))
find_max_min_feats_text (text, lr_clf, vectorizer, n = 10)

logit (no bias):      0.5136
bias value:          -0.1654
logit (w/bias):       0.3482
prob:                 0.5862

pos
Max:  ['loved']
Min:  []


In [126]:
word = 'loved'
vec12 = vectorizer.transform([f"I {word} this movie"])
ind = vectorizer.vocabulary_[word]
# Get the feature value for this example
print(vec12.toarray().ravel()[ind])
vec122 = vectorizer.transform([f"I {word} {word} {word} this movie"])
# Get the feature value for the other example
print(vec122.toarray().ravel()[ind])

1.0
1.0


In [127]:
# example with no known words
text = 'xnwnei'
print(classify(text, lr_clf, vectorizer, verbose = True))
find_max_min_feats_text (text, lr_clf, vectorizer, n = 10)

logit (no bias):      0.0000
bias value:          -0.1654
logit (w/bias):      -0.1654
prob:                 0.4588

neg
Max:  []
Min:  []


In [128]:
# MultinomialNB
import sklearn
module_name = sklearn.__name__
from sklearn.naive_bayes import MultinomialNB as nb
class_name = nb.__name__
from sklearn.feature_extraction.text import CountVectorizer

if class_name == 'BernoulliNB':
    binary = True
elif class_name == 'MultinomialNB':
    binary = False
else:
    raise Exception('binary var unset. Do you want count vectorization?  Is it binary?')

count_vectorizer = CountVectorizer(stop_words='english', max_df = .5, binary = binary)
# Now with data set represented as a list of strings (one from each file),
# extract the TFIDF features
train_features = count_vectorizer.fit_transform(train_data)

#  We extract features from the test data using the same vectorizer
#  trained on training data. The TFIDF feature model has been fit to 
#  (depends only on) the training data.
test_features = count_vectorizer.transform(test_data)

# Turning smoothing off! [alpha REALLY close to 0]
#No smoothing alpha
no_smoothing_alpha=1.0e-10
nb_clf = nb(alpha=no_smoothing_alpha)
#nb_clf = MultinomialNB(alpha=no_smoothing_alpha)
# Train (or "fit") the model to the training data.
nb_clf.fit(train_features, train_labels)

# Test the model on the test data.
nb_predicted_labels = nb_clf.predict(test_features)

# Evaluate the results
nb_pos_predictions = [p for p in nb_predicted_labels if p=='pos']



do_evaluation (nb_predicted_labels, test_labels, pos_label='pos', verbose=True, 
               system_id = (module_name, class_name), 
               test_dict = test_dict, two_way = True)

print()

print (f'{pos_cnt(nb_predicted_labels)/test_size:.1%} classifier predictions positive')

#  Was the test data biased?  Were most of the test reviews positive, for example?
print (f'{pos_cnt(test_labels)/test_size:.1%} reviews positive')



Accuracy   70.5%

P/R Evaluation with pos label = pos
Precision  68.5%
Recall     76.0%

P/R Evaluation with pos label = neg
Precision  73.0%
Recall     65.0%

55.5% classifier predictions positive
50.0% reviews positive


In [129]:
# compute new weight vector
weight_attr = 'weights'

def add_nb_w (clf, pos_cls = 'pos', weight_attr = 'weights'):
    clses = clf.classes_
    pos_index = list(clses).index(pos_cls)
    nb_w_cl_pos = clf.feature_log_prob_[pos_index]
    nb_w_cl_neg = clf.feature_log_prob_[(1 - pos_index)]
    # pos_prob/neg_prob ratio > 1 means neg_log_prob/pos_log_prog ratio > 1,
    weight_vector = np.log(nb_w_cl_neg/nb_w_cl_pos)
    # Item assignment not implemented for sklearn learner instances
    #nb_clf['weights'] = compute_nb_w (nb_clf)
    setattr(clf, weight_attr, weight_vector)

add_nb_w (nb_clf)

In [130]:
text = "I don't know how anyone could sit through Inception"
print(classify(text, nb_clf, count_vectorizer))
find_max_min_feats_text (text, nb_clf, count_vectorizer, n = 10, verbose = True,
                          weight_attr = weight_attr)

neg
Max feats:
Feat values:  [-0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 1.0372699332507003]
Feat weights:  [-1.050457698954473, 0.03248696950987699, 0.1011008517948238, 0.0816721411795122, -0.010692368113133378, -1.050457698954473, 0.23025058518467753, 1.0372699332507003, 0.03659535262165882, 1.0372699332507003]

Min feats:
Feat values:  [-0.05553717257391996, -0.05455432724682377, -0.034708923222847625]
Feat weights:  [-0.05553717257391996, -0.05455432724682377, -0.034708923222847625]

Max:  ['inception']
Min:  ['sit' 'don' 'know']


In [131]:
find_max_min_feats_classifier(nb_clf, count_vectorizer, n = 50, weight_attr= weight_attr)

Max:  ['griffiths' 'stephens' 'lithgow' 'skinhead' 'muriel' 'valjean' 'soviet'
 'kurtz' 'chocolat' 'tibbs' 'slade' 'boone' 'reza' 'humbert' 'fantasia'
 'coens' 'scarlett' 'booker' 'motta' 'bianca' 'pollock' 'capone'
 'commodus' 'shyamalan' 'dolores' 'bubby' 'kaufman' 'giles' 'burbank'
 'lumumba' 'rounders' 'niccol' 'matilda' 'sethe' 'farquaad' 'apostle'
 'donkey' 'fei' 'guido' 'maximus' 'taran' 'mallory' 'lambeau' 'jude'
 'sweetback' 'leila' 'argento' 'gattaca' 'ordell' 'shrek']
Min:  ['jolie' 'brenner' 'sphere' 'palmetto' 'macdonald' 'pokemon' 'supergirl'
 'jericho' 'jill' 'cisco' 'bilko' 'mona' 'hush' 'silverman' 'bruckheimer'
 'memphis' 'kersey' 'mandingo' 'compensate' 'geronimo' 'psychlos' 'liu'
 'grinch' 'highlander' 'hawk' 'elwood' 'degenerates' 'caulder' 'terl'
 'vikings' 'rimbaud' 'angelina' 'wrestlers' 'dwayne' 'alessa' 'autistic'
 'kip' 'horrid' 'diedre' 'diaries' 'zach' 'fern' 'loveless' 'brooke'
 'switchback' 'peet' 'wayans' 'muresan' 'tatopoulos' 'forsythe']


In [132]:
vec25 = count_vectorizer.transform(['Inception is the worst movie ever'])
nb_clf.predict(vec25)

array(['pos'], dtype='<U3')

In [133]:
vec26 = count_vectorizer.transform(['Inception is the best movie ever'])
nb_clf.predict(vec26)

array(['pos'], dtype='<U3')

In [134]:
# Unweighted vector
avec25 = vec25.toarray().ravel()
opts = np.get_printoptions()
np.set_printoptions(threshold=20, edgeitems = 10)
print(avec25[:25])
#np.set_printoptions(**opts)

[0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]


In [135]:
# For MultinomialNB the doc vector has count:
avec25.max(),avec25.min(),avec25.sum()

(1, 0, 2)

In [136]:
# Test repetition
vec28 = count_vectorizer.transform(['I loved this movie'])
nb_clf.predict_proba(vec28)

array([[0.34860705, 0.65139295]])

In [137]:
vec28 = count_vectorizer.transform(['I loved loved loved this movie'])
nb_clf.predict_proba(vec28)

array([[0.13290625, 0.86709375]])

In [138]:
# Finding max/min feats and the weighted doc vector
wd = find_max_min_feats_docvec (vec25, nb_clf, count_vectorizer, 
                                n = 10, weight_attr = weight_attr)

Max:  ['inception']
Min:  ['worst']


In [139]:
# Intercept = class prior, as expected, reflecting the fact that our data is balanced.
ind = count_vectorizer.vocabulary_['worst']
classes = nb_clf.classes_
nb_w = nb_clf.coef_.ravel()
print(classes)
nb_w_cl0 = nb_clf.feature_log_prob_[0]
nb_w_cl1 = nb_clf.feature_log_prob_[1]
# log p(greatest | pos), feature weight for greatest, log p(greatest|neg),
# ratio: log p(greatest | pos)/ log p(greatest | neg)
nb_w_cl1[ind], nb_w[ind], nb_w_cl0[ind], nb_w_cl0[ind]/nb_w_cl1[ind]

['neg' 'pos']


(-8.745478284939654,
 -8.745478284939654,
 -6.979802305330207,
 0.7981041262603019)

In [140]:
# negative feature
x = nb_clf.class_log_prior_[1:][0]
print(x,nb_clf.intercept_[0])
print(np.exp(nb_clf.intercept_[0]))

-0.6931471805599454 -0.6931471805599454
0.49999999999999994


In [141]:
import numpy as np
from sklearn.utils.fixes import logsumexp
from sklearn.utils.extmath import safe_sparse_dot

x = count_vectorizer.transform(['Inception is the worst movie ever'])

# th jll computation
self = nb_clf
# This could be precomputed. Saving space?  Not much.
neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
#Compute  neg_prob · (1 - X).T  as  ∑neg_prob - X · neg_prob
jll = safe_sparse_dot(x, (self.feature_log_prob_ - neg_prob).T)
jll += self.class_log_prior_ + neg_prob.sum(axis=1)
jll

array([[-44.15016134, -23.01278725]])

In [142]:
# Normalizing
log_prob_x = logsumexp(jll, axis=1)
jll - np.atleast_2d(log_prob_x).T

array([[-2.11373741e+01, -6.60929089e-10]])

In [143]:
neg_prob.shape

(2, 37834)

In [144]:
np.exp(jll - np.atleast_2d(log_prob_x).T)

array([[6.60929395e-10, 9.99999999e-01]])

In [146]:
# Example of positive feature
neg_prob, pos_prob = .16,.5
np.log(neg_prob),np.log(pos_prob), np.log(neg_prob)/np.log(pos_prob)

(-1.8325814637483102, -0.6931471805599453, 2.643856189774725)

In [147]:
# Check our model
ind = vectorizer.vocabulary_['worst']
print(classes)
nb_w_cl0 = nb_clf.feature_log_prob_[0]
nb_w_cl1 = nb_clf.feature_log_prob_[1]
# log p(greatest | pos), log p(greatest|neg),
# ratio: log p(greatest | pos)/ log p(greatest | neg)
nb_w_cl1[ind], nb_w_cl0[ind], nb_w_cl0[ind]/nb_w_cl1[ind]

['neg' 'pos']


(-8.745478284939654, -6.979802305330207, 0.7981041262603019)

In [148]:
# Same thing for inception
ind = vectorizer.vocabulary_['inception']
nb_w = nb_clf.coef_
classes = nb_clf.classes_
(cl1, cl2) = classes
print(classes)
# cl2 is positive.  Note that feature_log_prob
# for that class provides the weight vector.
nb_w_cl0 = nb_clf.feature_log_prob_[0]
nb_w_cl1 = nb_clf.feature_log_prob_[1]
nb_w_cl1[ind], nb_w_cl0[ind], nb_w_cl0[ind]/nb_w_cl1[ind]

['neg' 'pos']


(-12.574119681330924, -35.47792390894256, 2.8215035969172)

In [149]:
vectorizer.stop_words_

{'character',
 'film',
 'good',
 'just',
 'like',
 'make',
 'movie',
 'story',
 'time',
 'way'}

#  Assignment

1a) The evaluation is after the next block of code where I printed test_dict.
1b) The Logistic Regression shows the highest percentage of accuracy at 88% following with the Bernoulli that has an accuracy of 75.5%, and lastly the MultinomialNB with 70.5% accuracy. Between the BernoulliNB and MultinomialNB, the system with the highest percentage of the true positive reviews would be the BernoulliNB with a precision of 75.8% compared to the MultinomialNB with 68.5%. Below is the work for Bernoulli followed by the test_dict to compare the evaluations

In [157]:
# BernoulliNB
import sklearn
module_name = sklearn.__name__
from sklearn.naive_bayes import BernoulliNB as nb
class_name = nb.__name__
from sklearn.feature_extraction.text import CountVectorizer

if class_name == 'BernoulliNB':
    binary = True
elif class_name == 'MultinomialNB':
    binary = False
else:
    raise Exception('binary var unset. Do you want count vectorization?  Is it binary?')

count_vectorizer = CountVectorizer(stop_words='english', max_df = .5, binary = binary)
# Now with data set represented as a list of strings (one from each file),
# extract the TFIDF features
train_features = count_vectorizer.fit_transform(train_data)

#  We extract features from the test data using the same vectorizer
#  trained on training data. The TFIDF feature model has been fit to 
#  (depends only on) the training data.
test_features = count_vectorizer.transform(test_data)

# Turning smoothing off! [alpha REALLY close to 0]
#No smoothing alpha
no_smoothing_alpha=1.0e-10
nb_clf = nb(alpha=no_smoothing_alpha)
#nb_clf = MultinomialNB(alpha=no_smoothing_alpha)
# Train (or "fit") the model to the training data.
nb_clf.fit(train_features, train_labels)

# Test the model on the test data.
nb_predicted_labels = nb_clf.predict(test_features)

# Evaluate the results
nb_pos_predictions = [p for p in nb_predicted_labels if p=='pos']



do_evaluation (nb_predicted_labels, test_labels, pos_label='pos', verbose=True, 
               system_id = (module_name, class_name), 
               test_dict = test_dict, two_way = True)

print()

print (f'{pos_cnt(nb_predicted_labels)/test_size:.1%} classifier predictions positive')

#  Was the test data biased?  Were most of the test reviews positive, for example?
print (f'{pos_cnt(test_labels)/test_size:.1%} reviews positive')



Accuracy   75.5%

P/R Evaluation with pos label = pos
Precision  75.8%
Recall     75.0%

P/R Evaluation with pos label = neg
Precision  75.2%
Recall     76.0%

49.5% classifier predictions positive
50.0% reviews positive


In [158]:
# ANSWER FOR 1A
test_dict

defaultdict(list,
            {('sklearn',
              'LogisticRegression'): [('pos',
               (0.8725490196078431, 0.89, 0.88)), ('neg',
               (0.8877551020408163, 0.87, 0.88))],
             ('sklearn',
              'MultinomialNB'): [('pos',
               (0.6846846846846847, 0.76, 0.705)), ('neg', (0.7303370786516854,
                0.65,
                0.705))],
             ('sklearn',
              'BernoulliNB'): [('pos',
               (0.7575757575757576, 0.75, 0.755)), ('neg', (0.7524752475247525,
                0.76,
                0.755))]})

Q2) Below I have ran the sample text with both the Logistic Regression and Bernoulli classifier.
The logistic regression outputs positive, while Bernoulli outputs negative review. In the logistic regression,one of the positive words is "disgraced", which is considered negative in the BernoulliNB. Some words such as redeemable and love should show up as positive words, so I am confused if I did my calculations correctly.

In [166]:
# Logistic Regression
text = 'Harrison Ford turns in a glowing performance as the disgraced but eminently redeemable John Thornton. As Buck comes to love Thornton, so do we'
print(classify(text,lr_clf,vectorizer, verbose = True))

logit (no bias):      0.4067
bias value:          -0.1654
logit (w/bias):       0.2413
prob:                 0.5600

pos


In [164]:
# Bernoulli
text = 'Harrison Ford turns in a glowing performance as the disgraced but eminently redeemable John Thornton. As Buck comes to love Thornton, so do we'
print(classify(text, nb_clf, count_vectorizer,verbose = True))

logit (no bias):     -39.6649
bias value:          -0.6931
logit (w/bias):      -40.3581
prob:                 0.0000

neg


In [163]:
# Logictic regression min and max
find_max_min_feats_text(text,lr_clf,vectorizer,n=10)

Max:  ['eminently' 'disgraced' 'turns' 'thornton' 'john' 'glowing' 'love'
 'harrison' 'ford' 'performance']
Min:  ['buck' 'comes']


In [167]:
# Bernoulli min and max
find_max_min_feats_text(text, nb_clf, count_vectorizer,n=10)

Max:  []
Min:  ['disgraced' 'eminently' 'thornton' 'buck' 'glowing' 'harrison' 'ford'
 'turns' 'john' 'comes']


Q3) The repetition of a known positive word in a review does not affec the values for all three models. My speculation would be that this feature is built into each model to not affect the probabilities or numbers due to the repetition of a word. For the MultinomialNB I used the data used above since variable names are reused for BernoulliNB.

In [169]:
# Bernoulli without repetition
textb = 'My friend enjoyed the movie alot'
print(classify(textb, nb_clf, count_vectorizer,verbose = True)+'\n')

# Bernoulli repetition
textb = 'My friend enjoyed enjoyed the movie alot'
print(classify(textb, nb_clf, count_vectorizer,verbose = True))

logit (no bias):     -9.5245
bias value:          -0.6931
logit (w/bias):      -10.2177
prob:                 0.0000

neg

logit (no bias):     -9.5245
bias value:          -0.6931
logit (w/bias):      -10.2177
prob:                 0.0000

neg


In [170]:
# Logistic Regression without repetition
textb = 'My friend enjoyed the movie alot'
print(classify(text,lr_clf,vectorizer, verbose = True) +'\n')

# With repetition
textb = 'My friend enjoyed enjoyed the movie alot'
print(classify(text,lr_clf,vectorizer, verbose = True))

logit (no bias):      0.4067
bias value:          -0.1654
logit (w/bias):       0.2413
prob:                 0.5600

pos

logit (no bias):      0.4067
bias value:          -0.1654
logit (w/bias):       0.2413
prob:                 0.5600

pos


Q4) A document vector is a vector corresponding to x in the equation sigmoid(dot(wx) + b). However, a weighted document vector is present in this equation because of the w which is a weight vector for our features. The value can be positive or negative, the higher means important features. On the other hand, values closer to 0 define features that don't decide or do much. In the formula (a) x should be the document vector because it contains the example to be classified in which y defines the class that will be assigned to it.

Q5) Stop words are words like "and","the","him", which are uninformative in representing content of a text, and can be removed to avoid having them conflict with our probabilities. As of now 'english' is the only supported string value. The words we have in stop words are "movie", "character", "make","story","time". These are all words that do not have any positive or negative context because they are for the most part nouns that do not reflect any positive or negative bias.

Q6) Suppose we are using a logistic regression model. w = (-.1, 1.2, -.6) b = -1.3
Determine if vectors are classified positive or negative

a) (0,0,-.3)
    Based off a sigmoid functions attributes, I can tell it will be negative because we are subtracting a small number by a big negative number, resulting in a negative number, so we can assume it will be below .5
    
b) (2,-.4,13)
    This vector as well will result in a large negative number, even larger than the one above, meaning it will fall below .5 most likely, causing it to be negative class.
    
c) (5,2,3)
    This vector I believe would result in a positive classification because of the large positive number. (My logic was incorrect here when looking at the probability calculated below after making an educated guess.

Part B
Compute P(y=1) for each of the 3 above vectors
Used python below to use sigmoid function because I couldn't get dot product to work so calculated that part by hand.


In [184]:
# First Vector
import numpy as np
from scipy.special import expit

# Used a calculator to get number plugged into expit which is the dot product of (w,x) + b
# I kept getting errors when trying to use dot function

a = expit(-1.12)
print("First vector p: ",a)

b = expit(-9.78)
print("Second vector p: ",b)

c = expit(-1.2)
print("Third vector p: ",c)

First vector p:  0.2460112835510519
Second vector p:  5.656859586026619e-05
Third vector p:  0.23147521650098238


Q7) When raising the value of max_df, the vocabulary size increased by 10 when increasing the df to .9
When lowering the value of max_df, the vocabulary size decreased significantly by over 400 when lowering the df to .1. My speculation would be that because the df descreased, a smaller vocabulary size will be present to be considered when comparing to our df value. Same way with a higher df, more words will be considered in comparison to our df value.