### Using Natural Language Processing to Preprocess and Clean Movie reviews 

Use a machine learning classifiers to determine the sentiment of processed movie reviews data
Building NLP Sentiment Analyzer, Loading and Preprocessing Data, Training Your Classifier, Classifying Reviews, Connecting the Pipeline.

Data: Large Movie Review Dataset(https://ai.stanford.edu/~amaas/data/sentiment/)

In [1]:
import os
import random
import spacy
import arviz as az
from spacy.util import minibatch, compounding
import pandas as pd
nlp = spacy.load('en',parse=True,tag=True, entity=True)
spacy.load('en')

import scipy
print(scipy.__version__)
import pymc3 as pm
print(f"Runing on PyMC3 v{pm.__version__}")
import en_core_web_sm
nlp = en_core_web_sm.load()



1.8.0
Runing on PyMC3 v3.11.5


### Model 1: Convolutional neural network

In [87]:
TEST_REVIEW = """
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
doesn't matter. (The worst is sort of tedious - like Office Space with less
humor.)
"""


eval_list = []


def train_model_CNN(
    training_data: list, test_data: list, iterations: int = 20
) -> None:
    # Build pipeline
    nlp = spacy.load(r'/Users/grace/opt/anaconda3/lib/python3.9/site-packages/en_core_web_sm/en_core_web_sm-2.3.1') #load build-in pipeline
    if "textcat" not in nlp.pipe_names: # check textcat componet is already available
        textcat = nlp.create_pipe(  #textcategorizer
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")# get_pipe() to assign it to a variable

    textcat.add_label("pos")# add the labels from data
    textcat.add_label("neg")

    
    # Train only textcat
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    with nlp.disable_pipes(training_excluded_pipes): #content manager
        #As we are only focusing on entity extraction, we will disable all other pipeline components to train our model for ner only using nlp.disable_pipes()
        optimizer = nlp.begin_training() #returns the initial optimizer function
        # Training loop
        print("Beginning training")
        print("Loss\tPrecision\tRecall\tF-scoret\tAccuracy")
        batch_sizes = compounding(  #create a generator
            4.0, 32.0, 1.001
        )  # A generator that yields infinite series of input numbers(batch_sizes)
        for i in range(iterations):
            print(f"Training iteration {i}")
            loss = {}
            random.shuffle(training_data)
            
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss) #update the weights of the underlying model
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data,
                )
                print(
                    f"{loss['textcat']}\t{evaluation_results['precision']}"
                    f"\t{evaluation_results['recall']}"
                    f"\t{evaluation_results['f-score']}"
                    f"\t{evaluation_results['accuracy']}"
                )

    # Save model
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")


def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
    reviews, labels = zip(*test_data)
    reviews = (tokenizer(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8  # Can't be 0 because of presence in denominator
    true_negatives = 0
    false_negatives = 1e-8
    for i, review in enumerate(textcat.pipe(reviews)):
        true_label = labels[i]["cats"]
        for predicted_label, score in review.cats.items():
            # Every cats dictionary includes both labels, you can get all
            # the info you need with just the pos label
            if predicted_label == "neg":
                continue
            if score >= 0.5 and true_label["pos"]:
                true_positives += 1
            elif score >= 0.5 and true_label["neg"]:
                false_positives += 1
            elif score < 0.5 and true_label["neg"]:
                true_negatives += 1
            elif score < 0.5 and true_label["pos"]:
                false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    accuracy = (true_positives+true_negatives)/(true_positives + false_negatives+false_positives + true_negatives)

    if precision + recall == 0:
        f_score = 0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score, "accuracy": accuracy}


def test_model(loaded_model, input_data: str = TEST_REVIEW):
    #  Load saved trained model
    #loaded_model = spacy.load("model_artifacts")
    # Generate prediction
    parsed_text = loaded_model(input_data)

    output_labels = []
    # Determine prediction to return
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        #prediction = "Positive"
        #score = parsed_text.cats["pos"]
        output_labels.append(1)
    else:
        #prediction = "Negative"
        #score = parsed_text.cats["neg"]
        output_labels.append(0)
    #print(
        #f"Review text: {input_data}\nPredicted sentiment: {prediction}"
        #f"\tScore: {score}"
    #)

    return output_labels


    #constrcut the directory structure of data, look for and open text files, then appende a tuple of 
    #the contents and a label dictionary to the reviews list.
def load_training_data(
    data_directory: str = "/Users/grace/Desktop/aclImdb/train", split: float = 0.8, limit: int = 0
) -> tuple:
    # Load from files
    reviews = []
    for label in ["pos", "neg"]:
        labeled_directory = f"{data_directory}/{label}"
        for review in os.listdir(labeled_directory):
            if review.endswith(".txt"):
                with open(f"{labeled_directory}/{review}") as f:
                    text = f.read()
                    text = text.replace("<br />", "\n\n") #replace html tags with newlines
                    if text.strip(): #remove all leading and tailing whitespace
                        spacy_label = {
                            "cats": {
                                "pos": "pos" == label,
                                "neg": "neg" == label,
                            }
                        }
                        reviews.append((text, spacy_label))
   
    # shuffle data to eliminate any possible bias from the order in which training data is loaded.
    random.shuffle(reviews) 

    if limit:
        reviews = reviews[:limit]
    split = int(len(reviews) * split)
    return reviews[:split], reviews[split:] #convert the split to a number of items that define the split boundary



In [31]:
def label_training_data(dataset):
    training_data = []
    training_labels = []
    
    for text, spacy_label in dataset:
        training_data.append(text)
        categories = spacy_label['cats']
        if categories['pos'] == True:
            training_labels.append(1)
        else:
            training_labels.append(0)
    
    return training_data, training_labels

In [76]:
train, test = load_training_data(limit=20000)
validation = train[12000:]
train = train[:12000]
training_texts, training_labels = label_training_data(train)
test_texts, test_labels = label_training_data(test)
print('Training dataset: ', len(training_texts))
print('Test dataset: ', len(test_texts))
print('Validation dataset: ', len(validation))


Training dataset:  12000
Test dataset:  4000
Validation dataset:  4000


In [34]:
print(training_texts[:2])
print(training_labels[:10])

["Allow yourself to be transported to a different, old school kind of storytelling. Scoop is classic Woody Allen.\n\n\n\nAllen's latest muse, Scarlett Johansson (who also appeared in last year's Match Point, also by Allen), is surprisingly able to tone down her sultry sex kitten appeal and transform into a normal looking student-type with the aid of nerdish glasses and outfits but still fails to make the audience believe how Hugh Jackman's lordly character can be so smitten by her, given the royal's background (don't worry, no spoilers here). There are no grand transformations for Johansson's character here, as she consistently plays the same character throughout despite the script saying otherwise. You even forgive her character's apparent lack of logic, continuing an affair with a suspected serial killer, simply because he is His Royal Hotness Jackman, who is refreshing to see sans the Wolverine duds.\n\n\n\nIf anything, consistency is what the 70-year old Allen is all about. He cont

In [89]:
from sklearn import metrics 

def compute_test_set(loaded_model, test_texts, test_labels):
    predicted_labels = []

    for text in test_texts:
        predicted_labels += test_model(loaded_model, text)


    print(metrics.classification_report(test_labels, predicted_labels))

    

In [108]:

#train, test = load_training_data(limit=20000)
print("Training model") 
train_model_CNN(train[:6000], validation[:6000])
df = pd.DataFrame(eval_list)
pd.DataFrame.plot(df)


Training model
Beginning training
Loss	Precision	Recall	F-scoret	Accuracy
Training iteration 0
15.651854295749217	0.8407851690248594	0.770229770225923	0.8039624608925758	0.8119999999959401
Training iteration 1
0.7328757226350717	0.8502109704596509	0.8051948051907832	0.8270908157987322	0.8314999999958426
Training iteration 2
0.1599758323172864	0.8516397709481955	0.8171828171787354	0.8340555697128013	0.8372499999958138
Training iteration 3
0.057379591518838424	0.8528025144009806	0.8131868131827513	0.8325236512358347	0.8362499999958188
Training iteration 4
0.022719194933415565	0.8559411146116933	0.8131868131827513	0.8340163934383503	0.8379999999958101
Training iteration 5
0.01671620870888546	0.860576923072326	0.8046953046912853	0.8316985028351487	0.8369999999958151
Training iteration 6
0.013931921963830973	0.8644158628035133	0.8056943056902813	0.8340227507712823	0.8394999999958025
Training iteration 7
0.01676963727572911	0.8649225840850778	0.8091908091867673	0.8361290322537491	0.841249999

<pandas.plotting._core.PlotAccessor object at 0x7fe7c9686f40>

In [109]:
loaded_model = spacy.load("model_artifacts")
print("Testing model")
compute_test_set(loaded_model, test_texts, test_labels)

Testing model
              precision    recall  f1-score   support

           0       0.83      0.86      0.84      1998
           1       0.85      0.83      0.84      2002

    accuracy                           0.84      4000
   macro avg       0.84      0.84      0.84      4000
weighted avg       0.84      0.84      0.84      4000



In [117]:
df = pd.DataFrame(eval_list)
print(pd.DataFrame.plot(df))


<pandas.plotting._core.PlotAccessor object at 0x7fe80587ca30>


### Model 2: Naive Bayese

In [99]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import math

In [100]:
from sklearn.pipeline import Pipeline
NB = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])

In [101]:
# train the model 
NB.fit(training_texts, training_labels)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [102]:
predicted_NB = NB.predict(test_texts) 

In [103]:
# scikit-learn provides further utilities for more detailed performance analysis
from sklearn import metrics
print(metrics.classification_report(test_labels, predicted_NB))

metrics.confusion_matrix(test_labels, predicted_NB)

              precision    recall  f1-score   support

           0       0.82      0.89      0.86      1998
           1       0.88      0.81      0.84      2002

    accuracy                           0.85      4000
   macro avg       0.85      0.85      0.85      4000
weighted avg       0.85      0.85      0.85      4000



array([[1776,  222],
       [ 380, 1622]])

### Model 3: Support Vector machine

In [110]:
#linear support vector machine (SVM),  a bit slower than naïve Bayes
from sklearn.linear_model import SGDClassifier
SV = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),
 ])

SV.fit(training_texts, training_labels)
predicted_SV = SV.predict(test_texts)
np.mean(predicted_SV == test_labels)

0.853

In [111]:
predicted_SV = SV.predict(test_texts) 

In [112]:
# scikit-learn provides further utilities for more detailed performance analysis
from sklearn import metrics
print(metrics.classification_report(test_labels, predicted_SV))

metrics.confusion_matrix(test_labels, predicted_SV)

              precision    recall  f1-score   support

           0       0.88      0.81      0.85      1998
           1       0.83      0.89      0.86      2002

    accuracy                           0.85      4000
   macro avg       0.86      0.85      0.85      4000
weighted avg       0.86      0.85      0.85      4000



array([[1622,  376],
       [ 212, 1790]])

### Model 4: Random Forest

In [119]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer

In [120]:
RF = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_estimators=20, 
                                   criterion = 'entropy',
                                   max_depth=50, 
                                   min_samples_leaf=4,
                                   min_samples_split=3))])
RF.fit(training_texts, training_labels)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 RandomForestClassifier(criterion='entropy', max_depth=50,
                                        min_samples_leaf=4, min_samples_split=3,
                                        n_estimators=20))])

In [121]:
preds_RF = RF.predict(test_texts)
print(metrics.classification_report(test_labels, preds_RF))

              precision    recall  f1-score   support

           0       0.81      0.79      0.80      1998
           1       0.80      0.81      0.80      2002

    accuracy                           0.80      4000
   macro avg       0.80      0.80      0.80      4000
weighted avg       0.80      0.80      0.80      4000

