# Project 2 Part 7

### Tasks
**Part 1**
- Define a filepaths dictionary and save it to config/filepaths.json  to include file paths for each component you will save (review below).
- Copy your best models from part 6 into the new notebook.
    - Note: update your code to define the final public-facing class labels.
- Saving the data
    - For the Machine Learning Model
        - Save training data: X_train, y_train
        - Save testing data: X_test, y_test
        - Save target_lookup dictionary and/or label encoder
        - Save best model
    - For Deep NLP Model
        - Save training data: X_train, y_train
        - Save testing data: X_test, y_test
        - Save best neural network
            - Note: use safe_format='tf' to save model in a folder of repo-friendly files
 
**Part 2**
- Create a streamlit app for getting predictions for a user-entered text from your loaded model
- (Optional but recommended); Include a Lime Text Explainer explanation for the prediction.
- Include the ability to load the training and test data to evaluate the model.

### Imports and Loading

In [1]:
# Imports

from pprint import pprint
import os, json, sys

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
import pandas as pd
import spacy
import nltk
import joblib

# Sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# Tensorflow
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import optimizers

# Random seed
tf.keras.utils.set_random_seed(42)
tf.random.set_seed(42)
np.random.seed(42)

# Configuration
from sklearn import set_config
set_config(transform_output='pandas')   

In [2]:
%load_ext autoreload
%autoreload 2
import custom_functions as fn

### Creating Directory

In [3]:
# Define a filepath dictionary
FPATHS = dict(
    
    # Data
    data={
        # Source data files
        "source": {
            "raw": 'Data-NLP/movie_reviews_v2.csv',
            "processed": "Data-NLP/reviews_processed.joblib",
        },
        # Machine Learning data files
        "ml": {
            "train": "Data-NLP/training-data.joblib",
            "test": "Data-NLP/testing-data.joblib",
            "target_lookup": "Data-NLP/target_lookup.joblib",
            "encoder": "Data-NLP/encoder.joblib",
        },
        # Tensorflow data files
        "tf": {
            "train": "Data-NLP/train_ds.joblib",
            "test": "Data-NLP/test_ds.joblib",
        },
    },
    
    # Models
    models={
        "random_forest": "models/random_forest/rf_reg.joblib",
        "GRU": "models/GRU/", # We haven't saved this yet
    },
    
    # Images/EDA
    images={
        "wordcloud": "images/movie_reviews_wordcloud.png",
    },
)
pprint(FPATHS)

{'data': {'ml': {'encoder': 'Data-NLP/encoder.joblib',
                 'target_lookup': 'Data-NLP/target_lookup.joblib',
                 'test': 'Data-NLP/testing-data.joblib',
                 'train': 'Data-NLP/training-data.joblib'},
          'source': {'processed': 'Data-NLP/reviews_processed.joblib',
                     'raw': 'Data-NLP/movie_reviews_v2.csv'},
          'tf': {'test': 'Data-NLP/test_ds.joblib',
                 'train': 'Data-NLP/train_ds.joblib'}},
 'images': {'wordcloud': 'images/movie_reviews_wordcloud.png'},
 'models': {'GRU': 'models/GRU/',
            'random_forest': 'models/random_forest/rf_reg.joblib'}}


In [4]:
# Save filepaths in a config folder
os.makedirs('config/', exist_ok = True)
FPATHS_FILE = 'config/filepaths.json'
with open(FPATHS_FILE, 'w') as f:
    json.dump(FPATHS, f)

In [5]:
# Function from "Creating a File Structure"
fn.create_directories_from_paths(FPATHS)

### Preprocessing and Saving Data - Machine Learning Model

In [6]:
# Load processed review dataframe 
df = joblib.load("Data-NLP/reviews_processed.joblib")
df.head(1)

Unnamed: 0,review_id,movie_id,imdb_id,original_title,review,rating,highlow_rating,tokenized,lemmatized,tokenized_joined,lemmatized_joined
1,57086ff5c3a3681d29001512,7443,tt0120630,Chicken Run,"A guilty pleasure for me personally, as I love...",9.0,high,"[guilty, pleasure, personally, love, great, es...","[guilty, pleasure, personally, love, great, es...",guilty pleasure personally love great escape w...,guilty pleasure personally love great escape w...


**Apply New Labels and TTS**

In [7]:
## Apply new labels for public-facing app

# Rename columns
df = df.rename(columns={'review': 'Review', 'highlow_rating': 'Rating'})
# Format ratings labels
rating_dict = {'low': 'Low Rating', 'high': 'High Rating'}

# Use original review column as X and classification target column as y
y = df['Rating'].replace(rating_dict)
X = df['Review']

In [8]:
#Train Test Split
X_train, X_test, y_train_str, y_test_str = train_test_split(X, y, random_state=42)
y_train_str.value_counts(normalize=True)

Low Rating     0.510474
High Rating    0.489526
Name: Rating, dtype: float64

**Obtain Encoder Labels**

In [9]:
# Getting unique classes to convert 
class_names = y_train_str.unique()

# Fit the label encoder on unique class names
encoder = LabelEncoder()
encoder.fit(class_names)

In [10]:
# Transform the y_train_str and y_test_str with encoder
y_train = encoder.transform(y_train_str)
y_test = encoder.transform(y_test_str)
y_train[:10]

array([1, 0, 0, 0, 1, 0, 0, 0, 1, 0])

In [11]:
# Extract class names from the encoder
classes = encoder.classes_

# Get the encoded values for each both classes
class_codes = encoder.transform(classes)

# Making lookup dictionary to find the encoded label's original name
target_lookup = dict(zip(class_codes, classes))
# Verify dictionary
target_lookup

{0: 'High Rating', 1: 'Low Rating'}

**Creating the Model from Part 6**

In [12]:
## Build text vectorizer and classification model
# Create a sklearn text vectorizer. Consider stopwords, punc, etc.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
# Create classification model
rf_clf = RandomForestClassifier()

In [13]:
# Create an sklearn modeling pipeline
rf_pipe = Pipeline([
    ('vectorizer', tfidf_vectorizer), 
    ('classifier', rf_clf),
])

# Fit and evaluate the model
rf_pipe.fit(X_train, y_train)

**Saving the Data and Model**

In [14]:
## Saving the data
# Save the training data
train_path = FPATHS['data']['ml']['train']
joblib.dump([X_train, y_train], train_path)

# Save the testing data
test_path = FPATHS['data']['ml']['test']
joblib.dump([X_test, y_test], test_path)

# Save the label encoder
encoder_path = FPATHS['data']['ml']['encoder']
joblib.dump(encoder, encoder_path);

# Save the target_lookup
lookup_path = FPATHS['data']['ml']['target_lookup']
joblib.dump(target_lookup, lookup_path);

# Save the model
model_path = FPATHS['models']['random_forest']
joblib.dump(rf_pipe, model_path);

### Preprocessing and Saving Data - Deep NLP Model

In [15]:
# Load training data using dictionary
[X_train, y_train] = joblib.load(FPATHS['data']['ml']['train'])
[X_test, y_test]  = joblib.load(FPATHS['data']['ml']['test'])

In [16]:
# Make a dataset object from train data
train_tf = tf.data.Dataset.from_tensor_slices((X_train, y_train))
# Make a dataset object from test data
test_tf = tf.data.Dataset.from_tensor_slices((X_test, y_test))

In [17]:
# Shuffle dataset
train_tf = train_tf.shuffle(buffer_size=len(train_tf),reshuffle_each_iteration=False)

In [18]:
# Set the ratio of the train, validation, test split
split_train = .75
split_val =  .25
# Calculate the number of samples for training and validation data 
n_train_samples =  int(len(train_tf) * split_train)
n_val_samples = int(len(train_tf) * split_val)
# Set the batch size
BATCH_SIZE =32
import math
# math.ceil will round up
# How many batches? 
n_train_batches = math.ceil(n_train_samples/BATCH_SIZE)
n_val_batches = math.ceil(n_val_samples/BATCH_SIZE)
print(f"    - train:\t{n_train_samples} samples \t({n_train_batches} batches)")
print(f"    - val:  \t{n_val_samples} samples \t({n_val_batches} batches)")

    - train:	1360 samples 	(43 batches)
    - val:  	453 samples 	(15 batches)


In [19]:
# Use take and skip to define each set
train_ds = train_tf.take(n_train_samples).batch(batch_size=BATCH_SIZE)
# Skip over the training batches and take the validation batches
val_ds = train_tf.skip(n_train_samples).take(n_val_samples).batch(batch_size=BATCH_SIZE)
# Confirm the number of batches in each
print (f' There are {len(train_ds)} training batches.')
print (f' There are {len(val_ds)} validation batches.')

 There are 43 training batches.
 There are 15 validation batches.


In [20]:
# Put the test data into batches also
test_ds = test_tf.batch(batch_size = BATCH_SIZE)
# How many batches
print (f' There are {len(test_ds)} testing batches.')

 There are 19 testing batches.


**Saving the datasets**

In [21]:
# Save training dataset object
train_ds_fpath = FPATHS['data']['tf']['train']
tf.data.Dataset.save(train_ds, train_ds_fpath)
# Save testing dataset object
test_ds_fpath = FPATHS['data']['tf']['test']
tf.data.Dataset.save(test_ds, test_ds_fpath)

In [22]:
# Define classes
encoder = joblib.load(FPATHS['data']['ml']['encoder'])
classes = encoder.classes_
classes

array(['High Rating', 'Low Rating'], dtype=object)

In [23]:
# Create keras text vect layer for RNN sequence model
SEQUENCE_LENGTH = 100
sequence_vectorizer = tf.keras.layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH
)

In [24]:
# Get just the text from ds_train
ds_texts = train_ds.map(lambda x, y: x)

# Train model
sequence_vectorizer.adapt(ds_texts)

In [25]:
# Getting list of vocab
vocab = sequence_vectorizer.get_vocabulary()
int_to_str = {idx: word for idx, word in enumerate(vocab)}

# Programmatically define size of vocab from vectorization layer
VOCAB_SIZE = sequence_vectorizer.vocabulary_size()

# Define classes variable
classes = y.unique()

In [26]:
# Build an RNN with textVector layer
def build_bidir_GRU(text_vectorization_layer):
    
    model = tf.keras.Sequential([
        text_vectorization_layer,
        layers.Embedding(input_dim=VOCAB_SIZE, 
                         output_dim=250, 
                         input_length=SEQUENCE_LENGTH)])

    # Two bidirectional GrU layers    
    model.add(layers.Bidirectional(layers.GRU(128, return_sequences=True)))
    model.add(layers.Dropout(.5))
    model.add(layers.Bidirectional(layers.GRU(64, return_sequences=True)))
    model.add(layers.Dropout(.5))
    model.add(layers.GlobalMaxPooling1D())
    
    # Output layer
    model.add(layers.Dense(len(classes), activation='sigmoid'))
    
    # Compile
    optimizer = optimizers.legacy.Adam()
    model.compile(optimizer=optimizer,  
                  loss='sparse_categorical_crossentropy', 
                  metrics=['accuracy'])
    model.summary()
    return model

In [27]:
# Include callbacks
def get_callbacks(patience=3, monitor='val_accuracy'):
    early_stop = tf.keras.callbacks.EarlyStopping(patience=patience, monitor=monitor)
    return [early_stop]

In [28]:
# Build model
model = build_bidir_GRU(sequence_vectorizer)

EPOCHS = 30
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
    callbacks=get_callbacks(),
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 100, 250)          5736000   
                                                                 
 bidirectional (Bidirectiona  (None, 100, 256)         291840    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 100, 256)          0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 100, 128)         123648    
 nal)                                                            
                                                        

In [29]:
# Save model
model_fpath = FPATHS['models']['GRU']
# tf.keras.models.save_model(model, model_fpath, save_format='tf')



INFO:tensorflow:Assets written to: models/GRU/assets


INFO:tensorflow:Assets written to: models/GRU/assets
