# Model Prototype

## Imports

In [10]:
import os
import re

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow.keras import layers

print("Version: ", tf.__version__)
print("GPU is ","available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE")

Version:  2.4.1
GPU is  available


## Data Ingestion

In [11]:
parent_dir = os.path.dirname(os.getcwd())
print(parent_dir)
CSV_dir = os.path.join(parent_dir, 'ml/data/data.csv')
print(CSV_dir)

/text-classifier/Text-Classifer
/text-classifier/Text-Classifer/ml/data/data.csv


In [12]:
df = pd.read_csv(CSV_dir)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


## Data Analysis

In [13]:
print(df['review'][20])

After the success of Die Hard and it's sequels it's no surprise really that in the 1990s, a glut of 'Die Hard on a .....' movies cashed in on the wrong guy, wrong place, wrong time concept. That is what they did with Cliffhanger, Die Hard on a mountain just in time to rescue Sly 'Stop or My Mom Will Shoot' Stallone's career.<br /><br />Cliffhanger is one big nit-pickers dream, especially to those who are expert at mountain climbing, base-jumping, aviation, facial expressions, acting skills. All in all it's full of excuses to dismiss the film as one overblown pile of junk. Stallone even managed to get out-acted by a horse! However, if you an forget all the nonsense, it's actually a very lovable and undeniably entertaining romp that delivers as plenty of thrills, and unintentionally, plenty of laughs.<br /><br />You've got to love John Lithgows sneery evilness, his tick every box band of baddies, and best of all, the permanently harassed and hapless 'turncoat' agent, Rex Linn as Travers.

In [14]:
print(df['sentiment'][20])

1


## Data Preprocessing

#### Remove HTML Tags

In [15]:
def filter_html(row):
    f = re.compile(r'<.*?>')
    f = f.sub('', row['review'])
    f = f.replace("\'", "")
    return f

In [16]:
df['review'] = df.apply(filter_html, axis=1)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically theres a family where a little boy (...,0
4,"Petter Matteis ""Love in the Time of Money"" is ...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,Im going to have to disagree with the previous...,0


In [17]:
print(df['review'][0])

One of the other reviewers has mentioned that after watching just 1 Oz episode youll be hooked. They are right, as this is exactly what happened with me.The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.I would say the main appeal of the show is due to the fact that it goes where other shows woul

#### Train Test Split 

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], test_size=0.30, random_state=42
)

In [19]:
X_train.head()

38094    As much as I love trains, I couldnt stomach th...
40624    This was a very good PPV, but like Wrestlemani...
49425    Not finding the right words is everybodys prob...
35734    Im really suprised this movie didnt get a high...
41708    Ill start by confessing that I tend to really ...
Name: review, dtype: object

#### Convert to numpy array

In [20]:
X_train_n = X_train.to_numpy()

In [21]:
X_train_n

array(['As much as I love trains, I couldnt stomach this movie. The premise that one could steal a locomotive and "drive" from Arkansas to Chicago without hitting another train along the way has to be right up there on the Impossible Plot lines hit board. Imagine two disgruntled NASA employees stealing the "crawler" that totes the shuttles to and fro and driving it to New York and you get the idea.Having said all that, its a nice try. Wilford Brimely is at his Quaker Oats best, and Levon Helm turns a good performance as his dimwitted but well-meaning sidekick. Bob Balaban is suitably wormy as the Corporate Guy, and the "little guy takes on Goliath" story gets another airing.',
       'This was a very good PPV, but like Wrestlemania XX some 14 years later, the WWE crammed so many matches on it, some of the matches were useless. Im not going to go through every match on the card because it would take forever to do.However major highlights included the HUGE pop for Demolition winning the 

In [22]:
X_test_n = X_test.to_numpy()
y_train_n = y_train.to_numpy()
y_test_n = y_test.to_numpy()

#### Text Vectorization 

In [61]:
MAX_FEATURES = 10000
MAX_LEN = 200
EMBEDDING_DIMS = 2

In [62]:
vectorize_layer = TextVectorization(
    max_tokens = MAX_FEATURES,
    output_mode = 'int',
    output_sequence_length = MAX_LEN,
)
vectorize_layer

<tensorflow.python.keras.layers.preprocessing.text_vectorization.TextVectorization at 0x7f64002763c8>

In [63]:
vectorize_layer.adapt(X_train_n)

#### Embedding 

In [64]:
embedding_layer = Embedding(
    MAX_FEATURES,
    32,
    input_length=MAX_LEN,
)

In [81]:
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,),dtype=tf.string),
    vectorize_layer,
    embedding_layer,
    layers.Flatten(),
#     layers.Dense(1, activation='sigmoid')
])

In [82]:
# model = tf.keras.models.Sequential()
# model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
# model.add(vectorize_layer)

In [83]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_2 (TextVe (None, 200)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 200, 32)           320000    
_________________________________________________________________
flatten (Flatten)            (None, 6400)              0         
Total params: 320,000
Trainable params: 320,000
Non-trainable params: 0
_________________________________________________________________


In [84]:
processed_X = model.predict(X_test_n)

In [85]:
processed_X.shape

(15000, 6400)

In [86]:
processed_X[0]

array([ 0.03715139, -0.03951082,  0.04351897, ..., -0.01247861,
        0.03959943,  0.02102547], dtype=float32)

## Data Analysis 

In [87]:
processed_X_df = pd.DataFrame(processed_X)
processed_X_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6390,6391,6392,6393,6394,6395,6396,6397,6398,6399
0,0.037151,-0.039511,0.043519,-0.012676,-0.049001,0.048414,-0.027672,-0.023931,0.027512,-0.047284,...,0.020752,0.011692,0.042319,-0.013472,0.011908,-0.004481,0.029991,-0.012479,0.039599,0.021025
1,-0.042028,-0.020603,-0.01428,-0.004191,-0.001042,0.017406,-0.045665,0.016261,-0.027302,0.041986,...,0.031065,0.03229,0.028777,0.037715,0.030861,0.02279,0.01152,-0.003613,0.03532,-0.001542
2,-0.044107,0.032542,0.042482,-0.028165,0.032288,0.027521,0.049152,0.039995,0.048789,-0.011609,...,0.000816,-0.036514,0.001668,0.022847,0.022369,0.015303,-0.040903,0.023043,-0.03442,-0.029275
3,0.004317,-0.02233,0.00358,0.025369,-0.023975,-0.001051,0.025758,0.016449,-0.022952,-0.042192,...,0.000816,-0.036514,0.001668,0.022847,0.022369,0.015303,-0.040903,0.023043,-0.03442,-0.029275
4,-0.002559,0.019066,0.038694,0.007921,-0.025409,-0.022586,-0.043493,-0.037645,0.045353,0.017482,...,-0.040848,0.036653,-0.031116,0.002187,-0.049533,0.015519,-0.038246,-0.040085,0.031107,0.04145


In [88]:
nonzeros = processed_X_df.astype(bool).sum(axis=1)

In [89]:
nonzeros.mean()

6400.0