In [77]:
import pathlib
# Absolute path of the directory the kernel is currently running in.
CURRENT_DIR = pathlib.Path().resolve()
# Folder that receives the exported tokenizer, metadata and trained model.
EXPORTS_DIR = CURRENT_DIR / 'reviews_dataset' / 'exports'
# Create the export folder up front: Path.write_text() and open(..., 'wb') do
# not create missing parent directories, so the export cells would otherwise
# fail on a fresh checkout. exist_ok keeps the cell safe to re-run.
EXPORTS_DIR.mkdir(parents=True, exist_ok=True)


In [6]:
import pandas as pd

In [7]:
# Load the labelled reviews dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs on the original author's machine. Consider pointing it
# at a location relative to the project once the layout is confirmed.
DATASET_CSV_PATH = r"C:\Users\hp\Desktop\blog\nbs\final_ds\df.csv"
df = pd.read_csv(DATASET_CSV_PATH)

In [10]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,review,rate,label
0,"A very, very, very slow-moving, aimless movie ...",0,negative
1,Not sure who was more lost - the flat characte...,0,negative
2,Attempting artiness with black & white and cle...,0,negative
3,Very little music or anything to speak of.,0,negative
4,The best scene in the movie was when Gerardo i...,1,positive


In [8]:
# Pull the two columns the model needs out of the DataFrame as plain lists.
texts = df['review'].to_list()   # raw review strings fed to the tokenizer
labels = df['label'].to_list()   # e.g. ['negative', 'positive', 'negative', ...]


In [9]:
# Mapping from the textual label to its integer class id.
label_legend = dict(positive=1, negative=0)
# Integer-encoded labels, in the same order as `labels`, e.g. [0, 0, 0, 1, ...]
labels_as_int = list(map(label_legend.__getitem__, labels))
# Reverse lookup keyed by the class id as a string, e.g. {'1': 'positive', '0': 'negative'}
label_legend_inverted = {str(class_id): name for name, class_id in label_legend.items()}



In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
# Vocabulary cap handed to the tokenizer and the width every sequence is padded to.
MAX_NUM_WORDS = 280
MAX_SEQ_LENGTH = 300

# Fit the tokenizer on the review texts, then encode each review as a list
# of integer word indices.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
seq = tokenizer.texts_to_sequences(texts)

# Turn the ragged lists into one fixed-width matrix (MAX_SEQ_LENGTH columns)
# and print it as a quick visual check.
X = pad_sequences(seq, maxlen=MAX_SEQ_LENGTH)
print(X)

[[  0   0   0 ...  46   3 251]
 [  0   0   0 ...   1   8  49]
 [  0   0   0 ... 121   2 273]
 ...
 [  0   0   0 ... 192 103  15]
 [  0   0   0 ...  65   5   1]
 [  0   0   0 ...   1 138 173]]


In [20]:
! pip install numpy



In [37]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [13]:
# One-hot encode the integer labels so they match the 2-unit softmax output
# trained with categorical_crossentropy further down.
label_as_int_array = np.asarray(labels_as_int)
# Pass num_classes explicitly: by default to_categorical infers the width from
# the largest label present, so a dataset that happens to contain only class 0
# would yield a single column and no longer match the model's output layer.
y = to_categorical(label_as_int_array, num_classes=len(label_legend))
print(y)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [2]:
! pip install -U scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.2.1-cp310-cp310-win_amd64.whl (8.3 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting scipy>=1.3.2
  Using cached scipy-1.10.0-cp310-cp310-win_amd64.whl (42.5 MB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.1 scipy-1.10.0 threadpoolctl-3.1.0


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 33% of the samples for evaluation; the fixed random_state makes
# the shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [16]:
import pickle

In [19]:
# Bundle the split arrays together with the preprocessing settings and the
# label mappings; this dictionary is what gets pickled in the export cell.
training_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    max_words=MAX_NUM_WORDS,
    max_seq=MAX_SEQ_LENGTH,
    label_legend=label_legend,
    label_legend_inverted=label_legend_inverted,
)


In [29]:
# Destinations for the two artefacts written by this cell.
METADATA_EXPORT_PATH = EXPORTS_DIR / 'metadata.pkl'
TOKENIZER_EXPORT_PATH = EXPORTS_DIR / 'tokenizer.json'

# Serialise the fitted tokenizer to JSON and write it out as text.
tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)

# Pickle the training bundle (split arrays, settings and label mappings).
with METADATA_EXPORT_PATH.open('wb') as f:
    pickle.dump(training_data, f)


In [34]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Model, Sequential

In [35]:
# Network hyper-parameters.
embed_dim = 128  # size of each learned token embedding vector
lstm_out = 196   # number of units in the LSTM layer

# Embedding -> SpatialDropout1D -> LSTM -> 2-unit softmax classifier.
model = Sequential()
# Vocabulary size matches the tokenizer's num_words; input_length is the
# padded sequence width taken from the training matrix.
model.add(Embedding(MAX_NUM_WORDS, embed_dim, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
# Two softmax outputs, one per column of the one-hot encoded targets.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
# summary() prints the layer table itself and returns None; calling it bare
# avoids an extra "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 128)          35840     
                                                                 
 spatial_dropout1d (SpatialD  (None, 300, 128)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 2)                 394       
                                                                 
Total params: 291,034
Trainable params: 291,034
Non-trainable params: 0
_________________________________________________________________
None


In [68]:
# Training schedule.
batch_size = 32
epochs = 8

# Train the network and report loss/accuracy after every epoch.
# NOTE(review): the held-out test split doubles as validation data here, so
# the reported validation metrics are not from an independent set.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_test, y_test),
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x1857f7cf160>

In [69]:
def predict(text_str, max_words=280, max_sequence=300, tokenizer=None):
    """Return the trained model's class probabilities for one piece of text.

    Parameters
    ----------
    text_str : str
        Raw text to classify.
    max_words : int, optional
        Not used by this function; kept only so existing callers that pass
        it keep working.
    max_sequence : int, optional
        Width the encoded text is padded/truncated to before it is fed to
        the model (passed to ``pad_sequences`` as ``maxlen``).
    tokenizer : optional
        Fitted tokenizer exposing ``texts_to_sequences``. When it is missing
        (or falsy) no prediction is attempted.

    Returns
    -------
    None
        If no tokenizer was supplied.
    otherwise
        The value of ``model.predict`` for the single-row batch. Column ``i``
        of that row is the score for ``label_legend_inverted[str(i)]``.

    Notes
    -----
    Relies on the module-level ``model`` and ``pad_sequences``.
    """
    if not tokenizer:
        return None
    # The tokenizer expects a list of texts, so wrap the single string.
    sequences = tokenizer.texts_to_sequences([text_str])
    x_input = pad_sequences(sequences, maxlen=max_sequence)
    return model.predict(x_input)

In [76]:
# Smoke-test the trained model on a short, clearly positive sentence.
predict(text_str="such a nice one", tokenizer=tokenizer)



array([[6.9657550e-04, 9.9930346e-01]], dtype=float32)

In [79]:
import json
# Lightweight, JSON-serialisable description of the preprocessing settings
# and label mappings (no arrays).
metadata = dict(
    labels_legend_inverted=label_legend_inverted,
    legend=label_legend,
    max_sequence=MAX_SEQ_LENGTH,
    max_words=MAX_NUM_WORDS,
)

# Rebinds METADATA_EXPORT_PATH, which an earlier cell pointed at the pickle file.
METADATA_EXPORT_PATH = EXPORTS_DIR / 'metadata.json'
metadata_json = json.dumps(metadata, indent=4)
# write_text returns the number of characters written, which the cell displays.
METADATA_EXPORT_PATH.write_text(metadata_json)

205

In [80]:
# Persist the trained network next to the other exported artefacts.
MODEL_EXPORT_PATH = EXPORTS_DIR.joinpath('model.h5')
# The path is converted to a plain string before being handed to model.save.
model_export_target = str(MODEL_EXPORT_PATH)
model.save(model_export_target)