In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.python.keras import models, layers, optimizers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re
from google.colab import drive
#drive.mount('/content/gdrive')

%matplotlib inline


import os
# Folder that holds the compressed review files, relative to the working directory.
# NOTE(review): the drive.mount(...) call above is commented out, so this path
# presumably only resolves once Google Drive has been mounted -- confirm.
dataset='gdrive/My Drive/Colab Notebooks/dataset'

# Any results you write to the current directory are saved as output.

In [2]:
def get_labels_and_texts(file):
    """Read a bz2-compressed review file into labels and raw texts.

    Every non-blank line is decoded as UTF-8. The character at index 9 is
    parsed as an integer label and shifted down by one, and everything from
    index 10 onwards (stripped of surrounding whitespace) is kept as the text.
    NOTE(review): this layout presumably corresponds to a ``__label__N ``
    prefix on each line -- confirm against the dataset.

    Args:
        file: Path (or file object) accepted by ``bz2.BZ2File``.

    Returns:
        A tuple ``(labels, texts)`` where ``labels`` is a NumPy array of the
        shifted integer labels and ``texts`` is a list of strings in file order.

    Raises:
        ValueError: If the character at index 9 of a line is not a digit.
        IndexError: If a non-blank line is shorter than 10 characters.
    """
    labels = []
    texts = []
    # The context manager guarantees the compressed stream is closed, even if
    # decoding or label parsing fails part-way through the file.
    with bz2.BZ2File(file) as stream:
        for raw_line in stream:
            line = raw_line.decode("utf-8")
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # example; skip them instead of failing on line[9].
            if not line.strip():
                continue
            labels.append(int(line[9]) - 1)
            texts.append(line[10:].strip())
    return np.array(labels), texts
# Load the training and test splits from the dataset folder.
# NOTE(review): the recorded run of this cell ended in FileNotFoundError, so the
# archives were not found at this path when the notebook was last executed.
train_labels, train_texts = get_labels_and_texts(dataset+'/train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts(dataset+'/test.ft.txt.bz2')

FileNotFoundError: ignored

In [3]:
import re
# Matches every non-word character (punctuation, whitespace, symbols).
NON_ALPHANUM = re.compile(r'[\W]')
# Matches anything that is not a lowercase ASCII letter, a digit or whitespace.
# The digit range is 0-9; the earlier 0-1 range silently deleted the digits 2-9.
NON_ASCII = re.compile(r'[^a-z0-9\s]')
def normalize_texts(texts):
    """Lower-case and strip each text down to ASCII letters, digits and spaces.

    For every input string: lower-case it, replace each non-word character
    with a single space, then delete whatever is left that is not ``a-z``,
    ``0-9`` or whitespace (for example underscores and accented letters).

    Args:
        texts: Iterable of strings.

    Returns:
        A new list with one normalised string per input, in the same order.
        An empty input yields an empty list.
    """
    return [
        NON_ASCII.sub(r'', NON_ALPHANUM.sub(r' ', text.lower()))
        for text in texts
    ]
        
# Normalise both splits; the names are rebound to the cleaned lists.
# NOTE(review): the recorded run raised NameError, presumably because the
# loading cell had already failed and train_texts was never defined.
train_texts = normalize_texts(train_texts)
test_texts = normalize_texts(test_texts)

NameError: ignored

In [0]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training examples as a validation split; random_state is
# fixed so the same split is produced on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, random_state=57643892, test_size=0.2)

In [0]:
# Vocabulary cap: passed to the tokenizer as num_words and used by the model
# builders as the Embedding input size.
MAX_FEATURES = 10000

# Fit the vocabulary on the training texts only.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train_texts)

# Disabled alternative: load a previously pickled tokenizer from Drive instead
# of fitting a new one.
#import pickle
#file= open('gdrive/My Drive/Colab Notebooks/tokenizer.pkl','rb')
#tokenizer=pickle.load(file)
#file.close()
# Convert every split to integer sequences. The names are rebound, so re-running
# this cell would pass sequences (not strings) back into the tokenizer.
train_texts = tokenizer.texts_to_sequences(train_texts)
val_texts = tokenizer.texts_to_sequences(val_texts)
test_texts = tokenizer.texts_to_sequences(test_texts)


In [0]:
# Sequence length used for all splits: the longest tokenised training example.
MAX_LENGTH = max(len(train_ex) for train_ex in train_texts)
# Pad every split to that length.
# NOTE(review): validation/test sequences longer than MAX_LENGTH are presumably
# truncated by pad_sequences' defaults -- confirm that is acceptable.
train_texts = pad_sequences(train_texts, maxlen=MAX_LENGTH)
val_texts = pad_sequences(val_texts, maxlen=MAX_LENGTH)
test_texts = pad_sequences(test_texts, maxlen=MAX_LENGTH)
print(MAX_LENGTH)


254


In [0]:
def build_lstm_model():
    """Assemble and compile the stacked-LSTM binary classifier.

    Layer order: Input of length ``MAX_LENGTH`` -> Embedding
    (``MAX_FEATURES`` x 64) -> LSTM(128, returning the full sequence) ->
    CuDNNLSTM(128) -> Dense(32, relu) -> Dense(100, relu) ->
    Dense(1, sigmoid).

    Returns:
        A ``models.Model`` compiled with the ``rmsprop`` optimizer,
        ``binary_crossentropy`` loss and the ``binary_accuracy`` metric.
    """
    token_ids = layers.Input(shape=(MAX_LENGTH,))
    hidden = layers.Embedding(MAX_FEATURES, 64)(token_ids)

    # Two recurrent layers: the first emits one vector per time step so the
    # second has a sequence to consume.
    hidden = layers.LSTM(128, return_sequences=True)(hidden)
    hidden = layers.CuDNNLSTM(128)(hidden)

    # Fully connected head: 32 units, then 100 units, both relu.
    for units in (32, 100):
        hidden = layers.Dense(units, activation='relu')(hidden)
    probability = layers.Dense(1, activation='sigmoid')(hidden)

    classifier = models.Model(inputs=token_ids, outputs=probability)
    classifier.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['binary_accuracy'])
    return classifier
    
# Instantiate the stacked-LSTM classifier (untrained at this point).
lstm_model = build_lstm_model()

In [0]:
# Train the LSTM classifier for 10 epochs in batches of 1000 examples,
# evaluating on the held-out validation split during training.
lstm_model.fit(
    train_texts, 
    train_labels, 
    batch_size=1000,
    epochs=10,
    validation_data=(val_texts, val_labels), )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f321865a6d8>

In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials

In [0]:
# Authenticate the Colab user and give PyDrive the application-default
# credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, shadowing the `drive` module imported from
# google.colab in the first cell; later cells use this PyDrive client.
drive = GoogleDrive(gauth)

In [0]:
# Save the trained LSTM model to a local file, then upload that file to
# Google Drive through the PyDrive client.
lstm_model.save("lstm.h5")
model_file = drive.CreateFile({'title' : 'lstm.h5'})
model_file.SetContentFile('lstm.h5')
model_file.Upload()
# Last expression of the cell: shows a Drive file object built from the
# uploaded file's id.
drive.CreateFile({'id': model_file.get('id')})


GoogleDriveFile({'id': '1XXYX5NZlKwgQTU4UcYJY2xZnAU7MoWiX'})

In [0]:
# Score the LSTM classifier on the test split.
preds = lstm_model.predict(test_texts)
# Accuracy and F1 use hard labels obtained by thresholding the sigmoid output
# at 0.5; ROC AUC is computed from the raw predicted scores.
print('Accuracy score: {:0.4}'.format(accuracy_score(test_labels, 1 * (preds > 0.5))))
print('F1 score: {:0.4}'.format(f1_score(test_labels, 1 * (preds > 0.5))))
print('ROC AUC score: {:0.4}'.format(roc_auc_score(test_labels, preds)))

Accuracy score: 0.9544
F1 score: 0.9546
ROC AUC score: 0.9893


In [0]:
def build_model():
    """Assemble and compile the 1-D convolutional binary classifier.

    Layer order: Input of length ``MAX_LENGTH`` -> Embedding
    (``MAX_FEATURES`` x 64) -> [Conv1D(64, k=3) -> BatchNormalization ->
    MaxPool1D(3)] -> [Conv1D(64, k=5) -> BatchNormalization ->
    MaxPool1D(5)] -> Conv1D(64, k=5) -> GlobalMaxPool1D -> Flatten ->
    Dense(100, relu) -> Dense(1, sigmoid).

    Returns:
        A ``models.Model`` compiled with the ``rmsprop`` optimizer,
        ``binary_crossentropy`` loss and the ``binary_accuracy`` metric.
    """
    token_ids = layers.Input(shape=(MAX_LENGTH,))
    net = layers.Embedding(MAX_FEATURES, 64)(token_ids)

    # Two conv -> batch-norm -> max-pool stages; each pair is
    # (kernel size, pool size) for one stage.
    for kernel_size, pool_size in ((3, 3), (5, 5)):
        net = layers.Conv1D(64, kernel_size, activation='relu')(net)
        net = layers.BatchNormalization()(net)
        net = layers.MaxPool1D(pool_size)(net)

    # Final convolution, collapsed over the sequence axis by a global max.
    net = layers.Conv1D(64, 5, activation='relu')(net)
    net = layers.GlobalMaxPool1D()(net)
    # NOTE(review): Flatten directly after a global pooling layer looks
    # redundant -- kept so the layer stack is unchanged; confirm.
    net = layers.Flatten()(net)

    net = layers.Dense(100, activation='relu')(net)
    probability = layers.Dense(1, activation='sigmoid')(net)

    classifier = models.Model(inputs=token_ids, outputs=probability)
    classifier.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['binary_accuracy'])
    return classifier
    
# Instantiate the convolutional classifier (untrained at this point).
model = build_model()

In [0]:
# Train the convolutional classifier for 10 epochs in batches of 1000 examples,
# evaluating on the held-out validation split during training.
model.fit(
    train_texts, 
    train_labels, 
    batch_size=1000,
    epochs=10,
    validation_data=(val_texts, val_labels), verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f31f88d8ac8>

In [0]:
# Save the trained convolutional model to a local file, then upload that file
# to Google Drive through the PyDrive client.
model.save("cnn.h5")
model_file = drive.CreateFile({'title' : 'cnn.h5'})
model_file.SetContentFile('cnn.h5')
model_file.Upload()
# Last expression of the cell: shows a Drive file object built from the
# uploaded file's id.
drive.CreateFile({'id': model_file.get('id')})


GoogleDriveFile({'id': '1po_5C2K0JuskhYG_l_ro5X96SOob9ar5'})

In [0]:
# Score the convolutional classifier on the test split.
preds = model.predict(test_texts)
# Accuracy and F1 use hard labels obtained by thresholding the sigmoid output
# at 0.5; ROC AUC is computed from the raw predicted scores.
print('Accuracy score: {:0.4}'.format(accuracy_score(test_labels, 1 * (preds > 0.5))))
print('F1 score: {:0.4}'.format(f1_score(test_labels, 1 * (preds > 0.5))))
print('ROC AUC score: {:0.4}'.format(roc_auc_score(test_labels, preds)))

Accuracy score: 0.9471
F1 score: 0.9478
ROC AUC score: 0.9866


In [0]:
def build_rnn_model():
    """Assemble and compile the stacked-GRU binary classifier.

    Layer order: Input of length ``MAX_LENGTH`` -> Embedding
    (``MAX_FEATURES`` x 64) -> CuDNNGRU(128, returning the full sequence) ->
    CuDNNGRU(128) -> Dense(32, relu) -> Dense(100, relu) ->
    Dense(1, sigmoid).

    Returns:
        A ``models.Model`` compiled with the ``rmsprop`` optimizer,
        ``binary_crossentropy`` loss and the ``binary_accuracy`` metric.
    """
    token_ids = layers.Input(shape=(MAX_LENGTH,))
    hidden = layers.Embedding(MAX_FEATURES, 64)(token_ids)

    # Two GRU layers: the first emits one vector per time step so the second
    # has a sequence to consume.
    hidden = layers.CuDNNGRU(128, return_sequences=True)(hidden)
    hidden = layers.CuDNNGRU(128)(hidden)

    # Fully connected head: 32 units, then 100 units, both relu.
    for units in (32, 100):
        hidden = layers.Dense(units, activation='relu')(hidden)
    probability = layers.Dense(1, activation='sigmoid')(hidden)

    classifier = models.Model(inputs=token_ids, outputs=probability)
    classifier.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['binary_accuracy'])
    return classifier
    
# Instantiate the stacked-GRU classifier (untrained at this point).
rnn_model = build_rnn_model()

In [0]:
import pickle
from google.colab import files

# Persist the fitted tokenizer to Drive so it can be reloaded without refitting.
# The context manager closes the file even if pickling raises part-way through.
with open('gdrive/My Drive/Colab Notebooks/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

# Smoke test: push one hand-written review through the same tokenise-and-pad
# steps as the training data and display the LSTM model's predicted score.
f=['pleasant experience i will buy one more good']
f=tokenizer.texts_to_sequences(f)
f=pad_sequences(f,maxlen=MAX_LENGTH)
lstm_model.predict(f)


array([[0.92998284]], dtype=float32)

In [0]:
# Train the GRU classifier for 10 epochs in batches of 1500 examples,
# evaluating on the held-out validation split during training.
rnn_model.fit(
    train_texts, 
    train_labels, 
    batch_size=1500,
    epochs=10,
    validation_data=(val_texts, val_labels), )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f31f3f08908>

In [0]:
# Save the trained GRU model to a local file, then upload that file to Google
# Drive through the PyDrive client.
# NOTE(review): the recorded run of this cell raised InvalidConfigError, so the
# upload did not complete when the notebook was last executed.
rnn_model.save("rnn.h5")
model_file = drive.CreateFile({'title' : 'rnn.h5'})
model_file.SetContentFile('rnn.h5')
model_file.Upload()
# Last expression of the cell: shows a Drive file object built from the
# uploaded file's id.
drive.CreateFile({'id': model_file.get('id')})


InvalidConfigError: ignored

In [0]:
# Score the GRU classifier on the test split.
preds = rnn_model.predict(test_texts)
# Accuracy and F1 use hard labels obtained by thresholding the sigmoid output
# at 0.5; ROC AUC is computed from the raw predicted scores.
print('Accuracy score: {:0.4}'.format(accuracy_score(test_labels, 1 * (preds > 0.5))))
print('F1 score: {:0.4}'.format(f1_score(test_labels, 1 * (preds > 0.5))))
print('ROC AUC score: {:0.4}'.format(roc_auc_score(test_labels, preds)))

Accuracy score: 0.9531
F1 score: 0.9533
ROC AUC score: 0.9891


In [0]:
def build_bilstm_model():
    """Assemble and compile the bidirectional-LSTM binary classifier.

    Layer order: Input of length ``MAX_LENGTH`` -> Embedding
    (``MAX_FEATURES`` x 64) -> LSTM(64, returning the full sequence) ->
    Dropout(0.2) -> Bidirectional(CuDNNLSTM(128, full sequence)) ->
    Dropout(0.2) -> Bidirectional(CuDNNLSTM(128, last step only)) ->
    Dense(32, relu) -> Dense(100, relu) -> Dense(1, sigmoid).

    Returns:
        A ``models.Model`` compiled with the ``rmsprop`` optimizer,
        ``binary_crossentropy`` loss and the ``binary_accuracy`` metric.
    """
    token_ids = layers.Input(shape=(MAX_LENGTH,))
    hidden = layers.Embedding(MAX_FEATURES, 64)(token_ids)
    hidden = layers.LSTM(64, return_sequences=True)(hidden)

    # Two dropout -> bidirectional stages; only the first keeps the full
    # sequence, the second reduces it to a single vector.
    for keep_sequence in (True, False):
        hidden = layers.Dropout(0.2)(hidden)
        recurrent = layers.CuDNNLSTM(128, return_sequences=keep_sequence)
        hidden = layers.Bidirectional(recurrent)(hidden)

    # Fully connected head: 32 units, then 100 units, both relu.
    for units in (32, 100):
        hidden = layers.Dense(units, activation='relu')(hidden)
    probability = layers.Dense(1, activation='sigmoid')(hidden)

    classifier = models.Model(inputs=token_ids, outputs=probability)
    classifier.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['binary_accuracy'])
    return classifier

In [0]:
# Instantiate the bidirectional-LSTM classifier (untrained at this point).
bilstmmodel = build_bilstm_model()

In [0]:
import tensorflow.compat.v1 as tf
# NOTE(review): v2 behaviour is switched off here, after bilstmmodel was
# already built in the previous cell -- confirm this ordering is intended.
tf.disable_v2_behavior()
# Train the bidirectional-LSTM classifier for 10 epochs in batches of 1500
# examples, evaluating on the held-out validation split during training.
bilstmmodel.fit(
    train_texts, 
    train_labels, 
    batch_size=1500,
    epochs=10,
    validation_data=(val_texts, val_labels), )

In [0]:
# Save the trained bidirectional-LSTM model to a local file, then upload that
# file to Google Drive through the PyDrive client.
bilstmmodel.save("bilstm.h5")
# NOTE(review): `files` is imported here but not used in this cell.
from google.colab import files
model_file = drive.CreateFile({'title' : 'bilstm.h5'})
model_file.SetContentFile('bilstm.h5')
model_file.Upload()
# Last expression of the cell: shows a Drive file object built from the
# uploaded file's id.
drive.CreateFile({'id': model_file.get('id')})
