In [None]:
import re
from string import punctuation

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Conv1D, BatchNormalization, Activation
from keras.layers import Embedding, Input, Dense, Dropout, Lambda, MaxPooling1D
from keras.optimizers import RMSprop, SGD
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Upload the dataset file into the runtime (google.colab is only importable inside Colab).
# `uploaded` keeps whatever files.upload() returns for the chosen file(s).
from google.colab import files
uploaded = files.upload()

Saving Womens Clothing E-Commerce Reviews.csv to Womens Clothing E-Commerce Reviews.csv


In [None]:
# Load the CSV that the upload cell saved into the working directory.
# The notebook only uploads 'Womens Clothing E-Commerce Reviews.csv', and the
# later cells read its 'Review Text' / 'Rating' columns, so read that file
# instead of an unrelated name that would not exist on a fresh runtime.
dataset = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
dataset.head(2)

In [None]:
# Drop rows with any missing value and renumber the rows 0..n-1.
# drop=True discards the old index instead of inserting it as an extra
# 'index' column, and rebinding (rather than mutating in place) keeps the
# cell safe to re-run.
dataset = dataset.dropna().reset_index(drop=True)

In [None]:
# Per-column count of missing values still present in the frame.
dataset.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['Review Text'],
    dataset['Rating'],
    test_size=0.33,
    random_state=42,
)

In [None]:
# Characters the encoder recognises; get_char_dict assigns them ids starting at 1.
# NOTE(review): the earlier note here claimed len(ALPHABET)=68, but the literal
# below appears to contain 67 characters — confirm with len(ALPHABET).
ALPHABET = 'abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:’"/|_#$%ˆ&*˜‘+=<>()[]{} '
FEATURE_LEN = 1024 #maxlen
# NOTE(review): in the visible notebook these paths are only referenced by
# commented-out read_csv calls — confirm whether they are still needed.
path = '../data/'
TRAIN_DATA_FILE=path+'train.csv'
TEST_DATA_FILE=path+'test.csv'

In [None]:
def get_char_dict():
    """Build the character -> integer id lookup for ALPHABET.

    Ids start at 1 and follow the order of the characters in ALPHABET, so
    id 0 is never assigned to a character.
    """
    return {ch: idx for idx, ch in enumerate(ALPHABET, start=1)}

def char2vec(text, max_length=FEATURE_LEN):
    """Encode ``text`` as a fixed-length vector of character ids.

    Args:
        text: Sequence of characters to encode.
        max_length: Length of the returned vector; characters beyond this
            position are ignored.

    Returns:
        A float numpy array of length ``max_length``. Position ``i`` holds the
        id of ``text[i]`` from ``get_char_dict()``, 68 for characters missing
        from that lookup, and 0 for positions past the end of ``text``.
    """
    lookup = get_char_dict()
    encoded = np.zeros(max_length)

    # Slicing enforces the truncation; unknown characters share the id 68.
    for pos, ch in enumerate(text[:max_length]):
        encoded[pos] = lookup.get(ch, 68)

    return encoded
    

def conv_shape(conv):
    """Return the static shape of ``conv`` as a list, without its first (batch) entry."""
    full_shape = conv.get_shape().as_list()
    return full_shape[1:]

# Matches dotted-quad sequences such as 192.168.0.1.
replace_ip = re.compile(r'([0-9]+)(?:\.[0-9]+){3}')


def text_to_wordlist(text, remove_stopwords=True, stem_words=False):
    """Normalise a piece of text for character encoding.

    Steps, in order: lower-case and whitespace-tokenise, optionally drop
    English stop words, re-join with single spaces, delete IP-address-like
    sequences, and optionally stem every remaining word.

    Args:
        text: Input string.
        remove_stopwords: When true, drop tokens found in NLTK's English
            stop-word list.
        stem_words: When true, stem each word with the English Snowball stemmer.

    Returns:
        The cleaned text as one space-joined string (despite the function's
        name, no list is returned).
    """
    tokens = text.lower().split()

    if remove_stopwords:
        english_stops = set(stopwords.words("english"))
        tokens = [tok for tok in tokens if tok not in english_stops]

    # IP removal runs on the joined string, after stop-word filtering.
    cleaned = replace_ip.sub('', " ".join(tokens))

    if stem_words:
        stemmer = SnowballStemmer('english')
        cleaned = " ".join(stemmer.stem(tok) for tok in cleaned.split())

    return cleaned


In [None]:
# Number of reviews per rating value, to gauge class balance.
dataset['Rating'].value_counts()

In [None]:
# Fetch the NLTK stop-word corpus that text_to_wordlist reads through
# stopwords.words("english"); this must run before that function is called
# with remove_stopwords=True.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#train_df = pd.read_csv(TRAIN_DATA_FILE)
#test_df = pd.read_csv(TEST_DATA_FILE)

list_sentences_train = dataset["Review Text"]
list_classes = [1,2,3,4,5]

# One label per review. Indexing `dataset.Rating[list_classes]` would pick
# only the five rows whose index labels are 1..5, leaving `y` far shorter than
# `data` and breaking any fold indexing done on it later.
y = dataset["Rating"].values

# Reviews to score must be text, encoded the same way as the training data;
# feeding the Rating column to the network cannot work. Use the held-out
# reviews from the train_test_split cell.
# NOTE(review): these reviews are also part of `data` below, so scores on
# them will be optimistic unless training is restricted to X_train.
list_sentences_test = X_test

# Clean each review and encode it as a fixed-length vector of character ids.
data = np.array([char2vec(text_to_wordlist(text)) for text in list_sentences_train])
test_data = np.array([char2vec(text_to_wordlist(text)) for text in list_sentences_test])

In [None]:
# Inspect the label array built in the previous cell.
y

array([5, 5, 2, 5, 4])

In [None]:
from sklearn import preprocessing

# Fit the encoder on the ratings and encode them in a single step; `le` stays
# available for mapping encoded labels back to the original rating values.
le = preprocessing.LabelEncoder()
y = le.fit_transform(dataset["Rating"])


In [None]:
# Inspect the labels after LabelEncoder has re-encoded them.
y

array([2, 4, 4, ..., 2, 2, 4])

In [None]:
# Shape of the array that preds() passes to model.predict.
test_data.shape

(19662,)

In [None]:
def ConvolutionalBlock(input_shape, num_filters):
    """Build a Sequential block of two Conv1D -> BatchNormalization -> ReLU stages.

    Args:
        input_shape: Shape (without batch dimension) given to the first Conv1D.
        num_filters: Number of filters used by both convolutions.

    Returns:
        The assembled ``Sequential`` model. Both convolutions use kernel size 3,
        stride 1 and "same" padding.
    """
    block = Sequential()

    for stage in range(2):
        conv_kwargs = dict(filters=num_filters, kernel_size=3, strides=1, padding="same")
        if stage == 0:
            # Only the first layer of a Sequential model declares the input shape.
            conv_kwargs["input_shape"] = input_shape

        block.add(Conv1D(**conv_kwargs))
        block.add(BatchNormalization())
        block.add(Activation("relu"))

    return block

#https://www.tensorflow.org/api_docs/python/tf/nn/top_k
def top_kmax(x, top_k=3):
    """Keep the ``top_k`` largest activations of every channel and flatten them.

    Previously this function read ``top_k`` and ``num_filters`` from module
    globals; ``top_k`` is never defined at module level, so any call raised
    NameError. ``top_k`` is now a parameter and the channel count is read from
    the tensor itself.

    Args:
        x: 3-D tensor whose axes 1 and 2 are swapped before the top-k
            selection, so the selection runs along the original axis 1
            (presumably (batch, steps, channels) — confirm against caller).
        top_k: Number of largest values kept per channel.

    Returns:
        Tensor reshaped to ``(-1, channels * top_k)``.
    """
    x = tf.transpose(x, [0, 2, 1])
    k_max = tf.nn.top_k(x, k=top_k)
    # After the transpose the channel count sits on axis 1 of the static shape.
    channels = x.get_shape().as_list()[1]
    return tf.reshape(k_max[0], (-1, channels * top_k))

In [None]:
def vdcnn_model(num_filters, num_classes, sequence_max_length, num_chars, embedding_size, top_k, learning_rate=0.001):
    """Build and compile the character-level VDCNN classifier.

    Args:
        num_filters: Filter counts, one per convolutional block, in order.
        num_classes: Number of units in the output layer.
        sequence_max_length: Length of each encoded input sequence.
        num_chars: Size of the embedding vocabulary.
        embedding_size: Dimension of each character embedding.
        top_k: Number of largest activations kept per channel before the
            dense layers.
        learning_rate: Learning rate for the RMSprop optimizer. It was
            previously accepted but ignored; the default equals the
            optimizer's own default, so existing calls train as before.

    Returns:
        A compiled Keras ``Model``.
    """
    inputs = Input(shape=(sequence_max_length, ), dtype='int32', name='input')

    embedded_seq = Embedding(num_chars, embedding_size, input_length=sequence_max_length)(inputs)
    embedded_seq = BatchNormalization()(embedded_seq)

    # First convolution; stride 2 with "same" padding halves the sequence length.
    conv = Conv1D(filters=64, kernel_size=3, strides=2, padding="same")(embedded_seq)

    # One ConvolutionalBlock per entry of num_filters, each followed by pooling.
    for block_filters in num_filters:
        conv = ConvolutionalBlock(conv_shape(conv), block_filters)(conv)
        conv = MaxPooling1D(pool_size=3, strides=2, padding="same")(conv)

    def _top_k(x):
        # Swap axes so top_k runs along the sequence axis, then flatten.
        x = tf.transpose(x, [0, 2, 1])
        k_max = tf.nn.top_k(x, k=top_k)
        return tf.reshape(k_max[0], (-1, num_filters[-1] * top_k))

    k_max = Lambda(_top_k, output_shape=(num_filters[-1] * top_k,))(conv)

    # Fully connected head. The dropout layers are an addition here; the
    # original paper did not use them.
    fc1 = Dense(512, activation='relu', kernel_initializer='he_normal')(k_max)
    fc1 = Dropout(0.3)(fc1)
    fc2 = Dense(512, activation='relu', kernel_initializer='he_normal')(fc1)
    fc2 = Dropout(0.3)(fc2)
    # NOTE(review): sigmoid + binary_crossentropy treats every output unit as an
    # independent binary target; integer class labels would need one-hot
    # encoding (or a softmax / sparse-categorical head) — confirm intent.
    out = Dense(num_classes, activation='sigmoid')(fc2)

    # `lr` is the keyword spelling used by the SGD example previously left in
    # this function, so it matches the Keras generation this notebook targets.
    optimizer = RMSprop(lr=learning_rate)

    model = Model(inputs=inputs, outputs=out)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy', 'mae', 'mse', keras_metrics.precision(), keras_metrics.recall()],
    )

    return model

In [None]:
# keras_metrics supplies the precision()/recall() metrics that vdcnn_model
# passes to model.compile, so this cell must run before vdcnn_model is called.
# NOTE(review): the install is unpinned and sits mid-notebook — consider a
# version-pinned `%pip install` in a cell at the top.
!pip install -q keras_metrics 
import keras_metrics

In [None]:
# Filter counts of the four convolutional blocks.
num_filters = [64, 128, 256, 512]

# Build a 5-class model and print its layer summary as a sanity check.
model = vdcnn_model(
    num_filters=num_filters,
    num_classes=5,
    num_chars=69,
    sequence_max_length=FEATURE_LEN,
    embedding_size=16,
    top_k=3,
)
model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 1024)              0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 1024, 16)          1104      
_________________________________________________________________
batch_normalization_64 (Batc (None, 1024, 16)          64        
_________________________________________________________________
conv1d_64 (Conv1D)           (None, 512, 64)           3136      
_________________________________________________________________
sequential_29 (Sequential)   (None, 512, 64)           25216     
_________________________________________________________________
max_pooling1d_29 (MaxPooling (None, 256, 64)           0         
_________________________________________________________________
sequential_30 (Sequential)   (None, 256, 128)          7500

In [None]:
# Create the TensorFlow session Keras will use and register it as the default.
# NOTE(review): tf.ConfigProto / tf.Session and keras.backend.tensorflow_backend
# look like TF 1.x-era APIs — confirm they exist in the runtime's versions.
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.log_device_placement = True  # to log device placement (on which device the operation ran)
sess = tf.Session(config=config)
set_session(sess)  # set this TensorFlow session as the default session for Keras
def preds(k):
    """Train one VDCNN per cross-validation fold and average their predictions.

    Reads the notebook globals ``data`` (encoded reviews used for training),
    ``y`` (labels aligned row-for-row with ``data``), ``test_data`` (encoded
    reviews to score), ``list_classes`` and ``FEATURE_LEN``. For every fold
    the best weights (lowest validation loss) are saved to
    ``cv10_best_weights<fold>.h5`` and reloaded before predicting.

    Args:
        k: Number of folds.

    Returns:
        Array of shape ``(len(test_data), len(list_classes))`` holding the mean
        of the per-fold ``model.predict`` outputs.

    Raises:
        ValueError: If ``y`` and ``data`` differ in length, if ``test_data`` is
            not a 2-D array of encoded sequences, or if ``y`` contains more
            classes than ``list_classes``.
    """
    from sklearn.model_selection import KFold

    n_classes = len(list_classes)
    features = np.asarray(data)
    labels = np.asarray(y)
    holdout = np.asarray(test_data)

    # Fail fast with a readable message instead of an IndexError inside the
    # fold loop (or a shape error after a full training run).
    if len(labels) != len(features):
        raise ValueError(
            'y has %d entries but data has %d rows; they must be aligned'
            % (len(labels), len(features))
        )
    if holdout.ndim != 2 or holdout.shape[1] != FEATURE_LEN:
        raise ValueError(
            'test_data must be encoded text of shape (n, %d), got %s'
            % (FEATURE_LEN, holdout.shape)
        )

    # The network has one sigmoid unit per class, so integer labels are turned
    # into one-hot rows. np.unique maps any label set onto 0..n-1 in sorted
    # order (the same mapping LabelEncoder produces).
    if labels.ndim == 1:
        found_classes, label_idx = np.unique(labels, return_inverse=True)
        if len(found_classes) > n_classes:
            raise ValueError(
                'y contains %d distinct classes but list_classes has %d'
                % (len(found_classes), n_classes)
            )
        targets = np.eye(n_classes, dtype='float32')[label_idx]
    else:
        if labels.shape[1] != n_classes:
            raise ValueError(
                'y has %d columns but list_classes has %d'
                % (labels.shape[1], n_classes)
            )
        targets = labels

    y_pred = np.zeros((len(holdout), n_classes))

    # random_state only has an effect (and is only accepted by recent
    # scikit-learn releases) when shuffle=True.
    kf = KFold(n_splits=k, shuffle=True, random_state=2)

    for fold, (train_index, val_index) in enumerate(kf.split(features), start=1):
        print('fold====================>>>>>>>>>>', fold)
        fold_X_train, fold_X_val = features[train_index], features[val_index]
        fold_y_train, fold_y_val = targets[train_index], targets[val_index]

        # Output width must match the columns of y_pred.
        fold_filters = [64, 128, 256, 512]
        model = vdcnn_model(num_filters=fold_filters, num_classes=n_classes, num_chars=69,
                            sequence_max_length=FEATURE_LEN, embedding_size=16, top_k=3)

        early_stopping = EarlyStopping(monitor='val_loss', patience=3)
        bst_model_path = 'cv10_best_weights' + str(fold) + '.h5'
        model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

        model.fit(fold_X_train, fold_y_train,
                  validation_data=(fold_X_val, fold_y_val),
                  epochs=200, batch_size=256,
                  callbacks=[early_stopping, model_checkpoint])

        # Predict with the best checkpoint rather than the last epoch's weights.
        model.load_weights(bst_model_path)
        y_pred += model.predict(holdout, batch_size=256, verbose=1)

    return y_pred / k

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device



In [None]:
# Run 5-fold cross-validation and keep the fold-averaged predictions.
# NOTE(review): this rebinds `y_test`, which the train_test_split cell also
# assigns (held-out ratings) — consider a distinct name for the predictions.
y_test=preds(5)



IndexError: ignored