In [1]:
from string import punctuation
from os import listdir
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
np.random.seed(7)

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
import imblearn
from imblearn.over_sampling import SMOTE


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

import gensim
from gensim.models import KeyedVectors

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ModuleNotFoundError: No module named 'imblearn'

In [None]:
import glob

# Collect the conversation transcripts for each class.
# glob returns matches in arbitrary, OS-dependent order; sorting fixes the
# document order so any seeded shuffling/splitting stays reproducible.
text_files_dsat = sorted(glob.glob("./allconversations/dsat/*.txt"))
text_files_sat = sorted(glob.glob("./allconversations/sat/*.txt"))

In [None]:
# Stop-word set: NLTK's English list plus the two extra tokens "user" and
# "system" (presumably the speaker labels in the transcripts -- confirm).
stop_words = set(stopwords.words('english')) | {"user", "system"}


In [None]:
# Load Google's pre-trained Word2Vec vectors (binary word2vec format).
# Remember to change the path to wherever you stored the Google model.
word2vec_path = './google_model/GoogleNews-vectors-negative300.bin.gz'
W2Vmodel = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
# load doc into memory
def load_doc(filename, encoding=None):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: path of the file to read.
        encoding: text encoding passed to ``open``; ``None`` (the default)
            keeps the platform's default encoding.

    Returns:
        The full text of the file.
    """
    # the context manager closes the file even when read() raises
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()


In [None]:
# turn a doc into clean tokens
def clean_doc(doc):
    """Normalise a raw document into a space-joined string of kept tokens.

    The text is split on whitespace, lower-cased and stripped of ASCII
    punctuation; a token is kept only if the module-level ``W2Vmodel`` has a
    vector for it and it is not in the module-level ``stop_words``.

    Args:
        doc: raw document text.

    Returns:
        The kept tokens, in their original order, joined by single spaces.
    """
    # translation table that deletes every punctuation character
    table = str.maketrans('', '', punctuation)
    normalised = (w.lower().translate(table) for w in doc.split())
    # Ask the vector model itself for membership: `in` works regardless of how
    # the installed gensim version exposes its vocabulary attribute.
    kept = [w for w in normalised if w in W2Vmodel and w not in stop_words]
    return ' '.join(kept)



In [None]:
# load and clean every document in a list of file paths
def process_docs(folder):
    """Load and clean each file named in ``folder``.

    Args:
        folder: iterable of file paths (despite the name it is iterated
            directly, not listed as a directory).

    Returns:
        A list with one cleaned document (the value returned by
        ``clean_doc``) per path, in the same order.
    """
    return [clean_doc(load_doc(filename)) for filename in folder]

In [None]:
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(vocab):
    """Build the initial weight matrix for an Embedding layer.

    Args:
        vocab: mapping of word -> integer row index (the caller passes the
            Tokenizer's ``word_index``). Indices must lie in
            ``0..len(vocab)``.

    Returns:
        Array of shape ``(len(vocab) + 1, 300)``. Row ``i`` holds the
        ``W2Vmodel`` vector of the word mapped to ``i``; rows that no word
        maps to, and rows of words without a vector, stay all-zero.
    """
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = zeros((vocab_size, 300))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        # a word without a pre-trained vector keeps its zero row instead of
        # aborting the whole matrix with a KeyError
        if word in W2Vmodel:
            weight_matrix[i] = W2Vmodel[word]
    return weight_matrix

In [None]:
# To get the f1 metrics from keras

from keras import backend as K

def f1(y_true, y_pred):
    """F1 score computed with Keras backend ops, usable as a Keras metric.

    Precision and recall are both derived from counts taken over the tensors
    passed in (so, as a metric, this is a batch-wise value) and combined as
    ``2 * p * r / (p + r)``. ``K.epsilon()`` is added to every denominator to
    avoid division by zero.
    """
    def _count_positive(tensor):
        # number of entries that round to 1 after clipping into [0, 1]
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    true_positives = _count_positive(y_true * y_pred)
    # how many selected items are relevant
    predicted_positives = _count_positive(y_pred)
    precision = true_positives / (predicted_positives + K.epsilon())
    # how many relevant items are selected
    possible_positives = _count_positive(y_true)
    recall = true_positives / (possible_positives + K.epsilon())
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [None]:
# Load and clean both classes; `docs` lists every sat document first,
# followed by every dsat document.
sat_docs, dsat_docs = process_docs(text_files_sat), process_docs(text_files_dsat)

docs = [*sat_docs, *dsat_docs]



In [None]:
# split into train and test
# Labels follow the order of `docs` (sat_docs first, then dsat_docs):
# 1 for every sat document, 0 for every dsat document. Counts come from the
# loaded data rather than being hard-coded.
x = docs
y = array([1] * len(sat_docs) + [0] * len(dsat_docs))

# defines x_train / x_test / ytrain / ytest, which the tokenizer, padding and
# evaluation cells read
x_train, x_test, ytrain, ytest = train_test_split(x, y, random_state=0)




In [None]:
# With oversampling
# NOTE(review): `X_train` and `y_train` are not defined by any cell in this
# notebook -- confirm where they are supposed to come from.
# NOTE(review): SMOTE synthesises samples from numeric feature vectors, while
# the tokenizer cell treats `x_train` as raw text -- confirm this step is
# applied to the intended representation.

sm = SMOTE(random_state=42)
# fit_resample is the current imbalanced-learn API; the older fit_sample
# alias was removed from the library
x_train, ytrain = sm.fit_resample(X_train, y_train.ravel())

print(len(x_train))
print(len(ytrain))

In [None]:
# Build the word -> integer index from the training documents only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=x_train)

In [None]:
# Encode the training documents as sequences of word indices, then pad them
# at the end so every row has the length of the longest training document
# (measured in whitespace-separated tokens).
encoded_docs = tokenizer.texts_to_sequences(x_train)
max_length = max(len(document.split()) for document in x_train)
print(max_length)
Xtrain = pad_sequences(sequences=encoded_docs, maxlen=max_length, padding='post')

In [None]:
# Same encoding for the test documents, reusing the tokenizer fitted on the
# training set and the training max_length so both matrices share a width.
encoded_docs = tokenizer.texts_to_sequences(x_test)
Xtest = pad_sequences(sequences=encoded_docs, maxlen=max_length, padding='post')

In [None]:
# Vocabulary size = number of indexed words + 1 (index 0 is not assigned to
# any word by the tokenizer and is what padding uses).
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

In [None]:
# Pre-trained vectors arranged so that row i belongs to tokenizer index i.
embedding_vectors = get_weight_matrix(vocab=tokenizer.word_index)
print(embedding_vectors.shape)


### CNN model

In [None]:
# Embedding layer initialised from the pre-trained vectors; trainable=True
# lets training fine-tune them.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=300,
    weights=[embedding_vectors],
    input_length=max_length,
    trainable=True,
)

In [None]:
# define cnn model: embedding -> two 1-D convolutions -> max-pooling ->
# flatten -> dense hidden layer -> single sigmoid output (binary classifier)
cnn = Sequential([
    embedding_layer,
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(100, activation='sigmoid'),
    Dense(1, activation='sigmoid'),
])
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would add a stray "None" line)
cnn.summary()


In [None]:
# compile network
cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1])
# fit network
cnn.fit(Xtrain, ytrain, epochs=20, verbose=2)
# evaluate: returns the loss followed by the compiled metrics in order.
# The F1 value gets its own name so the `f1` metric function is not
# overwritten by a float (which would break any later compile using it).
loss, acc, cnn_f1 = cnn.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))
print('F1: %f' % cnn_f1)

In [None]:
# predict() yields one sigmoid probability per sample (the model ends in
# Dense(1, sigmoid)); threshold at 0.5 and flatten to 0/1 class labels,
# which is what sklearn's classification metrics accept.
y_pred = (cnn.predict(Xtest) > 0.5).astype(int).ravel()

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# precision / recall / F-score / support for the positive class of the CNN
precision_recall_fscore_support(y_true=ytest, y_pred=y_pred, average='binary')

In [None]:
# fraction of test documents the CNN labels correctly
accuracy_score(y_true=ytest, y_pred=y_pred)

### LSTM model

In [None]:
# Separate embedding layer for the LSTM, initialised from the same
# pre-trained vectors; trainable=True lets training fine-tune them.
lstm_embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=300,
    weights=[embedding_vectors],
    input_length=max_length,
    trainable=True,
)

In [None]:
# define model LSTM: embedding -> LSTM(100) -> single sigmoid output
lstm = Sequential([
    lstm_embedding_layer,
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would add a stray "None" line)
lstm.summary()



In [None]:
# compile network
lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
lstm.fit(Xtrain, ytrain, epochs=20, verbose=2)
# evaluate: only 'accuracy' is compiled, so evaluate() returns exactly
# [loss, accuracy]; unpacking a third value would raise ValueError.
# No F1 is printed here because this model has no F1 metric attached.
loss, acc = lstm.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

In [None]:
# predict() yields one sigmoid probability per sample (the model ends in
# Dense(1, sigmoid)); threshold at 0.5 and flatten to 0/1 class labels,
# which is what sklearn's classification metrics accept.
y_pred_lstm = (lstm.predict(Xtest) > 0.5).astype(int).ravel()

In [None]:
# precision / recall / F-score / support for the positive class of the LSTM
precision_recall_fscore_support(y_true=ytest, y_pred=y_pred_lstm, average='binary')

In [None]:
# fraction of test documents the LSTM labels correctly
accuracy_score(y_true=ytest, y_pred=y_pred_lstm)