TF-IDF

In [2]:

# split dataset into train and test.
X_train, X_test, Y_train, Y_test = train_test_split(data["Description"], data["intent_label"], test_size=3)

# vectorize the input using tfidf values.
tfidf = TfidfVectorizer()
tfidf = tfidf.fit(X_train)
X_train = tfidf.transform(X_train)
X_test = tfidf.transform(X_test)

# label encoding for different categories of intents
le = LabelEncoder().fit(Y_train)
Y_train = le.transform(Y_train)
Y_test = le.transform(Y_test)

# other models like GBM, Random Forest may also be used
model = SVC() 
model = model.fit(X_train, Y_train)
p = model.predict(X_test)
# calculate the f1_score. average="micro" since we want to calculate score for multiclass and
# each instance(rather than class, search for macro average) contribute equally towards the scoring.
print("f1_score:", f1_score( Y_test, p, average="micro"))
print("accuracy_score:",accuracy_score(Y_test, p))


f1_score: 0.333333333333
accuracy_score: 0.333333333333


WORD2VEC

In [5]:
# import required packages
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score


# read data
data = pd.read_csv("intent1.csv")

# split data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(data["Description"], data["intent_label"], test_size=6)

# label encoding for different categories of intents
le = LabelEncoder().fit(Y_train)
Y_train = le.transform(Y_train)
Y_test = le.transform(Y_test)
X_train = list(X_train)

# tokenize input strings
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)
word_index = tokenizer.word_index
vocab_size = len(word_index)

# prune each sentence to maximum of 20 words.
max_sent_len = 20

# sentences with less than 20 words, will be padded with zeroes to make it of length 20
# sentences with more than 20 words, will be pruned to 20.
x = pad_sequences(sequences, maxlen=max_sent_len)
X_test = pad_sequences(sequences_test, maxlen=max_sent_len)

# get word_vectors for words in training set
X_train = [[word for word in str(sent).split()] for sent in X_train]

word_vecs = Word2Vec(X_train)
print("Word vectors trained")
    
# 100 is the size of wordvec.
embedding_matrix = np.zeros((vocab_size, 100))

# make matrix of each word with its word_vectors for the CNN model. 
# so each row of a matrix will represent one word. There will be a row for each word in
# the training set
for word, i in word_index.items():
        try:
            embedding_vector = word_vecs[word]
        except:
            embedding_vector = None
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
print("Embeddings done")
vocab_size = len(embedding_matrix)

# CNN model requires multiclass labels to be converted into one hot ecoding.
# i.e. each column represents a label, and will be marked one for corresponding label.
y = to_categorical(np.asarray(Y_train))

embedding_layer = Embedding(vocab_size,
                                100,
                                weights=[embedding_matrix],
                                input_length=max_sent_len,
                                trainable=True)
sequence_input = Input(shape=(max_sent_len,), dtype='int32')

# stack each word of a sentence in a matrix. So each matrix represents a sentence.
# Each row in a matrix is a word(Word Vector) of a sentence.
embedded_sequences = embedding_layer(sequence_input)

# build the Convolutional model.
l_cov1 = Conv1D(128, 4, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(4)(l_cov1)
l_flat = Flatten()(l_pool1)
hidden = Dense(100, activation='relu')(l_flat)
preds = Dense(len(y[0]), activation='softmax')(hidden)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
                  optimizer='Adam'
                  )

print("model fitting - simplified convolutional neural network")
model.summary()

# train the model
model.fit(x, y, epochs=10, batch_size=128)

#get scores and predictions.
p = model.predict(X_test)
p = [np.argmax(i) for i in p]
score_cnn = f1_score(Y_test, p, average="micro")
print("accuracy_score:",accuracy_score(Y_test, p))
print("f1_score:", score_cnn)


Word vectors trained
Embeddings done
model fitting - simplified convolutional neural network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 20)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 20, 100)           2700      
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 17, 128)           51328     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 4, 128)            0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               51300     
_________________________________________________



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy_score: 0.833333333333
f1_score: 0.833333333333


In [1]:
# import required packages
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
# read data
data = pd.read_csv("intent1.csv")

# split data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(data["Description"], data["intent_label"], test_size=6)

# label encoding for different categories of intents
le = LabelEncoder().fit(Y_train)
Y_train = le.transform(Y_train)
Y_test = le.transform(Y_test)

# get word_vectors for words in training set
X_train = [sent for sent in X_train]
X_test = [sent for sent in X_test]
# by default genism.Word2Vec uses CBOW, to train wordvecs. We can also use skipgram with it
# by setting the “sg” attribute to number of skips we want.
# CBOW and Skip gram for the sentence "Hi Ron how was your day?" becomes:
# Continuos bag of words: 3-grams {"Hi Ron how", "Ron how was", "how was your” ...}
# Skip-gram 1-skip 3-grams: {"Hi Ron how", "Hi Ron was", "Hi how was", "Ron how 
# your", ...}
# See how: "Hi Ron was" skips over "how".
# Skip-gram 2-skip 3-grams: {"Hi Ron how", "Hi Ron was", "Hi Ron your", "Hi was 
# your", ...}
# See how: "Hi Ron your" skips over "how was".
# Those are the general meaning of CBOW and skip gram.              
word_vecs = Word2Vec(X_train)
print("Word vectors trained")

# prune each sentence to maximum of 20 words.
max_sent_len = 20

# tokenize input strings
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)
word_index = tokenizer.word_index
vocab_size = len(word_index)

# sentences with less than 20 words, will be padded with zeroes to make it of length 20
# sentences with more than 20 words, will be pruned to 20.
x = pad_sequences(sequences, maxlen=max_sent_len)
X_test = pad_sequences(sequences_test, maxlen=max_sent_len)
    
# 100 is the size of wordvec.
embedding_matrix = np.zeros((vocab_size + 1, 100))

# make matrix of each word with its word_vectors for the CNN model. 
# so each row of a matrix will represent one word. There will be a row for each word in
# the training set
for word, i in word_index.items():
        try:
            embedding_vector = word_vecs[word]
        except:
            embedding_vector = None
            if embedding_vector is not None:
            	embedding_matrix[i] = embedding_vector
print("Embeddings done")
vocab_size = len(embedding_matrix)

# CNN model requires multiclass labels to be converted into one hot ecoding.
# i.e. each column represents a label, and will be marked one for corresponding label.
y = to_categorical(np.asarray(Y_train))

embedding_layer = Embedding(vocab_size,
                                100,
                                weights=[embedding_matrix],
                                input_length=max_sent_len,
                                trainable=True)
sequence_input = Input(shape=(max_sent_len,), dtype='int32')

# stack each word of a sentence in a matrix. So each matrix represents a sentence.
# Each row in a matrix is a word(Word Vector) of a sentence.
embedded_sequences = embedding_layer(sequence_input)

# build the Convolutional model.
l_cov1 = Conv1D(128, 4, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(4)(l_cov1)
l_flat = Flatten()(l_pool1)
hidden = Dense(100, activation='relu')(l_flat)
preds = Dense(len(y[0]), activation='softmax')(hidden)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',optimizer='Adam')

print("model fitting - simplified convolutional neural network")
model.summary()

# train the model
model.fit(x, y, epochs=10, batch_size=128)

#get scores and predictions.
p = model.predict(X_test)
p = [np.argmax(i) for i in p]
score_cnn = f1_score(Y_test, p, average="micro")
print("accuracy_score:",accuracy_score(Y_test, p))
print("f1_score:", score_cnn)

Using TensorFlow backend.


KeyError: 'intent_label'