In [1]:
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from collections import defaultdict
import random
import pickle

from nltk.stem.lancaster import LancasterStemmer
# Shared stemmer instance; preprocess() calls stemmer.stem() on every token.
stemmer = LancasterStemmer()

from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; preprocess() applies it to the already-stemmed tokens.
lemmatizer = WordNetLemmatizer()

In [2]:
# Intent labels; the example sentences for each label are read from "<label>.txt".
ml_classes = ['location', 'professor', 'time']

# Maps each label to the raw example lines read from its file.
store = defaultdict(list)
for ml_class in ml_classes:
    with open("{}.txt".format(ml_class), encoding="utf-8") as handle:
        raw_lines = handle.read().splitlines()
    # The file is closed at this point. Lines are appended one at a time, so a
    # label only becomes a key of `store` once it has at least one example.
    for raw_line in raw_lines:
        store[ml_class].append(raw_line)

# English stop-word set. In the cells shown here it is only referenced by the
# commented-out filter inside preprocess().
stop_words = set(stopwords.words('english'))

# Accumulators filled by the preprocessing loop further down in this cell:
# every processed token, and one (tokens, label) pair per example sentence.
all_words, sentences = [], []

def preprocess(sentence):
    """Turn a raw sentence into a list of normalized tokens.

    The sentence is split with ``nltk.word_tokenize``; each token is
    lower-cased, stemmed with the module-level ``stemmer`` and the resulting
    stem is passed through the module-level ``lemmatizer``. Stop words are
    NOT removed (the ``stop_words`` filter is intentionally left disabled).

    Args:
        sentence: the text to process.

    Returns:
        A list of processed tokens, in the order they appear in the sentence.
    """
    return [
        lemmatizer.lemmatize(stemmer.stem(token.lower()))
        for token in nltk.word_tokenize(sentence)
    ]


# Preprocess every stored example once, growing both the flat token list
# (used later to derive the vocabulary) and the (tokens, label) pairs.
for ml_class, lines in store.items():
    for processed_tokens in map(preprocess, lines):
        all_words += processed_tokens
        sentences.append((processed_tokens, ml_class))

In [3]:
# Sort so that the feature-column order and the label indices are identical on
# every run. The order of list(set(...)) over strings depends on string
# hashing, which Python randomizes per process, so a re-run could otherwise
# produce vectors and label ids that no longer line up with whatever was
# previously saved under "./chat_model" or in the "training_data" pickle.
all_uniq_words = sorted(set(all_words))
uniq_ml_classes = sorted(set(ml_classes))

def vectorize(token_set, all_uniq_words):
    """Encode a collection of tokens as a binary bag-of-words vector.

    Args:
        token_set: container supporting ``in`` (the caller passes a set).
        all_uniq_words: the vocabulary; its order fixes the vector layout.

    Returns:
        A list with one int per vocabulary word, in vocabulary order:
        1 if the word is in ``token_set``, otherwise 0.
    """
    return [int(word in token_set) for word in all_uniq_words]

# One [features, label] pair per example sentence.
training = []

for tokens, ml_class in sentences:
    token_set = set(tokens)
    x = vectorize(token_set, all_uniq_words)   # binary bag-of-words vector
    y = uniq_ml_classes.index(ml_class)        # integer class id
    training.append([np.array(x), np.array([y])])

# Shuffle whole pairs so features and labels stay aligned.
random.shuffle(training)

# Each pair holds arrays of different lengths (vocabulary size vs. 1), i.e. the
# nested list is ragged. np.asarray() without a dtype raises ValueError for
# such input on current NumPy releases, so the object dtype is requested
# explicitly. `train_set` is kept only for any other cell that still reads it.
train_set = np.asarray(training, dtype=object)

# Stack features and labels straight from the shuffled list rather than by
# slicing the object array; this yields plain numeric 2-D arrays.
train_X = np.vstack([features for features, _ in training])
print(train_X.shape)

train_Y = np.vstack([label for _, label in training])
print(train_Y.shape)

print(train_X[:1])


(1456, 31)
(1456, 1)
[[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0]]


In [4]:
# Derive the feature-column description from the bag-of-words training matrix.
feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(train_X)
print(feature_columns)

# n_classes follows the label list instead of a hard-coded 3: the training
# labels are indices into uniq_ml_classes, so the output size must match it.
dnn_clf = tf.contrib.learn.DNNClassifier(
    hidden_units=[10, 10],
    n_classes=len(uniq_ml_classes),
    feature_columns=feature_columns,
    model_dir="./chat_model",
)

# NOTE(review): with an already populated model_dir, fit() presumably resumes
# from the latest checkpoint rather than starting fresh -- confirm, and clear
# "./chat_model" whenever the vocabulary or the label set changes.
dnn_clf.fit(x=train_X, y=train_Y, steps=20000)


[_RealValuedColumn(column_name='', dimension=31, default_value=None, dtype=tf.int64, normalizer=None)]
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_master': '', '_keep_checkpoint_max': 5, '_log_step_count_steps': 100, '_evaluation_master': '', '_environment': 'local', '_session_config': None, '_is_chief': True, '_num_worker_replicas': 0, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_secs': 600, '_task_type': None, '_num_ps_replicas': 0, '_task_id': 0, '_save_summary_steps': 100, '_model_dir': './chat_model', '_tf_random_seed': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9f92b5aef0>}
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input

INFO:tensorflow:global_step/sec: 97.1677
INFO:tensorflow:step = 6001, loss = 8.12259e-05 (1.029 sec)
INFO:tensorflow:global_step/sec: 103.489
INFO:tensorflow:step = 6101, loss = 7.96086e-05 (0.966 sec)
INFO:tensorflow:global_step/sec: 97.529
INFO:tensorflow:step = 6201, loss = 7.80507e-05 (1.026 sec)
INFO:tensorflow:global_step/sec: 96.6783
INFO:tensorflow:step = 6301, loss = 7.65437e-05 (1.034 sec)
INFO:tensorflow:global_step/sec: 99.18
INFO:tensorflow:step = 6401, loss = 7.5096e-05 (1.008 sec)
INFO:tensorflow:global_step/sec: 102.982
INFO:tensorflow:step = 6501, loss = 7.3696e-05 (0.971 sec)
INFO:tensorflow:global_step/sec: 95.7408
INFO:tensorflow:step = 6601, loss = 7.23391e-05 (1.044 sec)
INFO:tensorflow:global_step/sec: 101.856
INFO:tensorflow:step = 6701, loss = 7.10353e-05 (0.982 sec)
INFO:tensorflow:global_step/sec: 95.4065
INFO:tensorflow:step = 6801, loss = 6.97736e-05 (1.049 sec)
INFO:tensorflow:global_step/sec: 102.345
INFO:tensorflow:step = 6901, loss = 6.85501e-05 (0.976 

INFO:tensorflow:global_step/sec: 105.678
INFO:tensorflow:step = 14101, loss = 2.91528e-05 (0.946 sec)
INFO:tensorflow:global_step/sec: 100.898
INFO:tensorflow:step = 14201, loss = 2.89073e-05 (0.991 sec)
INFO:tensorflow:global_step/sec: 116.535
INFO:tensorflow:step = 14301, loss = 2.86651e-05 (0.859 sec)
INFO:tensorflow:global_step/sec: 103.158
INFO:tensorflow:step = 14401, loss = 2.84269e-05 (0.968 sec)
INFO:tensorflow:global_step/sec: 104.979
INFO:tensorflow:step = 14501, loss = 2.82004e-05 (0.953 sec)
INFO:tensorflow:global_step/sec: 104.662
INFO:tensorflow:step = 14601, loss = 2.79747e-05 (0.956 sec)
INFO:tensorflow:global_step/sec: 105.251
INFO:tensorflow:step = 14701, loss = 2.77485e-05 (0.949 sec)
INFO:tensorflow:global_step/sec: 104.712
INFO:tensorflow:step = 14801, loss = 2.75303e-05 (0.957 sec)
INFO:tensorflow:global_step/sec: 92.7587
INFO:tensorflow:step = 14901, loss = 2.73095e-05 (1.078 sec)
INFO:tensorflow:global_step/sec: 97.5123
INFO:tensorflow:step = 15001, loss = 2.70

DNNClassifier(params={'input_layer_min_slice_size': None, 'gradient_clip_norm': None, 'activation_fn': <function relu at 0x7f9f942778c8>, 'head': <tensorflow.contrib.learn.python.learn.estimators.head._MultiClassHead object at 0x7f9f7bda7d68>, 'feature_columns': (_RealValuedColumn(column_name='', dimension=31, default_value=None, dtype=tf.int64, normalizer=None),), 'optimizer': None, 'dropout': None, 'hidden_units': [10, 10], 'embedding_lr_multipliers': None})

In [None]:
# Persist the vocabulary order and the label order used to build the training
# vectors. The with-block guarantees the file is flushed and closed; the
# original passed a bare open(...) whose handle was never closed.
with open("training_data", "wb") as training_file:
    pickle.dump(
        {'all_uniq_words': all_uniq_words, 'uniq_ml_classes': uniq_ml_classes},
        training_file,
    )
