In [1]:
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from collections import defaultdict
import random
import pickle

# Lancaster stemmer instance; preprocess() calls stemmer.stem() on every lower-cased token.
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

# WordNet lemmatizer instance; preprocess() applies it to the stemmer's output.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [2]:
# Labels to learn; each label's examples are read from "<label>.txt" in the working directory.
ml_classes = ['location', 'professor', 'time']

# label -> raw lines read from that label's file.
store = defaultdict(list)
for ml_class in ml_classes:
    corpus_path = "{}.txt".format(ml_class)
    with open(corpus_path, encoding="utf-8") as corpus_file:
        examples = corpus_file.read().splitlines()
    # Append one at a time so a label whose file has no lines never gets a key in `store`.
    for example in examples:
        store[ml_class].append(example)

# Built here but not applied: the stop-word filter inside preprocess() is commented out.
stop_words = set(stopwords.words('english'))


# Accumulators filled by the preprocessing loop further down in this cell.
all_words = []   # every processed token, duplicates included
sentences = []   # (processed token list, label) pairs

def preprocess(sentence):
    """Turn a raw sentence into a list of normalised tokens.

    Each token produced by ``nltk.word_tokenize`` is lower-cased, stemmed with
    the module-level ``stemmer`` and then passed through the module-level
    ``lemmatizer``. Stop words are kept: no filtering step is applied.

    Args:
        sentence: Text to tokenize.

    Returns:
        A list with one processed token per tokenizer token, in tokenizer order.
    """
    normalised = []
    for raw_token in nltk.word_tokenize(sentence):
        stem = stemmer.stem(raw_token.lower())
        normalised.append(lemmatizer.lemmatize(stem))
    return normalised


# Preprocess every stored example once: grow the flat token list used for the
# vocabulary and keep each token list paired with its label.
for label, examples in store.items():
    for example in examples:
        tokens = preprocess(example)
        all_words += tokens
        sentences.append((tokens, label))

In [3]:
# Sort the de-duplicated values so that feature-column order and class ids are
# the same on every kernel start. Iterating a set of strings follows their
# hashes, which Python randomises per process, so list(set(...)) can yield a
# different order on each run.
# NOTE(review): a model or pickle produced with a previous (unsorted) ordering
# will not line up with this one — retrain / regenerate after this change.
all_uniq_words = sorted(set(all_words))
uniq_ml_classes = sorted(set(ml_classes))

def vectorize(token_set, all_uniq_words):
    """Encode a sentence as a binary bag-of-words vector.

    Args:
        token_set: Container of the tokens present in the sentence; only
            membership tests are performed on it.
        all_uniq_words: Vocabulary sequence; it fixes the order and length of
            the returned vector.

    Returns:
        A list of ints, one per vocabulary word: 1 if the word is in
        ``token_set``, otherwise 0.
    """
    return [int(word in token_set) for word in all_uniq_words]

# Seed for the private RNG used to shuffle the rows, so a fresh kernel
# reproduces the same row order without touching the global `random` state.
SHUFFLE_SEED = 42

# One [feature_vector, label_id] pair per preprocessed sentence.
training = []

for tokens, ml_class in sentences:
    token_set = set(tokens)
    x = vectorize(token_set, all_uniq_words)
    y = uniq_ml_classes.index(ml_class)
    training.append([np.array(x), np.array([y])])

random.Random(SHUFFLE_SEED).shuffle(training)

# Each pair holds arrays of different lengths (vocabulary size vs. 1), i.e. a
# ragged structure. NumPy no longer builds an object array from ragged input
# implicitly, so the dtype is stated explicitly here.
# `train_set` is kept only so code that still reads it continues to work.
train_set = np.asarray(training, dtype=object)

# Stack features and labels straight from the list of pairs; this does not
# depend on the object-array layout of `train_set`.
train_X = np.vstack([features for features, _ in training])
print(train_X.shape)

train_Y = np.vstack([label for _, label in training])
print(train_Y.shape)

print(train_X[:1])


(1456, 31)
(1456, 1)
[[1 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [5]:
# Infer the feature column description from the training matrix itself.
feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(train_X)
print(feature_columns)

# Two hidden layers of 10 units each. The number of output classes is derived
# from the label list so it cannot drift from the data if a label is added.
# NOTE(review): checkpoints live in ./chat_model. The recorded log for this cell
# starts past step 25000 for a 20000-step fit, which suggests fit() resumed from
# an existing checkpoint there — clear the directory when the vocabulary or the
# label set changes. TODO confirm.
dnn_clf = tf.contrib.learn.DNNClassifier(
    hidden_units=[10, 10],
    n_classes=len(uniq_ml_classes),
    feature_columns=feature_columns,
    model_dir="./chat_model",
)
# NOTE(review): the recorded output warns that the x/y arguments are deprecated
# on Estimator in favour of SKCompat / input_fn; kept as-is so `dnn_clf` stays a
# DNNClassifier.
dnn_clf.fit(x=train_X, y=train_Y, steps=20000)


[_RealValuedColumn(column_name='', dimension=31, default_value=None, dtype=tf.int64, normalizer=None)]
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x118977a20>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': './chat_model'}
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.


INFO:tensorflow:loss = 0.000134985, step = 25601 (0.485 sec)
INFO:tensorflow:global_step/sec: 197.972
INFO:tensorflow:loss = 0.000132245, step = 25701 (0.505 sec)
INFO:tensorflow:global_step/sec: 201.478
INFO:tensorflow:loss = 0.000129614, step = 25801 (0.497 sec)
INFO:tensorflow:global_step/sec: 197.589
INFO:tensorflow:loss = 0.000127075, step = 25901 (0.507 sec)
INFO:tensorflow:global_step/sec: 220.822
INFO:tensorflow:loss = 0.000124629, step = 26001 (0.456 sec)
INFO:tensorflow:global_step/sec: 211.369
INFO:tensorflow:loss = 0.000122269, step = 26101 (0.468 sec)
INFO:tensorflow:global_step/sec: 181.233
INFO:tensorflow:loss = 0.00011999, step = 26201 (0.552 sec)
INFO:tensorflow:global_step/sec: 200.042
INFO:tensorflow:loss = 0.000117793, step = 26301 (0.500 sec)
INFO:tensorflow:global_step/sec: 233.349
INFO:tensorflow:loss = 0.000115665, step = 26401 (0.430 sec)
INFO:tensorflow:global_step/sec: 204.797
INFO:tensorflow:loss = 0.000113605, step = 26501 (0.487 sec)
INFO:tensorflow:global

INFO:tensorflow:global_step/sec: 237.843
INFO:tensorflow:loss = 4.81311e-05, step = 33701 (0.421 sec)
INFO:tensorflow:global_step/sec: 238.894
INFO:tensorflow:loss = 4.7731e-05, step = 33801 (0.419 sec)
INFO:tensorflow:global_step/sec: 238.407
INFO:tensorflow:loss = 4.7334e-05, step = 33901 (0.419 sec)
INFO:tensorflow:global_step/sec: 239.376
INFO:tensorflow:loss = 4.69495e-05, step = 34001 (0.418 sec)
INFO:tensorflow:global_step/sec: 217.255
INFO:tensorflow:loss = 4.65625e-05, step = 34101 (0.461 sec)
INFO:tensorflow:global_step/sec: 231.763
INFO:tensorflow:loss = 4.61896e-05, step = 34201 (0.431 sec)
INFO:tensorflow:global_step/sec: 222.24
INFO:tensorflow:loss = 4.58179e-05, step = 34301 (0.449 sec)
INFO:tensorflow:global_step/sec: 220.086
INFO:tensorflow:loss = 4.5455e-05, step = 34401 (0.455 sec)
INFO:tensorflow:global_step/sec: 229.136
INFO:tensorflow:loss = 4.50945e-05, step = 34501 (0.436 sec)
INFO:tensorflow:global_step/sec: 228.248
INFO:tensorflow:loss = 4.47414e-05, step = 34

DNNClassifier(params={'head': <tensorflow.contrib.learn.python.learn.estimators.head._MultiClassHead object at 0x1189778d0>, 'hidden_units': [10, 10], 'feature_columns': (_RealValuedColumn(column_name='', dimension=31, default_value=None, dtype=tf.int64, normalizer=None),), 'optimizer': None, 'activation_fn': <function relu at 0x104de26a8>, 'dropout': None, 'gradient_clip_norm': None, 'embedding_lr_multipliers': None, 'input_layer_min_slice_size': None})

In [6]:
# Persist the vocabulary order and the label order used to build train_X and
# train_Y. The context manager guarantees the file is flushed and closed even
# if pickling fails.
with open("training_data", "wb") as training_data_file:
    pickle.dump(
        {'all_uniq_words': all_uniq_words, 'uniq_ml_classes': uniq_ml_classes},
        training_data_file,
    )
