In [37]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
import keras
import numpy as np
from keras.preprocessing.text import Tokenizer
from nltk.stem import PorterStemmer 
from nltk.lm import Vocabulary
import glob
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
import pickle
import nltk
import ssl
%load_ext autotime

import warnings
# Silence every Python warning for the rest of the session; this also hides
# deprecation notices raised by the libraries imported in this cell.
warnings.filterwarnings('ignore')

# Swap the ssl module's default HTTPS context factory for the unverified one,
# when this Python exposes it.
# NOTE(review): this disables certificate verification for code that relies on
# ssl's default HTTPS context. Presumably a workaround so nltk.download() works
# on this machine -- confirm before reusing the notebook elsewhere.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No ssl._create_unverified_context available: leave the defaults alone.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download the perceptron POS-tagger resource.
# NOTE(review): presumably the data nltk.pos_tag needs later on -- confirm.
nltk.download('averaged_perceptron_tagger')

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Ani\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

time: 7.98 ms


In [1]:
# Print the local compute devices reported by TensorFlow's device_lib.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3659747947308028782
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 10189863649
locality {
  bus_id: 1
  links {
  }
}
incarnation: 5578378573520185249
physical_device_desc: "device: 0, name: GeForce GTX TITAN X, pci bus id: 0000:3d:00.0, compute capability: 5.2"
]


## Map tags to meanings and examples

In [34]:
# Lookup table from POS tag code to a human-readable description with an
# example. run() indexes it with the predicted tag when printing suggestions.
tagmap = {
'CC': 'coordinating conjunction',
'CD': 'cardinal digit',
'DT': 'determiner',
'EX': 'existential there (like: “there is”)',
'FW': 'foreign word',
'IN': 'preposition/subordinating conjunction',
'JJ' :'adjective ‘big’',
'JJR' :'adjective, comparative ‘bigger’',
'JJS': 'adjective, superlative ‘biggest’',
'LS': 'list marker 1)',
'MD': 'modal, could, will',
'NN': 'noun, singular ex:‘desk',
'NNS': 'noun plural ex:‘desks',
'NNP': 'proper noun, singular ex:‘Harrison’',
'NNPS':'proper noun, plural ex:‘Americans’',
'PDT': 'predeterminer ex:‘all the kids’',
'POS': 'possessive ending ex:parent’s',
'PRP': 'personal pronoun ex:I, he, she',
'PRP$':'possessive pronoun ex:my, his, hers',
'RB': 'adverb ex:very, silently,',
'RBR': 'adverb, comparative ex:better',
'RBS': 'adverb, superlative ex:best',
'RP': 'particle give up',
'TO': 'to go ex:‘to’ the store.',
'UH': 'interjection, ex:errrrrrrrm',
'VB': 'verb, base form ex:take',
'VBD': 'verb, past tense ex:took',
'VBG': 'verb, gerund/present participle ex:taking',
'VBN': 'verb, past participle ex:taken',
'VBP': 'verb, sing. present, non-3d ex:take',
'VBZ': 'verb, 3rd person sing. present ex:takes',
'WDT': 'wh-determiner ex:which',
'WP': 'wh-pronoun ex:who, what',
'WP$': 'possessive wh-pronoun ex:whose',
'WRB': 'wh-abverb ex:where, when'
}

time: 22.9 ms


## Web scraping data about AI


In [4]:
import bs4 as bs
import urllib.request
import re
import nltk

# Download the Wikipedia article that serves as the training corpus. The
# response is used as a context manager so the HTTP connection is closed as
# soon as the body has been read.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence') as scrapped_data:
    article = scrapped_data.read()

parsed_article = bs.BeautifulSoup(article,'lxml')

# Only the text inside <p> elements is kept.
paragraphs = parsed_article.find_all('p')

# Concatenate the paragraph texts in document order (single join instead of
# repeated string concatenation).
article_text = "".join(p.text for p in paragraphs)

# Normalise: lower-case, turn every non-letter character (digits, punctuation,
# newlines) into a space, then collapse whitespace runs to one space.
corpus = article_text.lower()
corpus = re.sub('[^a-zA-Z]', ' ', corpus)
corpus = re.sub(r'\s+', ' ', corpus)

time: 1.29 s


In [6]:
# Context window length: the models predict a token (or its POS tag) from the
# `ngram_size` tokens that precede it.
ngram_size = 3

time: 1.02 ms


## Tf-idf trigram prediction model

In [7]:
# Tokenise the cleaned corpus and wrap the tokens in an nltk.lm Vocabulary.
words = nltk.word_tokenize(corpus)
vocab = Vocabulary(words)
# Tf-idf vectorizer restricted to that vocabulary, fitted on the corpus treated
# as a single document.
vectorizer = TfidfVectorizer(vocabulary=vocab)
# NOTE(review): this X is not read by any later cell visible in the notebook;
# only the fitted `vectorizer` is reused.
X = vectorizer.fit_transform([corpus])

time: 114 ms


In [8]:
# Vocabulary terms ordered by tf-idf column index. Kept as a plain list because
# the rest of the notebook relies on list semantics (membership tests and
# positional indexing).
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement when it exists and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feats = list(vectorizer.get_feature_names_out())
else:
    feats = vectorizer.get_feature_names()

time: 1.98 ms


In [9]:
def preprocess(words, deep_learning=True):
    """Turn a token sequence into (context, next-word) training pairs.

    For every position ``i >= ngram_size`` whose token is one of ``feats``,
    the ``ngram_size`` preceding tokens are vectorised with the module-level
    ``vectorizer`` and paired with the position of ``words[i]`` in ``feats``.
    Positions whose token is not in ``feats`` are skipped.

    Parameters
    ----------
    words : sequence of str
        Tokens in corpus order.
    deep_learning : bool, default True
        If truthy, each context keeps its 2-D (ngram_size, n_features) layout
        and the target is one-hot encoded over ``len(feats)`` (CNN input).
        Otherwise the context is flattened to 1-D and the target stays an
        integer label (max-ent input).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Stacked contexts and stacked targets, in corpus order.
    """
    # Hoisted out of the loop: one dict lookup per token replaces the two
    # linear scans of `feats` (`in` followed by `.index`).
    feat_index = {}
    for position, feature in enumerate(feats):
        # setdefault keeps the first position, matching list.index().
        feat_index.setdefault(feature, position)
    n_feats = len(feats)

    X = []
    Y = []
    for i in range(ngram_size, len(words)):
        y = feat_index.get(words[i])
        if y is None:
            continue
        x = np.array(vectorizer.transform(words[i - ngram_size:i]).toarray())
        if deep_learning:
            # CNN wants categorical (one-hot) targets
            y = keras.utils.to_categorical(y, n_feats)
        else:
            # maxent model allows for label encoded targets
            x = x.flatten()
        X.append(x)
        Y.append(y)
    return np.array(X), np.array(Y)

time: 2.99 ms


In [21]:
# Build the max-ent training set: flattened tf-idf context vectors as X and
# integer next-word labels (positions in `feats`) as y.
X_train, y_train = preprocess(words, deep_learning=False)

time: 5.79 s


In [22]:
# Sanity check: (n_samples, ngram_size * n_features) after flattening.
X_train.shape

(13158, 9213)

time: 2.99 ms


In [23]:
from sklearn import linear_model

# Multinomial logistic regression (the "max-ent" word model): predicts the
# next word's label from the flattened tf-idf vectors of the preceding words.
# X_train / y_train must be the output of preprocess(words, deep_learning=False);
# the CNN section further down rebinds both names.
# The recorded run of this cell took about 1h 21min.
count_model = linear_model.LogisticRegression(multi_class='multinomial',solver='newton-cg',n_jobs=-1,verbose=2)
count_model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 81.6min finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=-1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=2, warm_start=False)

time: 1h 21min 38s


## Pos-tagging using trigram max-ent model

In [31]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Encode every window of `ngram_size` consecutive POS tags as one flattened
# one-hot vector; the target is the integer-encoded tag that follows the window.

label_encoder = LabelEncoder()
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was later removed, so try the new
# keyword first and fall back for older releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# Tag the whole corpus once and keep only the tag of each (word, tag) pair.
xtemp = [tag[1] for tag in nltk.pos_tag(words)]
int_encoded = label_encoder.fit_transform(xtemp)
# One row per token, one column per distinct tag.
encoded = onehot_encoder.fit_transform(int_encoded.reshape(len(int_encoded), 1))

X_encoded = []
y_encoded = []

for i in range(ngram_size,len(encoded)):
    # Concatenate the one-hot rows of the preceding `ngram_size` tags.
    x_encoded = encoded[i-ngram_size:i].flatten().astype(int)
    X_encoded.append(x_encoded)
    # Target is the label id (not the one-hot row) of the current tag.
    y_encoded.append(int_encoded[i])

y_encoded = np.array(y_encoded)
X_encoded = np.array(X_encoded)

time: 822 ms


In [32]:
# Sanity check: one integer tag label per training window.
y_encoded.shape

(13158,)

time: 3.01 ms


In [26]:
# Multinomial logistic regression over the one-hot tag windows: predicts the
# label id of the next POS tag. `linear_model` is imported in the cell that
# trains the word model, so that cell must have run first.
tag_model = linear_model.LogisticRegression(multi_class='multinomial',solver='newton-cg',n_jobs=-1,verbose=2)
tag_model.fit(X_encoded, y_encoded)

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   13.3s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=-1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=2, warm_start=False)

time: 14.2 s


## CNN-Deep Learning model semantic understanding

In [10]:
# Rebuild the training set for the CNN: 2-D contexts and one-hot targets.
# This rebinds X_train / y_train, replacing the arrays used by the max-ent
# word model.
X_train,y_train = preprocess(words, deep_learning=True)
# Append a trailing axis of size 1 so each sample is (rows, columns, 1), as fed
# to the Conv2D input layer.
X_train = X_train.reshape(X_train.shape[0],X_train.shape[1],X_train.shape[2],1)

time: 7.08 s


In [11]:
# Sanity check: (n_samples, ngram_size, n_features, 1) after the reshape.
X_train.shape

(13158, 3, 3071, 1)

time: 3.99 ms


In [12]:
# CNN next-word predictor: two convolution layers over the tf-idf context
# "image", then a dense classifier over the vocabulary.
cnn_model = Sequential()
# First layer: 32 filters of size 1x1; input shape is taken from one sample of
# X_train (everything but the batch axis).
cnn_model.add(Conv2D(32, kernel_size=(1, 1),
                 activation='relu',
                 input_shape=(X_train.shape[1],X_train.shape[2],X_train.shape[3])))
# Second layer: 32 filters of size 3x3.
cnn_model.add(Conv2D(32, (3, 3), activation='relu'))
#cnn_model.add(MaxPooling2D(pool_size=(2, 2)))
#cnn_model.add(Dropout(0.25))
cnn_model.add(Flatten())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dropout(0.5))
# One softmax unit per entry of `feats`, matching the one-hot targets built by
# preprocess(deep_learning=True).
cnn_model.add(Dense(len(feats), activation='softmax'))
cnn_model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

W1018 08:59:31.901607 23536 deprecation_wrapper.py:119] From c:\users\ani\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1018 08:59:31.953477 23536 deprecation_wrapper.py:119] From c:\users\ani\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1018 08:59:31.973415 23536 deprecation_wrapper.py:119] From c:\users\ani\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1018 08:59:32.051206 23536 deprecation_wrapper.py:119] From c:\users\ani\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecate

time: 306 ms


In [13]:
# Train on the full X_train / y_train for 50 epochs in mini-batches of 64.
# No validation data is passed, so the reported accuracy is training accuracy.
cnn_model.fit(X_train, y_train,
          batch_size=64,
          epochs=50,
          verbose=1) 

W1018 08:59:34.096933 23536 deprecation.py:323] From c:\users\ani\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1bdedbb8400>

time: 7min 18s


### Saving all 3 models

In [23]:
# NOTE: saving is currently disabled -- every statement in this cell is
# commented out, while the TESTING cell loads the two max-ent pickles from
# savefiles/.
# NOTE(review): the last dump refers to `model`, but the CNN trained in this
# notebook is bound to `cnn_model`; adjust the name before re-enabling.

# filename = 'savefiles/wordmaxent.sav'
# pickle.dump(count_model, open(filename, 'wb'))

# filename = 'savefiles/tagmaxent.sav'
# pickle.dump(tag_model, open(filename, 'wb'))

# filename = 'savefiles/CNNpredictor.sav'
# pickle.dump(model, open(filename, 'wb'))

time: 1.98 s


# TESTING


In [24]:
# Reload the two max-ent models from disk. Each file is opened in a `with`
# block so the handle is closed right after unpickling.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load .sav files that were produced by this notebook / a trusted source.
with open('savefiles/wordmaxent.sav', 'rb') as _model_file:
    count_model = pickle.load(_model_file)
with open('savefiles/tagmaxent.sav', 'rb') as _model_file:
    tag_model = pickle.load(_model_file)
#cnn_model = pickle.load(open('savefiles/CNNpredictor.sav', 'rb'))

time: 842 ms


In [25]:
def input_tag_to_prediction(inp):
    """Return the three most probable POS tags for the token following `inp`.

    Parameters
    ----------
    inp : str
        Context text. It is tokenised and POS-tagged with nltk; the tag
        sequence must contain as many tags as the windows `tag_model` was
        trained on, otherwise `predict_proba` rejects the feature row.

    Returns
    -------
    list
        Up to three tag strings, most probable first.

    Raises
    ------
    ValueError
        Propagated from `label_encoder.transform` when a tag was never seen
        while fitting the encoder.
    """
    tokens = nltk.word_tokenize(inp)
    tags = [tag for _word, tag in nltk.pos_tag(tokens)]
    tag_ids = label_encoder.transform(tags)
    # One one-hot row per tag, concatenated into a single feature row.
    features = onehot_encoder.transform(tag_ids.reshape(len(tag_ids), 1)).reshape(1, -1)
    probabilities = tag_model.predict_proba(features)[0]
    best_columns = (-probabilities).argsort()[:3]
    # predict_proba columns are ordered like tag_model.classes_, which only
    # contains label ids that occurred in the training targets. Translate the
    # column positions to label ids before decoding them, instead of assuming
    # column k is label id k.
    best_tag_ids = tag_model.classes_[best_columns]
    return list(label_encoder.inverse_transform(best_tag_ids))

time: 2.99 ms


In [26]:
def input_word_to_prediction(inp):
    """Return the three most probable next words according to the max-ent model.

    Parameters
    ----------
    inp : str
        Context text. It is tokenised with nltk and each token is vectorised
        with the module-level `vectorizer`; the token count must match the
        window size `count_model` was trained with, otherwise `predict_proba`
        rejects the feature row.

    Returns
    -------
    set of str
        Up to three entries of `feats`.
    """
    tokens = nltk.word_tokenize(inp)
    features = np.array(vectorizer.transform(tokens).toarray()).reshape(1, -1)
    probabilities = count_model.predict_proba(features)[0]
    best_columns = (-probabilities).argsort()[:3]
    # predict_proba columns follow count_model.classes_, i.e. only the `feats`
    # positions that actually occurred as training targets. Map the column
    # positions back to those positions before looking the words up.
    best_feature_ids = count_model.classes_[best_columns]
    return {feats[int(feature_id)] for feature_id in best_feature_ids}

time: 997 µs


In [27]:
def prediction_union(sent):
    """Query both max-ent predictors with the lower-cased sentence.

    Returns a 2-tuple: the result of input_word_to_prediction, and the first
    element of the result of input_tag_to_prediction.
    """
    lowered = sent.lower()
    word_candidates = input_word_to_prediction(lowered)
    tag_candidates = input_tag_to_prediction(lowered)
    best_tag = tag_candidates[0]
    return word_candidates, best_tag

time: 998 µs


In [28]:
def DLpredict(sent):
    """Return the CNN's three most probable next words for `sent`.

    Parameters
    ----------
    sent : str
        Context text. It is tokenised with nltk and vectorised token by token;
        the resulting (tokens, features) matrix must match the input shape
        `cnn_model` was built with.

    Returns
    -------
    list of str
        Three entries of `feats`, most probable first.
    """
    tokens = nltk.word_tokenize(sent)
    x = np.array(vectorizer.transform(tokens).toarray())
    # Add the batch axis in front and the channel axis at the end.
    x = x.reshape(1, x.shape[0], x.shape[1], 1)
    # The trained network is bound to `cnn_model`; the name `model` is not
    # defined by any cell of this notebook and failed on a fresh kernel.
    y_pred = cnn_model.predict(x)[0]
    best = (-y_pred).argsort()[:3]
    return [feats[index] for index in best]

time: 1.99 ms


In [29]:
def run(sent):
    """Interactively extend `sent` one user-chosen continuation at a time.

    For up to 16 rounds the function prints the sentence so far, prints the
    union of the max-ent and CNN word suggestions together with a description
    of the suggested POS tag, and reads the user's continuation from stdin.
    Typing ``exit`` stops early.

    Parameters
    ----------
    sent : str
        Seed text. Only its last `ngram_size` tokens are used as model context.

    Returns
    -------
    str
        The seed text followed by everything the user typed. The same value is
        returned whether the user typed ``exit`` or the round limit was reached.
    """
    out = sent
    # The models only accept a context of `ngram_size` tokens.
    sent = " ".join(nltk.word_tokenize(sent)[-ngram_size:])
    for _ in range(16):
        print(out)
        pw, pt = prediction_union(sent)
        cw = DLpredict(sent)
        # Fall back to the raw tag when it has no description in tagmap.
        print("predictions: "+str(pw.union(cw))+" -> model suggests a "+str(tagmap.get(pt, pt)))
        inp = input()
        if inp == "exit":
            return out
        new_tokens = nltk.word_tokenize(inp)
        if not new_tokens:
            # Empty input: keep sentence and context unchanged and ask again.
            continue
        out += " " + inp
        # Slide the window: keep the newest `ngram_size` tokens, which also
        # keeps the context well-formed when several words are typed at once.
        sent = " ".join((nltk.word_tokenize(sent) + new_tokens)[-ngram_size:])
    print(out)
    return out

time: 978 µs


In [None]:
# Interactive demo: run() reads each continuation from stdin; type "exit" to stop.
out = run("where is ai")

where is ai
predictions: {'in', 'might', 'research', 'to'} -> model suggests a noun, singular ex:‘desk
research
where is ai research
predictions: {'in', 'was', 'as'} -> model suggests a preposition/subordinating conjunction
heading
where is ai research heading
predictions: {'bird', 'to', 'drugs', 'the', 'and'} -> model suggests a noun, singular ex:‘desk
towards
where is ai research heading towards
predictions: {'in', 'artificial', 'to', 'creating', 'the', 'u'} -> model suggests a verb, sing. present, non-3d ex:take


In [56]:
# Display the value bound to `out` by the interactive session.
out

'artificial intelligence is the field of ai where the focus is not centered on carnegie mellon university'

time: 2.62 ms


In [18]:
# TODO: fix the removal of punctuation
# TODO: improve CNN accuracy and visualize layer contributions
# TODO: integrate into iOS as a Python script