# Load Word Dictionary

In [1]:
import csv

# Load the two vocabulary lookup tables from CSV.
# newline='' is what the csv module expects for file objects, so that line
# endings and quoted fields containing newlines are handled by the reader.
with open('./data/word_dict.csv', mode='r', newline='') as infile:
    # word -> integer index; blank rows are skipped instead of raising IndexError
    word_dict = {row[0]: int(row[1]) for row in csv.reader(infile) if row}

with open('./data/index_dict.csv', mode='r', newline='') as infile:
    # integer index -> word (the reverse mapping)
    index_dict = {int(row[0]): row[1] for row in csv.reader(infile) if row}

# Index 0 is the placeholder this notebook uses for words missing from word_dict
index_dict[0] = 'UNKNOWN'

In [22]:
# Display the word -> index mapping loaded from word_dict.csv
word_dict

{'the': 1,
 'a': 2,
 'of': 3,
 '.': 4,
 ',': 5,
 'and': 6,
 'to': 7,
 'network': 8,
 'neural': 9,
 'is': 10,
 'for': 11,
 'in': 12,
 'The': 13,
 'an': 14,
 'data': 15,
 'are': 16,
 'by': 17,
 'A': 18,
 'input': 19,
 'system': 20,
 'with': 21,
 'output': 22,
 'or': 23,
 'from': 24,
 'which': 25,
 'be': 26,
 'as': 27,
 'that': 28,
 'on': 29,
 'signal': 30,
 'method': 31,
 'each': 32,
 'one': 33,
 'image': 34,
 'at': 35,
 'using': 36,
 'plurality': 37,
 'first': 38,
 'layer': 39,
 'can': 40,
 'includes': 41,
 'set': 42,
 'training': 43,
 'control': 44,
 'may': 45,
 'processing': 46,
 'based': 47,
 'values': 48,
 'second': 49,
 'information': 50,
 'signals': 51,
 'model': 52,
 'value': 53,
 'used': 54,
 'process': 55,
 'learning': 56,
 'neuron': 57,
 'least': 58,
 'pattern': 59,
 'such': 60,
 'device': 61,
 'between': 62,
 'time': 63,
 'In': 64,
 'circuit': 65,
 'vector': 66,
 'unit': 67,
 'having': 68,
 'into': 69,
 'neurons': 70,
 'apparatus': 71,
 'function': 72,
 'trained': 73,
 'more'

In [59]:
# Display the index -> word mapping loaded from index_dict.csv
index_dict

{1: 'the',
 2: 'a',
 3: 'of',
 4: '.',
 5: ',',
 6: 'and',
 7: 'to',
 8: 'network',
 9: 'neural',
 10: 'is',
 11: 'for',
 12: 'in',
 13: 'The',
 14: 'an',
 15: 'data',
 16: 'are',
 17: 'by',
 18: 'A',
 19: 'input',
 20: 'system',
 21: 'with',
 22: 'output',
 23: 'or',
 24: 'from',
 25: 'which',
 26: 'be',
 27: 'as',
 28: 'that',
 29: 'on',
 30: 'signal',
 31: 'method',
 32: 'each',
 33: 'one',
 34: 'image',
 35: 'at',
 36: 'using',
 37: 'plurality',
 38: 'first',
 39: 'layer',
 40: 'can',
 41: 'includes',
 42: 'set',
 43: 'training',
 44: 'control',
 45: 'may',
 46: 'processing',
 47: 'based',
 48: 'values',
 49: 'second',
 50: 'information',
 51: 'signals',
 52: 'model',
 53: 'value',
 54: 'used',
 55: 'process',
 56: 'learning',
 57: 'neuron',
 58: 'least',
 59: 'pattern',
 60: 'such',
 61: 'device',
 62: 'between',
 63: 'time',
 64: 'In',
 65: 'circuit',
 66: 'vector',
 67: 'unit',
 68: 'having',
 69: 'into',
 70: 'neurons',
 71: 'apparatus',
 72: 'function',
 73: 'trained',
 74: 'm

# Load Sentences

In [2]:
import numpy as np
import pandas as pd
from utils import get_model, find_closest, create_train_valid,  generate_output, guess_human

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Read in data
# Load the patent query results; the 'patent_date' column is parsed as dates
data = pd.read_csv('./data/neural_network_patent_query.csv',
                   parse_dates=['patent_date'])

# Copy the abstract column into a plain Python list
original_abstracts = [abstract for abstract in data['patent_abstract']]
original_abstracts

['" A ""Barometer"" Neuron enhances stability in a Neural Network System that, when used as a track-while-scan system, assigns sensor plots to predicted track positions in a plot/track association situation. The ""Barometer"" Neuron functions as a bench-mark or reference system node that equates a superimposed plot and track to a zero distance as a ""perfect"" pairing of plot and track which has a measured/desired level of inhibition. The ""Barometer"" Neuron responds to the System inputs, compares these inputs against the level of inhibition of the ""perfect"" pair, and generates a supplied excitation or inhibition output signal to the System which adjusts the System to a desired value at or near 1.0; this the reference level of inhibition of the ""perfect"" pair. "',
 '" This invention is a novel high-speed neural network based processor for solving the ""traveling salesman"" and other global optimization problems. It comprises a novel hybrid architecture employing a binary synaptic 

In [4]:
# from keras.preprocessing.text import Tokenizer
import re


example = 'This is a short sentence (1) with one reference to an image. This next sentence, while non-sensical, does not have an image and has two commas.'

def format_patent(patent):
    """Separate punctuation from words and strip numeric references.

    Three regex passes are applied in order:

    1. Insert a space *before* each of ``. , ; ?`` when the preceding
       character is neither whitespace nor a digit (so ``1.5`` is untouched).
    2. Delete parenthesised numbers such as ``(1)`` (figure/citation refs).
    3. Collapse every run of two or more whitespace characters into a single
       space, so that splitting on ``" "`` yields no empty tokens from
       repeated spaces.

    Args:
        patent: The raw abstract text (``str``).

    Returns:
        The formatted text (``str``).
    """

    # Add a space in front of punctuation that directly follows a word
    patent = re.sub(r'(?<=[^\s0-9])(?=[.,;?])', r' ', patent)

    # Remove references to figures
    patent = re.sub(r'\((\d+)\)', r'', patent)

    # Collapse whitespace runs of any length (a plain '\s\s' only merges pairs,
    # leaving a double space behind whenever three or more occur in a row)
    patent = re.sub(r'\s{2,}', ' ', patent)
    return patent

format_patent(example)

'This is a short sentence with one reference to an image . This next sentence , while non-sensical , does not have an image and has two commas .'

In [5]:
# Run every original abstract through format_patent, preserving order
formatted = [format_patent(abstract) for abstract in original_abstracts]

len(formatted)

3522

## Convert sequences into indexes

In [6]:
# Tokenise one formatted abstract by splitting on single spaces
seq = formatted[100].split(" ")

# Map each token to its vocabulary index; tokens absent from word_dict become 0
index_seq = [word_dict.get(token, 0) for token in seq]

print(len(seq), len(index_seq))
index_seq

125 125


[13,
 91,
 75,
 125,
 14,
 71,
 6,
 2,
 31,
 11,
 566,
 6,
 560,
 34,
 130,
 36,
 2,
 3395,
 9,
 8,
 5,
 300,
 2988,
 7591,
 342,
 46,
 366,
 10183,
 1,
 1343,
 4,
 13,
 3395,
 9,
 8,
 5,
 25,
 10,
 345,
 3,
 4735,
 6,
 2029,
 5,
 2987,
 1481,
 10184,
 140,
 12,
 2,
 10185,
 511,
 67,
 7,
 22,
 1,
 282,
 178,
 11,
 1,
 282,
 92,
 4246,
 3,
 1,
 869,
 145,
 1625,
 3,
 1,
 92,
 4,
 13,
 91,
 75,
 996,
 7591,
 34,
 59,
 156,
 6,
 81,
 47,
 29,
 342,
 46,
 5,
 25,
 10,
 1,
 1871,
 155,
 12,
 9,
 8,
 223,
 5,
 1522,
 3175,
 9,
 83,
 6,
 3175,
 8,
 148,
 684,
 223,
 16,
 153,
 7,
 188,
 295,
 24,
 1,
 34,
 19,
 350,
 7,
 1,
 59,
 566,
 6,
 560,
 350,
 4]

## Optional: Compile C program

If the program has already been compiled, simply skip this step.

The compiled program is in ./sim.

In [10]:
# Rebuild the C program in ./sim: run the clean target first, then the default target
!make clean -C ./sim
!make -C ./sim

make: Entering directory '/home/jwq/6868esp/C-implementation-of-RNN/python_demo/sim'
MAKE: CLEAN /home/jwq/6868esp/C-implementation-of-RNN/python_demo
make: Leaving directory '/home/jwq/6868esp/C-implementation-of-RNN/python_demo/sim'
make: Entering directory '/home/jwq/6868esp/C-implementation-of-RNN/python_demo/sim'
MAKE: CC fc.o
MAKE: CC rnn.o
MAKE: CC init.o
MAKE: CC main.o
MAKE: CC utils.o
MAKE: CC softmax.o
MAKE: CC activation.o
MAKE: LINK c-rnn
make: Leaving directory '/home/jwq/6868esp/C-implementation-of-RNN/python_demo/sim'


# Run C program

In [11]:
# Number of word indexes written to the sequence file per call
FEED_LEN = 50


def write_seq(index_seq, start_idx, path="./tmp/seq.txt"):
    """Write a window of an index sequence to a text file.

    The window is ``index_seq[start_idx : start_idx + FEED_LEN]``, i.e. at most
    ``FEED_LEN`` items (fewer if the sequence ends first). Each item is
    written as a decimal integer followed by a single space, all on one line.
    Any existing file at ``path`` is overwritten.

    Args:
        index_seq: Sequence of integer word indexes.
        start_idx: Position of the first item to write.
        path: Destination file. Defaults to ``./tmp/seq.txt``, the location
            this function has always written to.
    """
    import os  # local import so the function does not rely on another cell

    # Create the target directory on first use instead of failing with
    # FileNotFoundError when ./tmp has not been created yet.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    window = index_seq[start_idx: start_idx + FEED_LEN]
    with open(path, 'w') as f:
        f.write("".join("%d " % item for item in window))

## Predict the next word from the original sequence

Our language model always predicts the next word given the 50 preceding ORIGINAL words.

In [25]:
def arg_topk(arr, k):
    """Return the indexes of the ``k`` largest values, largest first.

    Args:
        arr: 1-D array (or sequence) of comparable values.
        k: How many indexes to return. If ``k`` exceeds the number of
            elements, all indexes are returned; if ``k <= 0`` the result is
            empty.

    Returns:
        1-D integer ``numpy.ndarray`` of positions into ``arr``, ordered from
        the largest value downwards.
    """
    arr = np.asarray(arr)
    if k <= 0:
        # Without this guard, arr.argsort()[-0:] is the *whole* array, so
        # asking for zero items would return every index.
        return np.empty(0, dtype=np.intp)
    return arr.argsort()[-k:][::-1]

def list_to_arr(ls):
    """Convert a list of number-like items (e.g. numeric strings) to a float64 array.

    Each element is passed through ``np.float64``; the result is a 1-D array
    with one entry per element of ``ls``, in the same order.
    """
    converted = [np.float64(item) for item in ls]
    return np.array(converted, dtype=np.float64)

In [31]:
print("length of sentence: {}".format(len(index_seq)))

# Slide a FEED_LEN-item window over the original sequence. For each start
# position i, the window index_seq[i : i + FEED_LEN] is written to disk, the
# C program is run, and the indexes of its 3 largest output values are printed.
# NOTE(review): with "- FEED_LEN - 1" the last window starts at
# len(index_seq) - FEED_LEN - 2, so the final item of index_seq is never the
# "next word" of any window -- confirm whether the extra -1 is intended.
for i in range(len(index_seq) - FEED_LEN - 1):
    write_seq(index_seq, i)
    # IPython shell capture: the program's stdout lines become a list of strings
    # (presumably one probability per vocabulary index -- TODO confirm)
    prob_str = !./sim/c-rnn
    prob_arr = list_to_arr(prob_str)
    topk = arg_topk(prob_arr, 3)
    print(topk)
#     predict_idx = int(temp[0])
#     print("predicted result: {}\tactual result: {}".format(predict_idx, index_seq[FEED_LEN + i]))

length of sentence: 125
[  9 817 234]
[511 260 198]
[ 67 249 147]
[   4   11 6885]
[5975  127  109]
[ 1  2 14]
[22 19 87]
[22  7 63]
[3 4 7]
[ 1 32  2]
[38  9 19]
[63  9 92]
[ 4  5 10]
[5 2 4]
[ 1  2 14]
[19 22 87]
[ 3  4 34]
[ 2  1 63]
[ 4  3 48]
[ 1  2 14]
[ 9 37 19]
^C
[]
[13 18 64]
[ 9 31 20]
[  75 3570   63]
[ 10  85 125]
[7591    2  978]
[34 43 15]
[15 81 46]
[81  5 10]
[ 138    4 6929]
[  2   1 125]
[ 3 15  4]
[ 29 240 585]
[ 1  2 25]
[984  46  15]
[  6 217 593]
[  6 162   1]
[10 16 40]
[ 2 54 73]
[ 282 1871  178]
[ 3 11 28]
[17 24 12]
[  1   2 122]
[ 83   8 501]
[5 4 6]
[5 6 4]
[6 1 7]
[3 1 2]
[ 21   8 124]
[ 83   8 501]
[5 4 7]
[  1 150  46]
[ 8 21  9]
[4 5 7]
[5 4 7]
[5 6 3]
[60  7 28]
[54 78 73]
[ 7 27 36]
[ 1  2 14]
[ 1  3 43]
[ 3  4 11]
[ 1  2 14]
[ 9 19 34]
[  15    4 1022]
[30  6 15]
[7 6 5]
[  1   2 109]
[ 9 38 19]
[ 4 81  3]
[61  6  5]
[   1    2 2402]
[  1   2 788]


## predict next word given a seed sequence

Given 50 start words, the language model will generate sequences without referring to the original sequence.

## Greedy
If we use greedy decoding (always choosing the word with the highest probability), we may get a loop of repeated words...

In [39]:
from termcolor import colored

def print_seq(seq, index_dict):
    """
    Print the words for an index sequence, preceded by a colour legend.

    The first FEED_LEN words are printed in blue, the last word in red and
    everything in between uncoloured. The last word is then printed once more
    on its own line, in red, under a "new word:" heading.
    """
    # Colour legend
    print(colored("blue: ", 'blue'), end="")
    print("start sequence")
    print("black: sequence generated by our language model")
    print(colored("red: ", 'red'), end="")
    print("current prediction\n\n")

    final_pos = len(seq) - 1
    for pos, idx in enumerate(seq):
        token = index_dict[idx]
        if pos < FEED_LEN:
            token = colored(token, 'blue')
        elif pos == final_pos:
            token = colored(token, 'red')
        print(token, end=" ")

    print("\n\nnew word: ")
    newest_word = index_dict[seq[-1]]
    print(colored(newest_word, 'red'))
    
def random_choose(idx_arr):
    """Return one element of ``idx_arr`` chosen uniformly at random via NumPy's global RNG."""
    position = np.random.randint(0, len(idx_arr))
    return idx_arr[position]

## Some randomness
Choose, e.g., the top 3 most likely words and randomly pick one of them.

### Randomly pick one of the top 3 most likely words

In [42]:
from IPython.display import clear_output

# Number of new words to generate
output_len = 100

# Seed the generated sequence with the first FEED_LEN items of the original
# (slicing makes a copy, so index_seq itself is not modified by the appends below)
my_seq = index_seq[:FEED_LEN]
      
for i in range(output_len):
    # Feed the window my_seq[i : i + FEED_LEN] to the C program; my_seq grows by
    # one item per iteration, so (assuming the seed held FEED_LEN items) this is
    # always its most recent FEED_LEN items.
    write_seq(my_seq, i)
    # IPython shell capture: stdout lines of the program as a list of strings
    prob_str = !./sim/c-rnn
    prob_arr = list_to_arr(prob_str)
    # Indexes of the 3 largest output values
    topk = arg_topk(prob_arr, 3)
#     print(topk)
    # Sample one of those candidates uniformly and append it to the sequence
    predict_idx = random_choose(topk)
    my_seq.append(predict_idx)
    # Redraw the whole sequence in place on every step
    clear_output()
    print_seq(my_seq, index_dict)
#     print("\n\nnew word: {}".format(index_dict[my_seq[-1]]))
#     print("predicted result: {}\tactual result: {}".format(predict_idx, index_seq[FEED_LEN + i]))

[34mblue: [0mstart sequence
black: sequence generated by our language model
[31mred: [0mcurrent prediction


[34mThe[0m [34mpresent[0m [34minvention[0m [34mprovides[0m [34man[0m [34mapparatus[0m [34mand[0m [34ma[0m [34mmethod[0m [34mfor[0m [34mclassifying[0m [34mand[0m [34mrecognizing[0m [34mimage[0m [34mpatterns[0m [34musing[0m [34ma[0m [34msecond-order[0m [34mneural[0m [34mnetwork[0m [34m,[0m [34mthereby[0m [34machieving[0m [34mhigh-rate[0m [34mparallel[0m [34mprocessing[0m [34mwhile[0m [34mlowering[0m [34mthe[0m [34mcomplexity[0m [34m.[0m [34mThe[0m [34msecond-order[0m [34mneural[0m [34mnetwork[0m [34m,[0m [34mwhich[0m [34mis[0m [34mmade[0m [34mof[0m [34madders[0m [34mand[0m [34mmultipliers[0m [34m,[0m [34mcorrects[0m [34mpositional[0m [34mtranslations[0m [34mgenerated[0m [34min[0m [34ma[0m variety of real time , to obtain the output y(t) the post output layer of each of the input

### Randomly pick one of the top 2 most likely words

In [43]:
from IPython.display import clear_output

# Number of new words to generate
output_len = 200

# Seed the generated sequence with the first FEED_LEN items of the original
# (slicing makes a copy, so index_seq itself is not modified by the appends below)
my_seq = index_seq[:FEED_LEN]
      
for i in range(output_len):
    # Feed the window my_seq[i : i + FEED_LEN] to the C program; my_seq grows by
    # one item per iteration, so (assuming the seed held FEED_LEN items) this is
    # always its most recent FEED_LEN items.
    write_seq(my_seq, i)
    # IPython shell capture: stdout lines of the program as a list of strings
    prob_str = !./sim/c-rnn
    prob_arr = list_to_arr(prob_str)
    # Indexes of the 2 largest output values
    topk = arg_topk(prob_arr, 2)
#     print(topk)
    # Sample one of those candidates uniformly and append it to the sequence
    predict_idx = random_choose(topk)
    my_seq.append(predict_idx)
    # Redraw the whole sequence in place on every step
    clear_output()
    print_seq(my_seq, index_dict)
#     print("\n\nnew word: {}".format(index_dict[my_seq[-1]]))
#     print("predicted result: {}\tactual result: {}".format(predict_idx, index_seq[FEED_LEN + i]))

[34mblue: [0mstart sequence
black: sequence generated by our language model
[31mred: [0mcurrent prediction


[34mThe[0m [34mpresent[0m [34minvention[0m [34mprovides[0m [34man[0m [34mapparatus[0m [34mand[0m [34ma[0m [34mmethod[0m [34mfor[0m [34mclassifying[0m [34mand[0m [34mrecognizing[0m [34mimage[0m [34mpatterns[0m [34musing[0m [34ma[0m [34msecond-order[0m [34mneural[0m [34mnetwork[0m [34m,[0m [34mthereby[0m [34machieving[0m [34mhigh-rate[0m [34mparallel[0m [34mprocessing[0m [34mwhile[0m [34mlowering[0m [34mthe[0m [34mcomplexity[0m [34m.[0m [34mThe[0m [34msecond-order[0m [34mneural[0m [34mnetwork[0m [34m,[0m [34mwhich[0m [34mis[0m [34mmade[0m [34mof[0m [34madders[0m [34mand[0m [34mmultipliers[0m [34m,[0m [34mcorrects[0m [34mpositional[0m [34mtranslations[0m [34mgenerated[0m [34min[0m [34ma[0m neural net which has a function of the neural networks to generate an output signal . The o