In [6]:
from collections import defaultdict
import re

In [79]:
class Neuron:
    def __init__(self, key, sequence):
        self.key = key
        self.sequence = sequence
        self.next = []
        self.last = []

    def link_nexts(self, n_next):
        self.next.append(n_next)
        n_next.link_last(self)
    
    def link_last(self, n_last):
        self.last.append(n_last)
    
    def set_sequence(self, sequence):
        self.sequence = sequence
    def __repr__(self):
        return "<<s:{} n:{}>>".format(self.sequence, self.next)
        
class Sequemem:
    def __init__(self, uuid):
        self.uuid = uuid
        self.n_init = Neuron('<init>', '<start>')
        self.layer = defaultdict(list)
        self.predicted = []
        self.active = []

    def reset(self):
        self.predicted = self.n_init.next[:]
        self.active = [self.n_init]
    
    def predict(self, str_sentence):
        self.reset()
        words = self.get_word_array(str_sentence)
        for idx, word in enumerate(words):
            current_active = self.active[:]
            if self.hit(word):
                continue
            else:
                neuron =  Neuron(word, "#".join(words[:(idx+1)]))
                self.layer[word] = neuron
                for n in current_active:
                    n.link_nexts(neuron)
                  
        return [neuron.key for neuron in self.predicted]
            
        
    def get_word_array(self, str_sentence):
        return re.compile(r'\w+').findall(str_sentence)
    
    def hit(self, word):
        # First for this word, move all hits to active
        while len(self.predicted) != 0:
            neuron = self.predicted.pop()
            if neuron.key == word:
                self.active.append(neuron)
        
        # Second, take any actives and set their nexts to predicted
        self.predicted = [n for neuron in self.active if neuron.next for n in neuron.next]
            
        # IF we have any predicts, we're good, else signal end
        return True if len(self.predicted) > 0 else False

    def __repr__(self):
        return "{}\n{}\n{}\n{}\n{}\n".format(
            self.uuid,
            self.n_init,
            self.layer,
            self.predicted,
            self.active
        )
            
            
            

In [80]:
seq = Sequemem('adz')
seq.get_word_array("The quick brown fox 22")

['The', 'quick', 'brown', 'fox', '22']

In [81]:
seq.predict("The quick brown fox")

['The']

In [82]:
seq

adz
<<s:<start> n:[<<s:The n:[]>>]>>
defaultdict(<type 'list'>, {'The': <<s:The n:[]>>})
[<<s:The n:[]>>]
[<<s:<start> n:[<<s:The n:[]>>]>>]

In [2]:
layer = LayerCount()

files = [
   'data/00_clean/sent_alice_in_wonderland.txt',
   'data/00_clean/sent_andersens_fairy_tales_pg1597.txt',
   'data/00_clean/sent_cats_of_ulthar.txt',
   'data/00_clean/sent_fairy_tales.txt',
   'data/00_clean/sent_grimms_fairy_tales.txt',
   'data/00_clean/sent_iris_fairy_tales.txt',
   'data/00_clean/sent_jungle_book_236-0.txt',
   'data/00_clean/sent_king_james_bible.txt',
   'data/00_clean/sent_shakespear.txt',
   'data/00_clean/sent_tao_te_king.txt',
   'data/00_clean/sent_the_prince.txt',
   'data/00_clean/sent_thousand_and_one.txt',
   'data/00_clean/sents_fables_la_fontaine.txt',
]

for f in files:
    print("starting: ", f)
    layer.load_from_file(f, lower=True)

layer.get_frequency_dict();
print("All Done!")

starting:  data/00_clean/sent_alice_in_wonderland.txt
Loading file data/00_clean/sent_alice_in_wonderland.txt
0
starting:  data/00_clean/sent_andersens_fairy_tales_pg1597.txt
Loading file data/00_clean/sent_andersens_fairy_tales_pg1597.txt
0
starting:  data/00_clean/sent_cats_of_ulthar.txt
Loading file data/00_clean/sent_cats_of_ulthar.txt
0
starting:  data/00_clean/sent_fairy_tales.txt
Loading file data/00_clean/sent_fairy_tales.txt
0
starting:  data/00_clean/sent_grimms_fairy_tales.txt
Loading file data/00_clean/sent_grimms_fairy_tales.txt
0
starting:  data/00_clean/sent_iris_fairy_tales.txt
Loading file data/00_clean/sent_iris_fairy_tales.txt
0
starting:  data/00_clean/sent_jungle_book_236-0.txt
Loading file data/00_clean/sent_jungle_book_236-0.txt
0
starting:  data/00_clean/sent_king_james_bible.txt
Loading file data/00_clean/sent_king_james_bible.txt
0
10000
20000
30000
40000
50000
60000
starting:  data/00_clean/sent_shakespear.txt
Loading file data/00_clean/sent_shakespear.txt
0


In [3]:
layer.jaccard('king','king')

Re-iitializing dictionary


0.999999999990909

In [18]:
layer.comparison_frequencies('marcellus',window_size=10,ratio=0.75,cutoff=25)[2]

['marcellus',
 'horatio',
 'enter',
 'it',
 'bernardo',
 'watch',
 'officer',
 'myself',
 'bell',
 'then',
 'beating',
 'burns',
 'now',
 'where',
 'heaven',
 'part',
 'illume',
 'caius',
 'wife',
 'she',
 'sir',
 'true',
 'stop',
 'speak',
 'stay',
 'on']

In [59]:
def save_compare_freqs_to_file(window_size, ratio=0.9, cutoff=25):
    w = str(window_size)
    r = str(ratio).replace('.', '_')
    c = str(cutoff)
    with open('word_compare_freqs_size_{}_ratio_{}_cutoff_{}.csv'.format(w, r,c), 'w') as target:
        for word in layer.d_w_uber_freq.keys():
            lst_wrds = layer.comparison_frequencies(word, window_size=window_size,ratio=ratio, cutoff=cutoff)[2]
            target.write("{}\t{}\n".format(word, "\t".join(lst_wrds)))

save_compare_freqs_to_file(15)            
print("All Done!")

All Done!


In [60]:
def load_word_from_line(line, d):
    """asume tab separated lines"""
    words = line.split('\t')
    d[words.pop(0)] = words
        
word_5 = {}
with open('word_compare_freqs_size_15_ratio_0_9_cutoff_25.csv', 'r') as source:
    for line in source:
        load_word_from_line(line, word_5)

print("dict loaded!")

dict loaded!


In [61]:
def jaccard_local(lst1, lst2):
    """Calculate jaccard distance between two words from their graphical representation
    as calculated with window_size and ratio.  Keep the number of cutoff words for the calculation.
    Args:
        w1: string, the first word for comparison
        w2: string, the second word for comparison
        window_size: int, go +- this number of words to calculate frequencies for graphical calc.
        ratio: float, use this as cutoff in graphical calculation
        cutoff: int, from graphical calculation, keep this number of words, the jaccard distance
                is then calculated as the overlap of these two sets.
    """

    u = len(set(lst1).union(set(lst2))) + 0.0000000001
    i = len(set(lst1).intersection(set(lst2)))
    return float(i)/float(u)

In [93]:
bust = 0
threshold = 0.50
w1 = 'morgiana'
w1nbors = word_5[w1]
best_matches = []
for word, nbors in word_5.items():

    if word == w1:
        continue

    score = jaccard_local(nbors, w1nbors)
    if score > threshold:
        print(word,score)
        best_matches.append((word,score))

across 0.5294117647043253
ali 0.5294117647043253
baba 0.5294117647043253
looked 0.5294117647043253
mustapha 0.5294117647043253
perceiving 0.5294117647043253
trying 0.5294117647043253


In [4]:
layer.get_counts_for_specific_key(key='king', window_size=2, direction=1).most_common(10)

[('king', 3574),
 ('of', 1184),
 ('and', 431),
 ('henry', 402),
 ('richard', 277),
 ('edward', 149),
 ('john', 124),
 ('said', 118),
 ('s', 112),
 ('that', 103)]

In [5]:
for idx in range(2,20):
    print(layer.comparison_frequencies('wolf',window_size=idx,ratio=0.9,cutoff=15)[2])

['wolf', 'father', 'mother', 'lone', 'sir', 'pack', 'ran', 'whose', 'seeonee', 's', 'replied', 'ravenous', 'got', 'shook', 'sleeping', 'stole']
['wolf', 'a', 'said', 'father', 'mother', 'or', 'lone', 'sir', 'replied', 'pack', 'ran', 'dead', 'himself', 'lay', 'fox', 'whose']
['wolf', 'a', 'said', 'father', 'mother', 'had', 'or', 'would', 'fox', 'lone', 'sir', 'replied', 'pack', 'himself', 'ran', 'bear']
['wolf', 'the', 'a', 'said', 'father', 'had', 'mother', 'would', 'fox', 'or', 'lone', 'pack', 'into', 'sir', 'replied', 'himself']
['wolf', 'the', 'a', 'said', 'father', 'had', 'mother', 'would', 'fox', 'or', 'if', 'up', 'who', 'pack', 'lone', 'sheep']
['the', 'wolf', 'a', 'said', 'as', 'father', 'had', 'mother', 'would', 'or', 'fox', 'if', 'up', 'who', 'pack', 'sheep']
['the', 'wolf', 'a', 'his', 'said', 'as', 'father', 'had', 'was', 'would', 'mother', 'fox', 'or', 'if', 'at', 'up']
['the', 'wolf', 'a', 'to', 'he', 'his', 'said', 'as', 'had', 'was', 'father', 'but', 'mother', 'or', 'fox

In [6]:

#layer.get_counts_for_specific_key('fox').most_common()
layer.window_size = 1
layer.get_frequency_dict()

cutoff = 100
ratio = 0.25
window_range = [10,15]
w1 = 'god'
w2 = 'jesus'
print(len(layer.get_frequency_dict()[1]))

profile = [jaccard(w1, w2, _w=_wdx, _c=cutoff, _r=ratio) for _wdx in window_range];
print(profile)
plt.axis([0, len(window_range), 0, 1])


39598


NameError: name 'jaccard' is not defined

In [None]:
plt.plot(profile)

In [None]:
cutoff = 100
ratio = 0.15
window_range = 10
w1 = 'queen'
_,_,lst = layer.comparison_frequencies(w1, window_size=window_range,ratio=ratio, cutoff=cutoff)
print(lst[:10])
words = layer.get_frequency_dict()[1].keys()
print(len(words))
high = 0
best_matches = []
bust = 0
threshold = 0.019
for word in words:
    bust += 1
    if word == w1:
        continue

    score = jaccard(w1, word, _w=window_range, _c=cutoff, _r=ratio)
    if bust % 10000 == 0: print("progress: ",bust)
    if score > threshold:
        print(word,score)
        best_matches.append((word,score))
        
print("Best match is {} with a score of {}".format(best_match, high))

In [None]:
print(best_matches)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
print(len(layer.get_frequency_dict()[1]))
a = np.random.random((128, 128))
plt.imshow(a, cmap='hot', interpolation='nearest')
plt.show()
layer.get_frequency_dict()[1]["Jesus"]

In [None]:
layer.get_frequency_dict()[1]["Jesus"]

In [None]:
l = ['a','b','c']
var = l.pop(0)
r = l[:]
l.pop(0)
l
r

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
cnt, dct = layer.initialize_frequency_dict()

In [None]:
# comparison_frequencies
layer.comparison_frequencies('men', ratio=0.5, cutoff=50, visualize_it=False);

In [None]:
layer.get_counts_for_specific_key('love').most_common(100)

In [None]:
# head_keys = layer.columns.keys()
# neurons = list({neuron for column in layer.columns.values() for neuron in column})
# one_hot_raw = np.eye(len(head_keys), dtype=int)
# len(one_hot_raw)
# d_head_keys = {}
# for idx, word in enumerate(head_keys):
#     d_head_keys[word] = one_hot_raw[idx]
# len(d_head_keys)
# ar_sent_vecs = np.zeros((len(neurons),len(head_keys)))
# ar_sent_vecs.shape

In [None]:
# headkeys = layer.columns.keys()
# neurons = list({neuron for column in layer.columns.values() for neuron in column})
# print(len(neurons)) # 481334 as list
#                     #  32796 as set

# for idx, neuron in enumerate(neurons):
#     if idx % 1000 == 0:
#         print(idx)
#     for _key in neuron.keys:
#         ar_sent_vecs[idx] = np.add(ar_sent_vecs[idx], d_head_keys[_key])
                

# ar_sent_vecs.shape
        

In [None]:
# np.sum(ar_sent_vecs[100])

In [None]:
# from sklearn.cluster import KMeans
# class KMeans():
#     def compute_clusters(self, X, centers):
#         return np.argmin([np.linalg.norm(X-c, axis=1) for c in centers], axis=0)
#     def compute_centers(self, X, clusters):
#         return np.array([X[clusters == c,].mean(0) for c in set(clusters)])
#     def fit(self, X, k, n_iter=100):
#         print('A')
#         clusters = self.compute_clusters(X, np.array(random.sample(list(X), k)))
#         for idx in range(n_iter):
#             print(idx)
#             #if idx % 10 == 0: print(idx)
#             centers = self.compute_centers(X, clusters)
#             clusters = self.compute_clusters(X, centers)
#         return clusters

In [None]:

# import numpy as np
# X = np.array([[1, 2], [1, 4], [1, 0],
#               [4, 2], [4, 4], [4, 0]])
# kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
# kmeans.labels_
# #> array([0, 0, 0, 1, 1, 1], dtype=int32)
# kmeans.predict([[0, 0], [4, 4]])
# #> array([0, 1], dtype=int32)
# kmeans.cluster_centers_
# #> array([[ 1.,  2.],
# #>     [ 4.,  2.]])

In [None]:
# %time
# clusters = KMeans(n_clusters=64, random_state=0).fit(ar_sent_vecs[:2000])
# clusters.labels_

In [None]:
# clusters.labels_[200:300]