In [None]:
# List the matplotlib backends available to this IPython session.
%matplotlib -l

In [None]:
import random
from operator import and_, not_, or_
import math
import copy

from jupyter_plotly_dash import JupyterDash
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import LEMMA
import en_core_web_lg

from freqer import *
from palettes import *
from qualmath import *
from miketools import *

%matplotlib qt

# spacy.require_gpu()

In [None]:
# Load the large English spaCy model; `nlp` is the pipeline object the
# later cells call to parse text.
nlp=en_core_web_lg.load()

In [None]:
# warning: make sure to have lots of RAM
# if you're going to feed it something large.

textfile = 'jekyllhyde.txt'
# Read inside a context manager so the file handle is closed as soon as the
# text is in memory (the bare open(...).read() left it to the garbage collector).
# NOTE(review): no explicit encoding is passed, so the platform default is used,
# exactly as before — confirm the file's encoding if this moves across machines.
with open(textfile) as text_handle:
    raw_text = text_handle.read()
# if len(raw_text) > 1000000:
#     nlp.max_length = int(len(raw_text) * 1.1)
# Keep the raw string and the parsed document under separate names;
# `active_text` is the parsed document that every later cell works with.
active_text = nlp(raw_text)

In [None]:
# Peek at tokens 1 through 19 of the parsed text: surface form,
# part-of-speech attribute and dependency attribute of each.
for token in active_text[1:20]:
    print(token.text, token.pos_, token.dep_)

## 

In [None]:
# Preview the `aqua` palette with bump=0.1.
# NOTE(review): `display_palette` and `aqua` are not defined in this notebook;
# presumably they come from the star-imported `palettes` module — confirm.
display_palette(aqua,bump=0.1)

In [None]:
# Spot check: the `pos_` attribute of the item at index 5 of what
# grab_tokens returns for these arguments.
grab_tokens(active_text, token_type="all", boring=False, pos_list=None)[5].pos_

In [None]:
# Build (and time) the two base structures used below:
#   lemmalist  - lemmatize() applied to the tokens grabbed from the text
#   freq_chart - prep_freq_chart() over the text and that lemma list
%time lemmalist = lemmatize(grab_tokens(active_text))
%time freq_chart = prep_freq_chart(active_text,lemmalist)

In [None]:
# Same pipeline as the full chart, but with grab_tokens restricted via
# pos_list=['PROPN'] (presumably proper nouns only — confirm in grab_tokens).
%time lemmalist_prop = lemmatize(grab_tokens(active_text,pos_list=['PROPN']))
%time freq_chart_prop = prep_freq_chart(active_text,lemmalist_prop)

In [None]:
# Words to leave out of the trimmed chart: a few very common verbs and honorifics.
trimwords = ['say','go','come','mr.','mrs.','miss']
# NOTE(review): `notdf` / `inloc` are star-imported helpers; presumably this
# keeps the rows of freq_chart whose "word" is NOT in trimwords — confirm.
trim_chart = notdf(freq_chart,inloc(freq_chart,"word",trimwords))

In [None]:
# Work on an independent deep copy so freq_chart itself is left untouched.
smash_chart = copy.deepcopy(freq_chart)
# Compress the frequency range by raising it to the 0.7 power.
# NOTE: this overwrites the column in place, so re-running the cell
# applies the power again.
smash_chart["frequency"] = smash_chart["frequency"]**0.7

In [None]:
def dumb_sim(word,comp="cat"):
    """Return the similarity between `word` and `comp`.

    Each string is passed through the module-level `nlp` pipeline and the
    two results are compared with `.similarity`.

    Caveat noted by the original author: proper nouns, etc., do not
    possess vectors.
    """
    word_doc = nlp(word)
    comp_doc = nlp(comp)
    return word_doc.similarity(comp_doc)
# Score every charted word against dumb_sim's default comparison word ("cat").
smash_chart["like_cat"] = smash_chart["word"].apply(dumb_sim)

In [None]:
# Same scoring as "like_cat", with comp="night" forwarded to dumb_sim by apply().
smash_chart["like_night"] = smash_chart["word"].apply(dumb_sim,comp="night")

In [None]:
# Draw the trimmed frequency chart and keep the return value in `chart`
# (it is stacked and plotted in later cells).
# NOTE(review): the meaning of the positional arguments and of
# wheel / heat / bump is defined by draw_freq_chart, which is not visible
# in this notebook — confirm against its definition.
chart = draw_freq_chart(
    trim_chart,
    2,
    'average_position',
    'frequency',
    palette_function = aqua,
    background = [0,0,0],
    wheel = 'stdev_position',
    heat = 'frequency',
    bump = 0
)

In [None]:
# First curve: sine of the frequency normalized by its maximum and scaled by 100*pi.
plt.plot(np.sin(smash_chart["frequency"]/np.max(smash_chart["frequency"])*100*math.pi))
# Second curve: the (already power-compressed) frequency column itself.
plt.plot(smash_chart["frequency"])

In [None]:
# Stack the value returned by draw_freq_chart row-wise into one array for inspection.
np.vstack(chart)

In [None]:
# Plot column 0 of the stacked chart array.
plt.plot(np.vstack(chart)[:,0])

In [None]:
# 100 evenly spaced values from 0 up to (but excluding) 1, in steps of 0.01.
axis = np.arange(0,1,0.01)
# Coordinate grids built from that axis in both dimensions (displayed, not stored).
np.meshgrid(axis,axis)

In [None]:
# Preview the `aqua` palette again, this time with display_palette's default arguments.
display_palette(aqua)

### proximity

In [None]:
# To make this more meaningful, we need a more robust concept of
# statistical distance. Finding a metric that is actually meaningful
# in this space is part of the broader question we are exploring.

def average_distance(text_positions,item_1,item_2,absolute=True):
    """Summarize how far occurrences of `item_2` sit from the nearest `item_1`.

    For each position of `item_2`, the signed offset to the closest
    occurrence of `item_1` is taken (negative when that occurrence has the
    smaller position). The previous implementation used ``np.min`` on the
    signed offsets, which always selected the smallest `item_1` position
    rather than the nearest one.

    Parameters
    ----------
    text_positions : mapping
        Maps each item to its positions. Values may be arrays or plain
        sequences of numbers; they are converted with ``np.asarray``.
    item_1 : key
        Item whose occurrences act as reference points.
    item_2 : key
        Item whose occurrences are measured against the reference points.
    absolute : bool, default True
        When True, the magnitude of the mean signed offset is returned.
        Offsets in opposite directions still cancel before the magnitude is
        taken. When False, the mean keeps its sign.

    Returns
    -------
    list
        ``[mean_offset, std_of_offsets]``. Both entries are NaN when either
        item has no recorded positions.

    Lookup errors raised by ``text_positions[...]`` propagate unchanged.
    """
    reference_positions = np.asarray(text_positions[item_1])
    probe_positions = np.asarray(text_positions[item_2])

    # Nothing to measure: report NaN instead of failing on an empty reduction.
    if reference_positions.size == 0 or probe_positions.size == 0:
        return [np.nan, np.nan]

    distances = []
    for pos in probe_positions:
        offsets = reference_positions - pos
        # Pick the offset with the smallest magnitude but keep its sign.
        # On a tie, np.argmin returns the first such entry in array order.
        distances.append(offsets[np.argmin(np.abs(offsets))])

    mean_offset = np.average(distances)
    if absolute:
        mean_offset = np.abs(mean_offset)
    return [mean_offset, np.std(distances)]

In [None]:
# Quick membership check: does the lemma 'janis' appear in the chart's word column?
'janis' in freq_chart['word'].values

In [None]:
def single_word_distance_frame(index_word,freq_chart,lemmalist,text_positions):
    """Build a table of distance statistics between `index_word` and every charted word.

    Parameters
    ----------
    index_word : key
        Word every other word is measured against; passed to
        `average_distance` as its first item.
    freq_chart : DataFrame
        Must provide 'word' and 'frequency' columns; only those are kept.
    lemmalist
        Unused. Kept so existing call sites keep working.
    text_positions : mapping
        Passed through to `average_distance`.

    Returns
    -------
    DataFrame
        Columns 'word', 'frequency', 'avg_dist', 'std_dist', sorted by
        descending frequency with a fresh 0..n-1 index.
    """
    word_frame = freq_chart.reindex(columns=['word','frequency'])
    distlist = [
        average_distance(
            text_positions,index_word,word
        ) for word in word_frame['word']
    ]
    # Attach the statistics positionally with assign(). The earlier version
    # wrapped them in Series with a fresh RangeIndex and used concat(axis=1),
    # which aligns on index labels and therefore paired statistics with the
    # wrong rows (and introduced NaN rows) whenever freq_chart did not carry
    # a default 0..n-1 index, e.g. after filtering.
    return word_frame.assign(
        avg_dist=[stat[0] for stat in distlist],
        std_dist=[stat[1] for stat in distlist],
    ).sort_values(
        by='frequency',ascending=False
    ).reset_index(drop=True)
       

In [None]:
# Take positions from `active_text`, the same parsed document that
# `lemmalist` and `freq_chart` were built from. The previous argument,
# `copperfield`, is not defined anywhere in this notebook and only worked
# when left over in the kernel from an earlier session.
# NOTE(review): assumes position_lemmas accepts the parsed document — confirm.
text_positions=position_lemmas(active_text)

In [None]:
# Distance table for the index word 'heart' (timed).
# NOTE(review): the variable is called `morning_frame` although the index
# word is 'heart' — presumably a leftover name from an earlier run.
%time morning_frame = single_word_distance_frame('heart',freq_chart,lemmalist,text_positions)

In [None]:
# Look up the row(s) for the word 'happiness' in the distance table.
morning_frame.loc[morning_frame['word']=='happiness']

In [None]:
# Chart the distance table, skipping its first ten rows. The frame is sorted
# by descending frequency, so those are the ten most frequent words.
# NOTE(review): the meaning of the positional arguments and of
# wheel / heat / bump is defined by draw_freq_chart, which is not visible here.
draw_freq_chart(
    morning_frame[10:],
    50,
    'avg_dist',
    'frequency',
    palette_function = purple,
    background = [0,0,0],
    wheel = 'avg_dist',
    heat = 'frequency',
    bump = 0.2
)

In [None]:
# Distance table for the index word 'rimbaud' (timed).
%time rimbaud_frame = single_word_distance_frame('rimbaud',freq_chart,lemmalist,text_positions)

In [None]:
# Distance table for the index word 'robert' (timed).
%time robert_frame = single_word_distance_frame('robert',freq_chart,lemmalist,text_positions)

In [None]:
# The 50 rows with the smallest avg_dist among words whose frequency exceeds 5.
# The boolean mask comes from the unsorted frame; .loc aligns it by index label.
robert_frame.sort_values(by="avg_dist").loc[robert_frame['frequency'] > 5][0:50]

In [None]:
# Inspect rows 50 through 99 of the frequency chart.
freq_chart[50:100]

In [None]:
# The 'rimbaud' distance table with the row for the word 'robert' removed.
trim_frame = rimbaud_frame.loc[rimbaud_frame['word'] != 'robert']

In [None]:
# Chart the trimmed 'rimbaud' table using the 'avg_dist' and 'std_dist' columns.
# NOTE(review): this call passes `colorfunc=`, while other draw_freq_chart
# calls in this notebook pass `palette_function=` — one of the two keywords
# is presumably stale; confirm against draw_freq_chart's signature.
draw_freq_chart(trim_frame,5,'avg_dist','std_dist',colorfunc=purple)

In [None]:
def two_way_comp(text_positions,item_1,item_2,test_item):
    """Measure how close `test_item` sits to each of two reference items.

    For every position of a reference item, the signed offset to the nearest
    occurrence of `test_item` is taken (negative when that occurrence has the
    smaller position); the offsets are then summarized per reference item.
    The previous implementation used ``np.min`` on the signed offsets, which
    always selected the smallest `test_item` position rather than the
    nearest one.

    Parameters
    ----------
    text_positions : mapping
        Maps each item to its positions. Values may be arrays or plain
        sequences of numbers; they are converted with ``np.asarray``.
    item_1, item_2 : keys
        The two reference items.
    test_item : key
        The item whose proximity to both references is measured.

    Returns
    -------
    list
        ``[mean_1, mean_2, std_1, std_2]`` where ``*_1`` refers to `item_1`
        and ``*_2`` to `item_2`. A mean/std pair is NaN when the reference
        item or `test_item` has no recorded positions.

    Lookup errors raised by ``text_positions[...]`` propagate unchanged.
    """
    test_positions = np.asarray(text_positions[test_item])

    def summarize(reference_item):
        # Mean and standard deviation of the nearest signed offsets for one
        # reference item.
        reference_positions = np.asarray(text_positions[reference_item])
        if test_positions.size == 0 or reference_positions.size == 0:
            return np.nan, np.nan
        nearest = []
        for pos in reference_positions:
            offsets = test_positions - pos
            # Smallest magnitude wins, sign is kept; on a tie np.argmin
            # returns the first such entry in array order.
            nearest.append(offsets[np.argmin(np.abs(offsets))])
        return np.average(nearest), np.std(nearest)

    mean_1, std_1 = summarize(item_1)
    mean_2, std_2 = summarize(item_2)
    return [mean_1, mean_2, std_1, std_2]

In [None]:
def naive_two_word_distance_frame(
    index_word_1,index_word_2,freq_chart,lemmalist,text_positions
):
    """Build a table comparing every charted word against two index words.

    Parameters
    ----------
    index_word_1, index_word_2 : keys
        The two reference words handed to `two_way_comp`.
    freq_chart : DataFrame
        Must provide 'word', 'frequency' and 'average_position' columns;
        only those are kept.
    lemmalist
        Unused. Kept so existing call sites keep working.
    text_positions : mapping
        Passed through to `two_way_comp`.

    Returns
    -------
    DataFrame
        The kept columns plus 'avg_dist_1', 'avg_dist_2', 'std_dist_1',
        'std_dist_2' and 'naive_3_way' (the square root of the summed
        squares of the two averages), sorted by ascending 'naive_3_way'
        with a fresh 0..n-1 index.
    """
    word_frame = freq_chart.reindex(columns=['word','frequency','average_position'])
    distlist = [
        two_way_comp(
            text_positions,index_word_1,index_word_2,word
        ) for word in word_frame['word']
    ]
    # Attach the statistics positionally with assign(). The earlier version
    # wrapped them in Series with a fresh RangeIndex and used concat(axis=1),
    # which aligns on index labels and therefore mismatched rows whenever
    # freq_chart did not carry a default 0..n-1 index.
    stats_frame = word_frame.assign(
        avg_dist_1=[stat[0] for stat in distlist],
        avg_dist_2=[stat[1] for stat in distlist],
        std_dist_1=[stat[2] for stat in distlist],
        std_dist_2=[stat[3] for stat in distlist],
    )
    stats_frame = stats_frame.assign(
        naive_3_way=np.sqrt(
            stats_frame["avg_dist_1"]**2 + stats_frame["avg_dist_2"]**2
        )
    )
    return stats_frame.sort_values(
        by='naive_3_way',ascending=True
    ).reset_index(drop=True)

In [None]:
# NOTE(review): the function takes five required positional arguments, so
# this argument-less call raises TypeError as written — presumably a leftover
# signature check. The real call is in the next cell.
naive_two_word_distance_frame()

In [None]:
# Two-word comparison table with 'robert' as index word 1 and 'rimbaud' as index word 2.
robert_rimbaud_frame = naive_two_word_distance_frame(
    'robert','rimbaud',freq_chart,lemmalist,text_positions
)

In [None]:

# Derived comparison columns, added to the frame in place.

# Euclidean norm of the two average distances (same formula as "naive_3_way").
robert_rimbaud_frame["r"] = (
    robert_rimbaud_frame["avg_dist_1"] ** 2 + robert_rimbaud_frame["avg_dist_2"] ** 2
) ** 0.5
# NOTE: despite the name this is the plain ratio of the two averages, not an angle.
robert_rimbaud_frame["theta"] = (
    robert_rimbaud_frame["avg_dist_1"] / robert_rimbaud_frame["avg_dist_2"])
robert_rimbaud_frame["diff"] = (
    robert_rimbaud_frame["avg_dist_1"] - robert_rimbaud_frame["avg_dist_2"] 
)

# Normalize by the length of `active_text`, the document the chart was built
# from. The previous divisor, len(kids), referred to a name that is not
# defined anywhere in this notebook.
# NOTE(review): assumes "average_position" is measured in units of
# len(active_text) — confirm against prep_freq_chart.
text_length = len(active_text)
robert_rimbaud_frame["pos_norm"] = (
    (robert_rimbaud_frame["average_position"] - text_length)/text_length
)

In [None]:
# Display the comparison table, including the derived columns.
robert_rimbaud_frame

In [None]:
# Chart the comparison table using the 'diff' and 'avg_dist_2' columns.
# NOTE(review): this call passes `colorfunc=`, while other draw_freq_chart
# calls in this notebook pass `palette_function=` — confirm which keyword
# the current draw_freq_chart signature accepts.
draw_freq_chart(robert_rimbaud_frame,15,'diff','avg_dist_2',colorfunc=purple)

## chart comparisons

In [None]:
def verbose_compare(token_1,token_2):
    """Compare two tokens and return the score together with both surface forms.

    Returns a dict with keys 'similarity' (the value of
    ``token_1.similarity(token_2)``), 'word_1' (``token_1.text``) and
    'word_2' (``token_2.text``), in that order.
    """
    score = token_1.similarity(token_2)
    return dict(similarity=score, word_1=token_1.text, word_2=token_2.text)

In [None]:
# Compare 2000 randomly drawn token pairs and rank them from most to least similar.
# NOTE(review): `tokenlist` is not defined in the visible part of this notebook,
# and the random module is not seeded, so the sample differs on every run.
simlist = [
    verbose_compare(random.choice(tokenlist),random.choice(tokenlist)) 
    for _ in range(2000)
]
# Sort with an explicit key function rather than `itemgetter`, which is not
# among this notebook's visible imports (only and_, not_ and or_ are taken
# from `operator`).
simlist = sorted(simlist, key=lambda item: item['similarity'], reverse=True)

In [None]:
# For each probe word, compare it with every token in `tokenlist`, rank the
# comparisons from most to least similar, and draw a 50-bin histogram of the
# scores. The two probes used to be copy-pasted stanzas; they now share one
# loop. `word`, `simlist`, `fig` and `ax` still end up holding the values
# for the last probe ('cat'), as before.
# NOTE(review): `tokenlist` is not defined in the visible part of this notebook.
for probe_word in ('dog', 'cat'):
    word = nlp(probe_word)
    # Explicit key function instead of `itemgetter`, which is not among this
    # notebook's visible imports.
    simlist = sorted(
        (verbose_compare(word, token) for token in tokenlist),
        key=lambda item: item['similarity'],
        reverse=True,
    )
    fig, ax = plt.subplots()
    ax.hist([item['similarity'] for item in simlist], bins=50)
    # Label each figure so the two histograms can be told apart.
    ax.set_title("similarity to '" + probe_word + "'")

In [None]:
from IPython.display import clear_output
import timeit

In [None]:
# Comparing every token with every other token is expensive: the author
# estimated 25-27 hours for the whole novel on their laptop.
# Ideas noted at the time: parallelize the work, or restrict it to the
# most frequent words.

i = 0
simdict = {}

start = timeit.default_timer()
run_size = len(tokenset)

for i, token in enumerate(tokenset, start=1):
    # Progress line in the form "<done>/<total>:<token text>".
    print(f"{i}/{run_size}:{token.text}")

    simdict[token] = [verbose_compare(token, other) for other in tokenset]

    # Elapsed time so far, and a linear extrapolation to the full run.
    cur_time = timeit.default_timer() - start
    print('current runtime: ', cur_time)
    print('expected runtime: ', cur_time * (run_size / i))

    # Stop after the first 50 tokens.
    if i == 50:
        break
    

# all_similar = [
#     [verbose_compare(word,token) for token in tokenset] 
#     for word in tokenset
# ]

In [None]:
# Inspect the collected results: one list of comparison dicts per processed token.
simdict

In [None]:
# TODO: compute a list of tokens (or lemmas?) ranked by frequency,
# and use it to weight the similarity calculation.

# Alternatively, we could go back to the idea of a frequency / recency space.

## section for exploring k-means

In [None]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
from sklearn.datasets import make_blobs
# Generate 300 synthetic samples around 4 centers (fixed random_state, so the
# data is reproducible) as a playground for k-means.
X, y_true = make_blobs(n_samples=300, centers=4,
                       cluster_std=0.60, random_state=0)
# Scatter the first two feature columns; the trailing ';' suppresses the repr output.
plt.scatter(X[:, 0], X[:, 1], s=50);