In [1]:
import random
from operator import and_, not_, or_
import math
import copy

from jupyter_plotly_dash import JupyterDash
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import LEMMA
import en_core_web_lg
import plotly.graph_objs as go


from freqer import *
from palettes import *
from qualmath import *
from miketools import *

# %matplotlib qt

#spacy.require_gpu()

In [2]:
# Switch spaCy to GPU execution before the model is loaded in the next cell.
# NOTE(review): spaCy documents require_gpu() as raising when no GPU is usable;
# confirm, and comment this line out on CPU-only machines.
spacy.require_gpu()

True

In [3]:
# Load the large English pipeline once; %time reports how long the load takes.
%time nlp=en_core_web_lg.load()

Wall time: 12.2 s


In [4]:
# Placeholder so the name exists before any text is loaded; the loading cell
# below rebinds it.
active_text = None

In [5]:
# warning: make sure to have lots of RAM / GPU memory
# if you're going to feed it something large.

textfile = 'nightwood.txt'
active_text = open(textfile, encoding='utf-8').read()
# if len(active_text) > 1000000:
#      nlp.max_length = 3000000
%time active_text = nlp(active_text)

Wall time: 4.99 s


In [178]:
interest = partial(evaluate_interest, boring_intensity = 1) 
%time interesting_tokens = list(filter(interest,active_text))
%time lemmalist = lemmatize(interesting_tokens)
%time freq_chart = prep_freq_chart(active_text,lemmalist)
# trimwords = ['say','go','come','mr.','mrs.','miss']
# trim_chart = notdf(freq_chart,inloc(freq_chart,"word",trimwords))

Wall time: 220 ms
Wall time: 20.9 ms
Wall time: 4 s


In [179]:
# One row per interesting token: its index within the filtered token list,
# lower-cased lemma, part-of-speech tag and sentiment attribute.
tokenframe = pd.DataFrame([
    dict(
        position=index,
        word=tok.lemma_.lower(),
        pos=tok.pos_,
        sentiment=tok.sentiment,
    )
    for index, tok in enumerate(interesting_tokens)
])

In [180]:
def dashrgba(rgba):
    """Format an RGBA sequence as an ``rgba(...)`` color string.

    The first three entries are multiplied by 255; the fourth (alpha) is
    passed through unchanged. Only indices 0-3 of ``rgba`` are read.
    """
    red, green, blue = (rgba[channel] * 255 for channel in range(3))
    scaled = (red, green, blue, rgba[3])
    # Formatting a tuple with an empty spec yields str(tuple), i.e. "(r, g, b, a)".
    return f"rgba{scaled}"

In [181]:
def jitarray(mean, intensity, length):
    """Return a NumPy array of ``length`` random offsets.

    Each entry is ``mean * u * intensity - intensity / 2`` where ``u`` is a
    fresh ``random.random()`` draw.
    """
    offsets = []
    for _ in range(length):
        draw = random.random()
        offsets.append(mean * draw * intensity - intensity / 2)
    return np.array(offsets)

In [182]:
def randrgb(alpha):
    """Return ``[r, g, b, alpha]`` with r, g and b each drawn from ``random.random()``."""
    color = []
    for _ in range(3):
        color.append(random.random())
    color.append(alpha)
    return color

In [183]:
# insane scatter: every interesting token drawn as text, one trace per
# part-of-speech tag, each trace in its own random half-transparent color.

chart = tokenframe

jitmean = chart["position"].mean()

fig = go.Figure()
for pos in chart["pos"].unique():
    poschart = eqloc(chart, "pos", pos)

    # Keyword order matters: the jitter for y is drawn before the trace color,
    # so the random stream is consumed in a fixed order.
    fig.add_trace(
        go.Scattergl(
            x=poschart["position"],
            y=poschart["position"] + jitarray(jitmean, 10, len(poschart)),
            text=poschart["word"],
            mode="text",
            textfont=dict(size=18, color=dashrgba(randrgb(0.5))),
            name=pos,
        )
    )
# Black canvas with both axes hidden; as the cell's last expression the
# returned figure is also displayed.
fig.update_layout(
    paper_bgcolor='black',
    plot_bgcolor='black',
    xaxis=dict(visible=False),
    yaxis=dict(visible=False),
)

2


2

In [184]:
# Wrap the current figure in a Dash app whose graph fills the viewport height.
app = JupyterDash(__name__)
app.layout = html.Div(children=[
    dcc.Graph(
        id = 'scatter',
        figure = fig,
        style={'height':'100vh'}
        ),
    ],
    # Dash style keys are camelCased; the former all-lowercase
    # 'backgroundcolor' is not a CSS property and was ignored.
    style = {'backgroundColor':'black'}
)

In [185]:
# Bare expression as the cell's last line: asks Jupyter to display the Dash app.
app

In [None]:
# 'normal' scatter: each word drawn as text at (average position, frequency)
# and tinted by its frequency.
# NOTE(review): geloc presumably keeps rows whose "frequency" is >= 10 -- confirm.
chart = geloc(freq_chart,"frequency",10)
fig = go.Figure()

# Peak frequency is computed once; it used to be recomputed with max() for
# every row. default=0 keeps an empty chart from raising.
peak_frequency = max(chart["frequency"], default=0)
colormap = [
    dashrgba(purple(heat=row["frequency"], peak_heat=peak_frequency))
    for row in rows(chart)
]

# could add, like, POS distinctions to this
# or traces with character groups

fig.add_trace(go.Scattergl(
    x = chart["average_position"],
    y = chart["frequency"],
    text = chart["word"],
    marker = {
        'color':colormap
    },
    mode = "text",
    textfont = {
        'size':18,
        'color':colormap
        },

    ))
# Black canvas with both axes hidden.
fig.update_layout({
    'paper_bgcolor':'black','plot_bgcolor':'black','xaxis':{'visible':False},'yaxis':{'visible':False}
})
app = JupyterDash(__name__)
app.layout = html.Div(children=[
    dcc.Graph(
        id = 'scatter',
        figure = fig,
        style = {
            'height':'100vh'
            }
        ),
    ],
    # Dash style keys are camelCased; the former all-lowercase
    # 'backgroundcolor' is not a CSS property and was ignored.
    style = {'backgroundColor':'black'}
)
app

In [None]:
# Drop the notebook's reference to the current Dash app.
# NOTE(review): presumably meant to release the previous app before building
# a new one -- confirm.
app=None

In [None]:
# Violin plots of where a handful of randomly chosen words occur in the text.
colors = ["blue", "red", "green", "purple", "teal"]
yattr = "position"
xattr = "word"

# Materialize the candidates as a plain list first: the random module indexes
# its population by integer offset, which on a filtered pandas Series is a
# label lookup and can fail.
candidates = list(geloc(freq_chart,"frequency",5)[xattr])
# Sample without replacement so no word is drawn twice, and never ask for
# more words than there are colors or candidates.
words = random.sample(candidates, k=min(len(colors), len(candidates)))

fig = go.Figure()
# reversed() reproduces the order colors.pop() used to hand out, without
# mutating the palette.
for word, color in zip(words, reversed(colors)):
    selected = eqloc(tokenframe, xattr, word)
    fig.add_trace(
        go.Violin(
            y=selected[yattr],
            points=False,
            line = {
                'color':color
            },
            marker = {
                'color':'red'
            },
            name = word,
            spanmode='hard'
        )
    )
app = JupyterDash(__name__)
app.layout = html.Div(children=[
    dcc.Graph(
        id='violins',
        figure=fig
        )
    ])
app

In [239]:
%%time
from itertools import chain
# return tokens along with parent and all its family members who are 'interesting'
# reapply 'interesting' function rather than searching with in_me / contains, because it is expensive
# and increasingly so (geometrically?) with text length
# we might want to modify this with separate interest functions if we, like, want to filter pos separately

families = [
    [token,token.head,list(filter(interest,chain(token.head.children,[token.head])))]
     for token in interesting_tokens
]

Wall time: 601 ms


In [240]:
# Flatten each [token, head, relatives] family into one row: lower-cased
# lemmas of the token and its head, the lemmatized relatives, and the
# token's sentence.
familyframe = pd.DataFrame([
    dict(
        word=member.lemma_.lower(),
        parent=head.lemma_.lower(),
        siblings=lemmatize(relatives),
        sentence=member.sent,
    )
    for member, head, relatives in families
])

In [241]:
# Display the family table (the notebook shows a truncated preview).
familyframe

Unnamed: 0,word,parent,siblings,sentence
0,chapter,one,[chapter],"(CHAPTER, ONE, \n\n)"
1,bow,bow,[bow],"(Bow, Down, \n\n)"
2,early,stamp,"[early, lie, valance, stamp]","(Early, in, 1880, ,, in, spite, of, a, well, -..."
3,spite,in,[spite],"(Early, in, 1880, ,, in, spite, of, a, well, -..."
4,found,suspicion,"[found, suspicion]","(Early, in, 1880, ,, in, spite, of, a, well, -..."
...,...,...,...,...
19831,eye,bloodshot,"[eye, flat, bloodshot]","(He, ran, this, way, and, that, ,, low, down, ..."
19832,bloodshot,lie,"[bloodshot, lie]","(He, ran, this, way, and, that, ,, low, down, ..."
19833,head,flat,"[head, flat]","(He, ran, this, way, and, that, ,, low, down, ..."
19834,flat,bloodshot,"[eye, flat, bloodshot]","(He, ran, this, way, and, that, ,, low, down, ..."


In [259]:
# Frequency table of the lemmas that share a family with one chosen word,
# leaving out the word itself.
this_word = "eye"
make_freq_frame(
    list(filter(
        lambda member: member != this_word,
        eqloc(familyframe, "word", this_word)['siblings'].explode(),
    ))
)

Unnamed: 0,word,frequency
0,turn,5
1,close,4
2,fix,4
3,open,4
4,see,3
...,...,...
82,flow,1
83,flitter,1
84,flat,1
85,find,1


In [264]:
# Map every distinct word to a frequency table of the lemmas found in its
# families, with the word itself removed.
neighbor_dict = {}
for word in familyframe["word"].unique():
    neighbor_frame = make_freq_frame(
        list(filter(
            lambda member: member != word,
            eqloc(familyframe, "word", word)['siblings'].explode(),
        ))
    )
    neighbor_dict[word] = neighbor_frame

In [266]:
# Number of distinct words that received a neighbor frequency table.
len(neighbor_dict)

4827

In [275]:
def neighbor_frequencies(neighbor_frame, freq_chart):
    """Return whole-text frequencies for each lemma among a word's neighbors.

    Args:
        neighbor_frame: table whose "word" column lists the neighbor lemmas.
        freq_chart: table with "word" and "frequency" columns to look each
            lemma up in.

    Returns:
        A list of floats, one per entry of ``neighbor_frame["word"]``, in the
        same order.

    Raises:
        ValueError: if a lemma does not match exactly one row of
            ``freq_chart`` (raised by ``.item()``).
    """
    # Extract the scalar with .item() before converting: calling float() on a
    # one-element pandas Series is deprecated. .item() still insists on exactly
    # one matching row.
    # NOTE(review): assumes eqloc(...)["frequency"] is a pandas Series -- confirm.
    return [
        float(eqloc(freq_chart, "word", word)["frequency"].item())
        for word in neighbor_frame["word"]
    ]

In [290]:
# Neighbor scatter: one trace per sufficiently frequent word, drawing each of
# its neighbors as text at (count within this word's families, frequency in
# the whole text), each trace in its own random half-transparent color.

fig = go.Figure()

minimum_frequency = 10
trimmed_chart = geloc(freq_chart,"frequency",minimum_frequency)

# Lighter alternatives to plotting every word:
#for word in random.choices(trimmed_chart["word"],k=4):
#for word in ["doctor"]:
for word in trimmed_chart["word"]:
    neighbor_frame = neighbor_dict[word]
    fig.add_trace(go.Scattergl(
        x = neighbor_frame["frequency"],
        y = neighbor_frequencies(neighbor_frame,freq_chart),
        text = neighbor_frame["word"],
        mode = "text",
        textfont = {
            'size':18,
            'color':dashrgba(randrgb(0.5))
            },
        name = word
        ))
# The layout does not depend on the trace, so it is applied once here instead
# of once per loop iteration.
fig.update_layout({
    'paper_bgcolor':'black','plot_bgcolor':'black','xaxis':{'visible':False},'yaxis':{'visible':False}
})
app = JupyterDash(__name__)
app.layout = html.Div(children=[
    dcc.Graph(
        id = 'scatter',
        figure = fig,
        style = {
            'height':'100vh'
            }
        ),
    ],
    # Dash style keys are camelCased; the former all-lowercase
    # 'backgroundcolor' is not a CSS property and was ignored.
    style = {'backgroundColor':'black'}
)
app