In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import spacy
import textacy
import gc
from glob import glob
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline
plt.rcParams['figure.figsize'] = (24,12)
plt.style.use('ggplot')

In [None]:
# Load spaCy's 'en' pipeline.
# NOTE(review): `nlp` is not referenced again in the visible cells -- confirm
# whether this (potentially slow) load is still required.
nlp = spacy.load('en')

In [None]:
# Gather every file under the local `text/` directory and read each one fully.
filenames = glob('text/*')
texts = []
for filename in filenames:
    # A context manager closes each handle as soon as it has been read,
    # instead of leaving open files for the garbage collector to clean up.
    with open(filename) as handle:
        texts.append(handle.read())
# One metadata record per text, in the same order as `texts`, so each document
# can later be traced back to its source file.
metadata = [{'filename': fn} for fn in filenames]

In [None]:
# Build a textacy Corpus from the raw texts, attaching the per-file metadata
# dicts (filename) so each document remembers where it came from.
james = textacy.Corpus('en', texts, metadatas=metadata)

In [None]:
# Persist the corpus to the current directory under the name 'james'
# (gzip-compressed) so later sessions can load it instead of rebuilding it.
james.save('.', 'james', compression='gzip')

In [2]:
# Reload the corpus previously saved as 'james' in the current directory.
# This replaces the build/save cells above on subsequent runs.
james = textacy.Corpus.load('.', 'james', compression='gzip')

In [5]:
# Flat list of sentence records for the whole corpus. Each record is a tuple
# (sentence object, len(sentence), filename of the source document).
allSents = []
for doc in james:
    allSents.extend(
        (sent, len(sent), doc.metadata['filename'])
        for sent in doc.sents
    )

In [6]:
def splitSents(n, sents=None):
    """Partition sentence records into those shorter than `n` and the rest.

    Parameters
    ----------
    n : int
        Length threshold, compared against element 1 of each record.
    sents : iterable of records, optional
        Records to partition; each must be indexable, with its length stored
        at index 1. Defaults to the notebook-level `allSents` list, so existing
        `splitSents(n)` calls keep working unchanged.

    Returns
    -------
    tuple of (list, list)
        First the records whose length is < n, then the records whose length
        is >= n. Input order is preserved within each list.
    """
    if sents is None:
        # Fall back to the notebook-global list built from the corpus.
        sents = allSents
    a, b = [], []
    for sent in sents:
        # The two cases are complementary, so one comparison decides the bucket.
        if sent[1] < n:
            a.append(sent)
        else:
            b.append(sent)
    return a, b

def countWords(sentList):
    """Return the combined length of all sentence records in `sentList`.

    Each record is expected to store its length at index 1; the result is the
    sum of those values (0 for an empty list).
    """
    total = 0
    for record in sentList:
        total = total + record[1]
    return total

In [7]:
# Split all sentence records at a length threshold of 100 (the stored
# len(sent) value): `short` gets lengths < 100, `long` gets lengths >= 100.
# NOTE(review): 100 is a magic number -- consider a named constant.
short, long = splitSents(100)


In [8]:
countWords(short), countWords(long)

(3252491, 105658)

In [9]:
def randomSent(sents):
    """Return one element of `sents` chosen uniformly at random.

    Parameters
    ----------
    sents : sequence
        Non-empty, integer-indexable sequence to draw from.

    Returns
    -------
    The randomly selected element of `sents`.

    Raises
    ------
    ValueError
        Raised by `np.random.choice` when `sents` is empty.
    """
    # Draw the index from the sequence actually passed in. The previous version
    # used len(short), a notebook global, which is only correct when
    # `sents is short` and otherwise raises IndexError or skips elements.
    return sents[np.random.choice(len(sents))]

In [10]:
# Randomly sample (with replacement) from the short sentences until the
# sample reaches the same word count as the long sentences.
# The target is computed once and the sample's word count is kept as a running
# total, rather than re-summing both lists on every iteration.
targetWords = countWords(long)
shortSample = [randomSent(short)]
print(shortSample)
sampledWords = countWords(shortSample)
while sampledWords < targetWords:
    # '\r' rewrites the same output line, giving a simple progress counter.
    print('\r', sampledWords, end='')
    pick = randomSent(short)
    shortSample.append(pick)
    sampledWords += pick[1]  # index 1 of a record is its length

[(The resentment was rather Delia’s, but she kept it to herself, for
she was capable of reflecting with complacency that the key of the
house would after all be hers, so that she could open the door for the
Proberts if the Proberts should knock., 53, 'text/1888-reverberator.txt')]
 10565637953537087970

In [13]:
def makeSoup(sentList):
    """Join the text of every sentence record into one space-separated string.

    Each record's first element must expose a `.string` attribute; those
    strings are concatenated in order with a single space between them.
    """
    pieces = []
    for record in sentList:
        sentence = record[0]
        pieces.append(sentence.string)
    return " ".join(pieces)

In [None]:
# Try to free up some memory by dropping the notebook's reference to the
# corpus and forcing a garbage-collection pass.
# NOTE(review): `allSents`, `short` and `long` still hold sentence objects taken
# from the corpus, so presumably much of it stays reachable -- confirm how much
# this actually releases. After this cell `james` is an empty string, so any
# cell that iterates the corpus must not be re-run without reloading it.
james = ""
gc.collect()

In [11]:
# From here on `short` means the length-matched random sample, not the full
# list of short sentences.
# NOTE(review): after this rebinding, re-running the sampling cell would draw
# from the sample itself (it calls randomSent(short)); re-run the splitSents
# cell first, or use a distinct name here.
short = shortSample

In [14]:
# Flatten each group of sentence records into a single space-joined string.
shortSoup = makeSoup(short)
longSoup = makeSoup(long)

In [15]:
# Wrap each joined string in a textacy.Doc so the two groups can be compared
# as single documents.
# NOTE(review): no language is passed here, unlike the Corpus('en', ...) call
# earlier -- presumably textacy infers it; confirm.
shortDoc = textacy.Doc(shortSoup)
longDoc = textacy.Doc(longSoup)

In [None]:
def bag(doc):
    """Return the document's bag of terms as a pandas Series.

    Calls `doc.to_bag_of_terms` with lemmatization on, string keys,
    'freq' weighting and punctuation filtered out, then wraps the resulting
    mapping in a Series.
    """
    termWeights = doc.to_bag_of_terms(
        lemmatize=True,
        as_strings=True,
        weighting='freq',
        filter_punct=True,
    )
    return pd.Series(termWeights)

In [None]:
# Stack the two term-weight Series into a two-row frame labelled 'short' and
# 'long', with one column per term.
df = pd.DataFrame([bag(shortDoc), bag(longDoc)], index=['short', 'long'])

In [None]:
# Replace missing values with 0 so that a term present in only one group
# still produces a numeric difference below.
df = df.fillna(0)

In [None]:
# Per-term weight difference (short minus long), sorted ascending: the most
# negative values are terms weighted more heavily in the long sentences.
trial3 = (df.loc['short'] - df.loc['long']).sort_values()

In [None]:
# Per-term weight difference (short minus long), sorted ascending.
trial1 = (df.loc['short'] - df.loc['long']).sort_values()

In [None]:
# Per-term weight difference (short minus long), sorted ascending.
trial2 = (df.loc['short'] - df.loc['long']).sort_values()

In [None]:
# Collect the three trial results as rows so they can be averaged per term.
# NOTE(review): trial1/trial2/trial3 are produced by the same expression over
# `df`, so they only differ if the random-sampling cells are re-run between
# them -- that manual workflow is not captured by the cell order; consider a
# loop over a sampling function instead.
trials = pd.DataFrame([trial1, trial2, trial3])

In [None]:
trials.mean().sort_values()

In [None]:
def POSRepresentation(text):
    """Return the `tag_` of every token in `text.spacy_doc` as a pandas Series.

    The Series keeps token order, with one entry per token.
    """
    tags = []
    for token in text.spacy_doc:
        tags.append(token.tag_)
    return pd.Series(tags)

In [None]:
# Token-by-token tag sequences for the two documents.
shortPOS = POSRepresentation(shortDoc)
longPOS = POSRepresentation(longDoc)

In [None]:
# Raw occurrence count of each tag per group: one row per group ('short',
# 'long'), one column per tag.
POSs = pd.DataFrame([shortPOS.value_counts(), longPOS.value_counts()], index=['short', 'long'])

In [None]:
POSs

In [None]:
# Bar chart of the per-tag count difference (short minus long); bars above
# zero are tags that occur more often in the short-sentence sample.
# NOTE(review): unlike `df`, `POSs` is not passed through fillna(0), so a tag
# present in only one group presumably yields NaN here -- confirm.
(POSs.loc['short'] - POSs.loc['long']).plot(kind='bar')

In [None]:
POSs.columns[6]

In [None]:
# Take the first sentence object from each group for ad-hoc inspection.
s = short[0][0]
l = long[0][0]

In [None]:
s.start, l.start

In [None]:
# `.start` of every sentence object in each group.
# NOTE(review): presumably the token offset at which the sentence begins
# within its source document -- confirm against the spaCy Span API.
shortStarts = [s[0].start for s in short]
longStarts = [s[0].start for s in long]

In [None]:
# Bin edges (element 1 of np.histogram's return value) computed over both
# groups' start positions combined, so both could share one set of bins.
# NOTE(review): `bins` is not used in any later visible cell.
bins = np.histogram(shortStarts + longStarts)[1]

In [None]:
pd.Series(shortStarts).median()

In [None]:
pd.Series(longStarts).median()

In [None]:
pd.Series(shortStarts).describe(), pd.Series(longStarts).describe()

# Probabilities

In [None]:
len(short)

In [None]:
# Flatten the `.prob` attribute of every token of every sentence, per group.
# NOTE(review): presumably spaCy's smoothed log-probability of the token's
# word type -- confirm for the installed spaCy version and model.
shortProbs = [w.prob for sent in short for w in sent[0]]
longProbs = [w.prob for sent in long for w in sent[0]]

In [None]:
pd.Series(shortProbs).describe()

In [None]:
pd.Series(longProbs).describe()

In [None]:
shortProbs[:10]

In [None]:
longProbs[:10]

In [16]:
shortDoc.spacy_doc.similarity(longDoc.spacy_doc)

0.99315724181061438