In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import spacy
import textacy
from glob import glob
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline
# Notebook-wide plotting defaults: figure size given as (width, height).
plt.rcParams['figure.figsize'] = (20,8)
# Use matplotlib's 'ggplot' style sheet for every figure drawn after this point.
plt.style.use('ggplot')

In [2]:
# Load a spaCy pipeline through the 'en' shortcut name.
# NOTE(review): `nlp` is not referenced by any other cell visible here
# (the corpus below is built by textacy from the string 'en') — confirm it is needed.
nlp = spacy.load('en')

In [3]:
# Gather the corpus files. glob() makes no promise about the order of its
# results, so sort them to keep the document order (and anything derived from
# it) identical from one run to the next.
filenames = sorted(glob('text/*'))

# Read each file inside a `with` block so its handle is closed immediately
# rather than left open until the garbage collector gets to it.
texts = []
for fn in filenames:
    with open(fn) as fh:
        texts.append(fh.read())

# One metadata dict per text, in the same order, recording the source path.
metadata = [{'filename': fn} for fn in filenames]

In [4]:
# Build the textacy Corpus from the raw texts; `metadata` supplies one dict per
# text (its 'filename' entry is read back later through doc.metadata).
# NOTE(review): the first argument 'en' presumably selects the English
# language pipeline — confirm against the installed textacy version.
james = textacy.Corpus('en', texts, metadatas=metadata)

In [5]:
# Flatten the corpus into one list of sentence records:
#   (sentence object, len(sentence), filename of the document it came from)
allSents = [
    (sentence, len(sentence), document.metadata['filename'])
    for document in james
    for sentence in document.sents
]

In [9]:
def splitSents(n, sentList=None):
    """Partition sentence records into those shorter than ``n`` and the rest.

    Parameters
    ----------
    n : number
        Length threshold, compared against the length stored at index 1 of
        each record.
    sentList : iterable of records, optional
        Records to partition; each must be indexable with its length at
        index 1. When omitted, the module-level ``allSents`` list is used,
        which is what the original one-argument call did.

    Returns
    -------
    tuple of (list, list)
        ``(shorter, longer)``: records whose length is ``< n`` and records
        whose length is ``>= n``, each in their original order.
    """
    if sentList is None:
        sentList = allSents
    shorter, longer = [], []
    for record in sentList:
        # if/else guarantees every record lands in exactly one of the lists.
        if record[1] < n:
            shorter.append(record)
        else:
            longer.append(record)
    return shorter, longer

def countWords(sentList):
    """Add up the length field (index 1) of every record in ``sentList``.

    Returns 0 for an empty list.
    """
    total = 0
    for record in sentList:
        total = total + record[1]
    return total

In [10]:
# Split at a length of 34: records with length < 34 go to `short`, the rest to
# `long`. The totals shown by the next cell (1711915 vs 1646234) indicate this
# cutoff divides the material into two roughly equal halves.
# NOTE(review): the length is len(sent) — presumably a token count; confirm.
short, long = splitSents(34)

In [11]:
# Summed sentence lengths of each group, displayed as a (short, long) tuple.
countWords(short), countWords(long)

(1711915, 1646234)

In [12]:
def makeSoup(sentList):
    """Merge the text of all sentence records into a single string.

    Each record's first element must expose a ``.string`` attribute; those
    strings are joined with one space between consecutive sentences.
    An empty ``sentList`` gives an empty string.
    """
    return " ".join(record[0].string for record in sentList)

In [13]:
# Collapse each group of sentence records into one long string.
shortSoup = makeSoup(short)
longSoup = makeSoup(long)

In [14]:
# Wrap each concatenated string in a textacy Doc so bag-of-terms counts can be
# taken from it.
# NOTE(review): no language is passed here, unlike the Corpus('en', ...) call —
# presumably textacy falls back to its own default/detection; confirm.
shortDoc = textacy.Doc(shortSoup)
longDoc = textacy.Doc(longSoup)

In [17]:
def bag(doc):
    """Return ``doc``'s bag of terms as a pandas Series.

    Calls ``doc.to_bag_of_terms`` asking for lemmatized terms, string keys,
    'freq' weighting and punctuation filtered out, then wraps the resulting
    mapping in a Series.
    """
    termWeights = doc.to_bag_of_terms(
        lemmatize=True,
        as_strings=True,
        weighting='freq',
        filter_punct=True,
    )
    return pd.Series(termWeights)

In [18]:
# Two-row frame: one row per group ('short', 'long'), one column per term.
# Terms that occur in only one of the two Series are left as NaN in the other row.
df = pd.DataFrame([bag(shortDoc), bag(longDoc)], index=['short', 'long'])

In [20]:
# Replace the NaN entries (terms absent from one group) with 0 so that the
# row subtraction in the next cell yields a number for every term.
df = df.fillna(0)

In [22]:
# Short minus long, sorted ascending: the most negative values are terms whose
# weight is larger in the 'long' row, the most positive ones larger in 'short'.
(df.loc['short'] - df.loc['long']).sort_values()

's         -0.001270
           -0.000883
hour       -0.000480
hyacinth   -0.000451
time       -0.000396
verena     -0.000395
place      -0.000377
sense      -0.000371
day        -0.000344
present    -0.000326
fact       -0.000316
young      -0.000307
air        -0.000293
person     -0.000266
small      -0.000264
olive      -0.000258
feel       -0.000253
occasion   -0.000251
manner     -0.000251
find       -0.000246
high       -0.000240
room       -0.000226
face       -0.000222
one        -0.000216
evening    -0.000216
take       -0.000213
lady       -0.000211
new        -0.000206
first      -0.000199
effect     -0.000194
              ...   
n’t         0.000377
ralph       0.000412
dear        0.000446
come        0.000452
look        0.000457
have        0.000512
rowland     0.000517
go          0.000546
’re         0.000577
mr.         0.000585
like        0.000588
good        0.000597
yes         0.000619
shall       0.000648
ah          0.000680
isabel      0.000722
want        0

# Use Scikit-Learn Instead

In [148]:
# Let's try this a different way.
# Rebuild the two concatenated strings from the `short` / `long` record lists
# (the same calls as the earlier makeSoup cell) so this section can start from
# plain text instead of textacy Docs.
shortSoup = makeSoup(short)
longSoup = makeSoup(long)

In [149]:
# The two "documents" to vectorize; position 0 is the short-sentence text and
# position 1 the long-sentence text, which is what df.loc[0] / df.loc[1] refer
# to in the cells that follow.
docs = [shortSoup, longSoup]

In [170]:
# Term-frequency comparison: use_idf=False switches off the idf re-weighting.
# NOTE(review): no stop-word list is passed, which is presumably why function
# words ("the", "of", ...) dominate the extremes of the output — confirm that
# this is intended.
tfidf = TfidfVectorizer(use_idf=False)
docTermMatrix = tfidf.fit_transform(docs)
# toarray() gives a plain ndarray; todense() would hand back an np.matrix.
docTermMatrixDense = docTermMatrix.toarray()
# Newer scikit-learn releases dropped get_feature_names() in favour of
# get_feature_names_out(); use whichever one this installation provides.
getFeatureNames = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
labels = getFeatureNames()
# Row 0 is the short-sentence text, row 1 the long-sentence text.
df = pd.DataFrame(docTermMatrixDense, columns=labels)
# Short minus long
(df.loc[0] - df.loc[1]).sort_values()

the       -0.129872
of        -0.112184
had       -0.053067
and       -0.046326
in        -0.042988
which     -0.034696
as        -0.026812
was       -0.016450
by        -0.014569
his       -0.013315
their     -0.011851
from      -0.010602
or        -0.008986
might     -0.007952
who       -0.007788
with      -0.006969
been      -0.006604
her       -0.006558
its       -0.005766
that      -0.005126
an        -0.005100
though    -0.005060
herself   -0.004825
even      -0.004766
sense     -0.003743
on        -0.003727
having    -0.003625
himself   -0.003499
air       -0.003144
while     -0.002904
             ...   
well       0.017572
be         0.018325
not        0.019449
see        0.019449
think      0.019785
oh         0.019826
ve         0.020195
will       0.020709
to         0.021445
has        0.022112
we         0.023487
have       0.023658
very       0.023785
can        0.024530
are        0.024942
then       0.026887
your       0.028029
know       0.028298
my         0.034375


In [171]:
# Let's try it again with Tf-IDF
# NOTE(review): there are only two documents, so every term that occurs in both
# receives the same idf factor; the displayed output is almost identical to the
# use_idf=False run for that reason. A finer document split would be needed for
# idf to matter.
tfidf = TfidfVectorizer(use_idf=True)
docTermMatrix = tfidf.fit_transform(docs)
# toarray() gives a plain ndarray; todense() would hand back an np.matrix.
docTermMatrixDense = docTermMatrix.toarray()
# Newer scikit-learn releases dropped get_feature_names() in favour of
# get_feature_names_out(); use whichever one this installation provides.
getFeatureNames = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
labels = getFeatureNames()
# Row 0 is the short-sentence text, row 1 the long-sentence text.
df = pd.DataFrame(docTermMatrixDense, columns=labels)
# Short minus long
(df.loc[0] - df.loc[1]).sort_values()

the       -0.129872
of        -0.112183
had       -0.053067
and       -0.046325
in        -0.042988
which     -0.034696
as        -0.026812
was       -0.016450
by        -0.014569
his       -0.013315
their     -0.011851
from      -0.010602
or        -0.008986
might     -0.007952
who       -0.007788
with      -0.006969
been      -0.006604
her       -0.006558
its       -0.005766
that      -0.005126
an        -0.005100
though    -0.005060
herself   -0.004825
even      -0.004766
sense     -0.003743
on        -0.003726
having    -0.003625
himself   -0.003499
air       -0.003144
while     -0.002904
             ...   
well       0.017572
be         0.018325
not        0.019449
see        0.019449
think      0.019785
oh         0.019826
ve         0.020195
will       0.020709
to         0.021445
has        0.022112
we         0.023487
have       0.023659
very       0.023785
can        0.024530
are        0.024942
then       0.026887
your       0.028029
know       0.028298
my         0.034375
