In [1]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

In [16]:
# Keyword options shared by the two vectorizers that are fitted on the `filenames` list.
sharedTextOptions = {'input': 'filename', 'stop_words': 'english'}

# Plain term counts, plus two TF-IDF transformers applied to those counts in later cells.
vectorizer = CountVectorizer(**sharedTextOptions)
dfTransformer = TfidfTransformer()
subLinearTransformer = TfidfTransformer(sublinear_tf=True)

# TF-IDF computed in one step, with the vocabulary capped at 5000 features.
limitedVectorizer = TfidfVectorizer(max_features=5000, **sharedTextOptions)

In [17]:
# Input files, one per paragraph position; this order also fixes the column order
# of every DataFrame built from them.
filenames = [
    'chaps-firstParas.txt',
    'chaps-midParas.txt',
    'chaps-lastParas.txt',
]

In [18]:
# Column labels are the file names with the leading 'chaps-' and the trailing '.txt'
# sliced off (the same offsets as 6 and -4, spelled out).
colLabels = [name[len('chaps-'):-len('.txt')] for name in filenames]
colLabels

['firstParas', 'midParas', 'lastParas']

In [86]:
# Fit `vectorizer` (the CountVectorizer) on the files named in `filenames` and keep
# the resulting matrix; it is transposed later so that each file becomes a column.
docTermMatrix = vectorizer.fit_transform(filenames)

In [87]:
# Re-weight the counts in docTermMatrix with `dfTransformer` (a default TfidfTransformer).
tfidfMatrix = dfTransformer.fit_transform(docTermMatrix)

In [88]:
# Same counts, re-weighted by `subLinearTransformer` (TfidfTransformer with sublinear_tf=True).
subTfidfMatrix = subLinearTransformer.fit_transform(docTermMatrix)

In [19]:
# Fit `limitedVectorizer` (TfidfVectorizer with max_features=5000) directly on the files.
limTfidfMatrix = limitedVectorizer.fit_transform(filenames)

In [None]:
# Vocabulary terms of `vectorizer`, used as DataFrame row labels.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out()` and fall back to the old name on older installs.
rowLabels = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)

In [20]:
# Vocabulary terms of `limitedVectorizer`, used as row labels for dfLimTfidf.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out()` and fall back to the old name on older installs.
# NOTE(review): this reuses the name `rowLabels`, replacing the labels taken from
# `vectorizer`; frames built from docTermMatrix must not pick up these labels.
rowLabels = (
    limitedVectorizer.get_feature_names_out()
    if hasattr(limitedVectorizer, 'get_feature_names_out')
    else limitedVectorizer.get_feature_names()
)

In [90]:
# Raw counts with terms as rows and the three paragraph positions as columns.
# The row labels are read straight from `vectorizer`, which produced docTermMatrix:
# the shared `rowLabels` name is reassigned from `limitedVectorizer` in the cell just
# above, so on a top-to-bottom run it no longer matches this matrix.
# `toarray()` yields a plain ndarray instead of the np.matrix that `todense()` returns.
df = pd.DataFrame(
    docTermMatrix.toarray().T,
    columns=colLabels,
    index=(
        vectorizer.get_feature_names_out()
        if hasattr(vectorizer, 'get_feature_names_out')
        else vectorizer.get_feature_names()
    ),
)

In [91]:
# TF-IDF weights with terms as rows and the three paragraph positions as columns.
# tfidfMatrix is derived from docTermMatrix, so its terms are those of `vectorizer`;
# read them from there rather than from the shared `rowLabels` name, which another
# cell reassigns from `limitedVectorizer`.
# `toarray()` yields a plain ndarray instead of the np.matrix that `todense()` returns.
dfTfidf = pd.DataFrame(
    tfidfMatrix.toarray().T,
    columns=colLabels,
    index=(
        vectorizer.get_feature_names_out()
        if hasattr(vectorizer, 'get_feature_names_out')
        else vectorizer.get_feature_names()
    ),
)

In [92]:
# Sublinear TF-IDF weights with terms as rows and paragraph positions as columns.
# subTfidfMatrix is derived from docTermMatrix, so its terms are those of `vectorizer`;
# read them from there rather than from the shared `rowLabels` name, which another
# cell reassigns from `limitedVectorizer`.
# `toarray()` yields a plain ndarray instead of the np.matrix that `todense()` returns.
dfSubTfidf = pd.DataFrame(
    subTfidfMatrix.toarray().T,
    columns=colLabels,
    index=(
        vectorizer.get_feature_names_out()
        if hasattr(vectorizer, 'get_feature_names_out')
        else vectorizer.get_feature_names()
    ),
)

In [21]:
# Limited-vocabulary TF-IDF weights with terms as rows and paragraph positions as columns.
# The row labels are read straight from `limitedVectorizer`, which produced
# limTfidfMatrix, so this cell does not depend on which cell last assigned `rowLabels`.
# `toarray()` yields a plain ndarray instead of the np.matrix that `todense()` returns.
dfLimTfidf = pd.DataFrame(
    limTfidfMatrix.toarray().T,
    columns=colLabels,
    index=(
        limitedVectorizer.get_feature_names_out()
        if hasattr(limitedVectorizer, 'get_feature_names_out')
        else limitedVectorizer.get_feature_names()
    ),
)

In [11]:
def addDistinctivenessScores(df):
    """Add 'midDistinct', 'firstDistinct' and 'lastDistinct' columns to `df` in place.

    Each score is one position's column minus the other two positions' columns.
    `df` must already have 'firstParas', 'midParas' and 'lastParas' columns.
    Nothing is returned; the frame passed in is modified.
    """
    # (new column, own column, first subtrahend, second subtrahend); the order of
    # the rows is the order in which the new columns are appended.
    recipes = (
        ('midDistinct', 'midParas', 'firstParas', 'lastParas'),
        ('firstDistinct', 'firstParas', 'lastParas', 'midParas'),
        ('lastDistinct', 'lastParas', 'firstParas', 'midParas'),
    )
    for target, own, minusA, minusB in recipes:
        df[target] = df[own] - df[minusA] - df[minusB]

In [None]:
# Score each of the three full-vocabulary frames; the helper adds its columns in place.
for dfI in (df, dfTfidf, dfSubTfidf):
    addDistinctivenessScores(dfI)

In [22]:
# Add the three *Distinct columns to the limited-vocabulary frame (modifies dfLimTfidf in place).
addDistinctivenessScores(dfLimTfidf)

# Raw Counts

In [None]:
# Top 15 terms by 'firstDistinct' in the raw-count frame.
(
    df
    .sort_values(by='firstDistinct', ascending=False)
    .head(n=15)
)

In [None]:
# Top 15 terms by 'lastDistinct' in the raw-count frame.
(
    df
    .sort_values(by='lastDistinct', ascending=False)
    .head(n=15)
)

In [None]:
# Top 15 terms by 'midDistinct' in the raw-count frame.
(
    df
    .sort_values(by='midDistinct', ascending=False)
    .head(n=15)
)

# Using TF-IDF

In [120]:
# Top 20 terms by 'firstDistinct' in the TF-IDF frame.
# NOTE(review): the saved output below lists a `diff` column that none of the
# visible cells creates -- confirm where it came from before relying on it.
(
    dfTfidf
    .sort_values(by='firstDistinct', ascending=False)
    .head(n=20)
)

Unnamed: 0,firstParas,midParas,lastParas,diff,midDistinct,firstDistinct,lastDistinct
morning,0.107762,0.043859,0.051558,-0.056204,-0.115461,0.012345,-0.100063
early,0.036501,0.014811,0.015098,-0.021403,-0.036788,0.006592,-0.036213
breakfast,0.023046,0.009439,0.008763,-0.014283,-0.022371,0.004844,-0.023722
afternoon,0.031063,0.01354,0.01287,-0.018193,-0.030393,0.004653,-0.031733
summer,0.022759,0.009023,0.009812,-0.012948,-0.023548,0.003924,-0.021971
autumn,0.007374,0.002029,0.002481,-0.004893,-0.007827,0.002864,-0.006922
winter,0.018943,0.007983,0.008493,-0.01045,-0.019453,0.002467,-0.018433
sunday,0.012306,0.005559,0.004395,-0.007911,-0.011141,0.002351,-0.01347
weather,0.014248,0.006688,0.00533,-0.008918,-0.012889,0.00223,-0.015606
october,0.004655,0.001109,0.001433,-0.003223,-0.004979,0.002113,-0.004332


In [121]:
# Top 20 terms by 'midDistinct' in the TF-IDF frame.
(
    dfTfidf
    .sort_values(by='midDistinct', ascending=False)
    .head(n=20)
)

Unnamed: 0,firstParas,midParas,lastParas,diff,midDistinct,firstDistinct,lastDistinct
replied,0.008011,0.034013,0.022481,0.01447,0.003522,-0.048484,-0.019543
isn,0.00501,0.014171,0.007348,0.002338,0.001813,-0.016509,-0.011833
ve,0.016573,0.048551,0.030187,0.013614,0.001791,-0.062165,-0.034936
retorted,0.000248,0.002428,0.001319,0.001072,0.000861,-0.003499,-0.001356
interposed,0.000546,0.001594,0.000542,-4e-06,0.000506,-0.00159,-0.001598
aren,0.000743,0.00244,0.001206,0.000463,0.000492,-0.002903,-0.001978
hasn,0.000771,0.002652,0.001407,0.000635,0.000474,-0.003287,-0.002016
mustn,0.000445,0.0021,0.001258,0.000813,0.000397,-0.002914,-0.001287
queried,0.000175,0.000747,0.000218,4.4e-05,0.000354,-0.000791,-0.000704
shouldn,0.001238,0.003713,0.002167,0.000928,0.000308,-0.004642,-0.002785


In [122]:
# Top 20 terms by 'lastDistinct' in the TF-IDF frame.
(
    dfTfidf
    .sort_values(by='lastDistinct', ascending=False)
    .head(n=20)
)

Unnamed: 0,firstParas,midParas,lastParas,diff,midDistinct,firstDistinct,lastDistinct
pg,0.020012,0.043192,0.072265,0.052253,-0.049086,-0.095445,0.009061
jondo,0.0,0.0,0.004157,0.004157,-0.004157,-0.004157,0.004157
kissed,0.003107,0.005546,0.012555,0.009448,-0.010116,-0.014994,0.003901
farewell,0.003124,0.003233,0.00989,0.006766,-0.009782,-0.009999,0.003533
bye,0.002668,0.004239,0.009873,0.007205,-0.008302,-0.011444,0.002966
vol,0.000152,0.000209,0.002525,0.002373,-0.002468,-0.002582,0.002164
beverly,0.000276,0.000144,0.00249,0.002214,-0.002622,-0.002359,0.00207
clarenden,0.0,3e-06,0.001924,0.001924,-0.001921,-0.001927,0.001921
muttered,0.002291,0.005368,0.009567,0.007276,-0.006491,-0.012644,0.001908
parted,0.004295,0.003953,0.009751,0.005456,-0.010093,-0.009408,0.001503


# TF-IDF Limited to the Top 5000 Most Frequent Words (MFW)

In [23]:
# Top 20 terms by 'firstDistinct' in the limited-vocabulary TF-IDF frame.
(
    dfLimTfidf
    .sort_values(by='firstDistinct', ascending=False)
    .head(n=20)
)

Unnamed: 0,firstParas,midParas,lastParas,midDistinct,firstDistinct,lastDistinct
morning,0.108085,0.043943,0.051671,-0.115813,0.012471,-0.100357
early,0.03661,0.014839,0.015131,-0.036902,0.00664,-0.036318
breakfast,0.023116,0.009457,0.008783,-0.022441,0.004876,-0.02379
afternoon,0.031156,0.013566,0.012898,-0.030488,0.004692,-0.031823
summer,0.022828,0.00904,0.009833,-0.023621,0.003954,-0.022034
autumn,0.007397,0.002033,0.002487,-0.00785,0.002877,-0.006943
winter,0.018999,0.007998,0.008511,-0.019513,0.00249,-0.018486
sunday,0.012343,0.00557,0.004404,-0.011177,0.002368,-0.013508
weather,0.014291,0.006701,0.005341,-0.012931,0.002248,-0.01565
october,0.004669,0.001111,0.001436,-0.004994,0.002122,-0.004345


In [24]:
# Top 20 terms by 'midDistinct' in the limited-vocabulary TF-IDF frame.
(
    dfLimTfidf
    .sort_values(by='midDistinct', ascending=False)
    .head(n=20)
)

Unnamed: 0,firstParas,midParas,lastParas,midDistinct,firstDistinct,lastDistinct
replied,0.008035,0.034078,0.02253,0.003514,-0.048574,-0.019583
isn,0.005025,0.014198,0.007364,0.001809,-0.016537,-0.011859
ve,0.016622,0.048643,0.030253,0.001767,-0.062274,-0.035012
retorted,0.000248,0.002432,0.001322,0.000862,-0.003506,-0.001358
aren,0.000745,0.002445,0.001208,0.000491,-0.002908,-0.001982
hasn,0.000774,0.002657,0.00141,0.000473,-0.003293,-0.00202
mustn,0.000446,0.002104,0.001261,0.000397,-0.002919,-0.001289
shouldn,0.001242,0.003721,0.002172,0.000307,-0.00465,-0.002791
dey,0.000519,0.002171,0.001366,0.000286,-0.003018,-0.001324
inquired,0.003591,0.005684,0.001856,0.000237,-0.00395,-0.007419


In [25]:
# Top 20 terms by 'lastDistinct' in the limited-vocabulary TF-IDF frame.
(
    dfLimTfidf
    .sort_values(by='lastDistinct', ascending=False)
    .head(n=20)
)

Unnamed: 0,firstParas,midParas,lastParas,midDistinct,firstDistinct,lastDistinct
pg,0.020072,0.043274,0.072424,-0.049222,-0.095625,0.009078
kissed,0.003117,0.005557,0.012583,-0.010142,-0.015023,0.003909
farewell,0.003134,0.003239,0.009912,-0.009807,-0.010018,0.003539
bye,0.002676,0.004247,0.009895,-0.008324,-0.011465,0.002971
muttered,0.002298,0.005378,0.009588,-0.006508,-0.012668,0.001912
parted,0.004308,0.00396,0.009772,-0.01012,-0.009424,0.001504
disappeared,0.007013,0.006453,0.014956,-0.015516,-0.014396,0.001491
sank,0.004127,0.004625,0.010201,-0.009704,-0.010698,0.001449
page,0.004195,0.003768,0.009343,-0.00977,-0.008916,0.00138
asleep,0.008232,0.006921,0.016462,-0.017773,-0.015151,0.001309
