In [134]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [258]:
# Load one text file per character from the 'sp/' directory.
characters = ['Adam', 'Eve', 'God', 'Satan']
files = ['sp/' + character for character in characters]
contents = []
for file in files:
    # 'with' closes each handle as soon as it has been read; the earlier
    # open(...).read() form left closing to the garbage collector.
    with open(file, encoding='utf-8', errors='ignore') as handle:
        contents.append(handle.read())

In [263]:
# Two-sentence toy corpus, presumably for checking the TF-IDF output by hand.
# NOTE(review): this rebinds `contents`, so the character texts loaded earlier
# are no longer available under that name until the loading cell is re-run.
contents = ["This is very strange",
          "This is very nice"]

In [266]:
# Fit TF-IDF on the documents in `contents` and tabulate the scores:
# one row per document, one column per vocabulary term.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(contents)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() returns an array, so convert it to a list to keep
# list operations such as feature_names.index(...) working in later cells.
feature_names = vectorizer.get_feature_names_out().tolist()
dense = tfidf_matrix.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

In [267]:
# Display the score table: one row per document, one column per term.
df

Unnamed: 0,is,nice,strange,this,very
0,0.448321,0.0,0.630099,0.448321,0.448321
1,0.448321,0.630099,0.0,0.448321,0.448321


In [248]:
# Rebuild the score table with the character names as row labels, then show it.
df = pd.DataFrame(data=denselist, index=characters, columns=feature_names)
df

Unnamed: 0,abandon,abhor,abide,abject,abjure,able,abode,abolish,abominable,abortive,...,yoke,yon,yonder,you,younger,your,yours,youth,zodiac,zone
Adam,0.0,0.001805,0.0,0.0,0.00229,0.002923,0.0,0.00229,0.0,0.001805,...,0.001805,0.003611,0.00478,0.0,0.00229,0.001195,0.0,0.00229,0.00229,0.00229
Eve,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.003393,0.0,0.0,0.003393,0.0,0.0,0.0,0.0
God,0.0,0.0,0.004788,0.0,0.0,0.003876,0.0,0.0,0.0,0.0,...,0.0,0.0,0.003169,0.0,0.0,0.006338,0.0,0.0,0.0,0.0
Satan,0.003159,0.002491,0.002491,0.006319,0.0,0.002017,0.003159,0.0,0.003159,0.002491,...,0.004982,0.002491,0.001649,0.031594,0.0,0.047812,0.009478,0.0,0.0,0.0


In [249]:
# Score of the term 'next' for every row (character) of the table.
df['next']

Adam     0.001195
Eve      0.010180
God      0.006338
Satan    0.008243
Name: next, dtype: float64

In [250]:
# Take Adam's row of scores and list its non-zero entries, highest first.
s = pd.Series(df.loc['Adam'])
s.loc[s.gt(0)].sort_values(ascending=False)

In [251]:
# Non-zero scores in `s`, smallest first (sort_values() sorts ascending by default).
# NOTE(review): the saved output lists the largest scores first, which does not
# match an ascending sort -- the output looks stale; re-run the cell to confirm.
s[s > 0].sort_values()

and            0.497077
to             0.387147
the            0.316648
of             0.298724
in             0.186404
with           0.144583
his            0.140998
that           0.130244
not            0.127854
my             0.126659
or             0.118295
all            0.115905
me             0.109931
but            0.109931
from           0.103956
this           0.099177
thou           0.096787
what           0.095592
by             0.090812
thee           0.088422
so             0.087228
as             0.082448
be             0.081253
us             0.081253
thy            0.078863
our            0.075279
then           0.075279
which          0.074084
for            0.070499
her            0.069304
                 ...   
went           0.001195
please         0.001195
satisfy        0.001195
remember       0.001195
repose         0.001195
yielded        0.001195
choose         0.001195
appear         0.001195
dread          0.001195
your           0.001195
pleased        0

In [207]:
# Pair each score of the first document with its vocabulary entry and list
# the (score, word) pairs from the highest score down.
adam = denselist[0]
table = [(score, word) for score, word in zip(adam, feature_names)]
sorted(table, key=lambda pair: pair[0], reverse=True)

[(0.49711077707446394, 'and'),
 (0.38717281675991905, 'to'),
 (0.31666912481906956, 'the'),
 (0.29874445737648075, 'of'),
 (0.186416541402924, 'in'),
 (0.1445923173702167, 'with'),
 (0.14100738388169892, 'his'),
 (0.1302525834161456, 'that'),
 (0.12786262775713375, 'not'),
 (0.12666764992762783, 'my'),
 (0.11830280512108637, 'or'),
 (0.11591284946207453, 'all'),
 (0.1099379603145449, 'but'),
 (0.1099379603145449, 'me'),
 (0.1039630711670153, 'from'),
 (0.0991831598489916, 'this'),
 (0.09679320418997976, 'thou'),
 (0.09559822636047384, 'what'),
 (0.09081831504245015, 'by'),
 (0.08842835938343829, 'thee'),
 (0.08723338155393237, 'so'),
 (0.08245347023590868, 'as'),
 (0.08125849240640276, 'be'),
 (0.08125849240640276, 'us'),
 (0.07886853674739092, 'thy'),
 (0.07528360325887315, 'our'),
 (0.07528360325887315, 'then'),
 (0.07408862542936723, 'which'),
 (0.07050369194084946, 'for'),
 (0.06930871411134353, 'her'),
 (0.06572378062282576, 'on'),
 (0.05496898015727245, 'god'),
 (0.05496898015727

In [159]:
def getDistinctiveWords(contents, character, ngramRange, topN=20):
    """Print the highest-scoring TF-IDF terms of one document.

    A fresh TfidfVectorizer (word analyzer, English stop words removed,
    custom token pattern) is fitted on ``contents``. The terms of the
    document at row ``character`` are ranked by score and printed, best
    first; terms with equal scores keep vocabulary order.

    Parameters
    ----------
    contents : sequence of str
        The documents to fit on, one string per document.
    character : int
        Row index into ``contents`` of the document to report on.
    ngramRange : tuple (min_n, max_n) or None
        Passed to the vectorizer as ``ngram_range``. ``(1, 1)`` scores
        single words; ``None`` falls back to ``(1, 1)``.
    topN : int, optional
        Number of terms to print (default 20).

    Returns
    -------
    None
        One ``term  score`` line is printed per term.
    """
    ngram_range = (1, 1) if ngramRange is None else tuple(ngramRange)
    vectorizer = TfidfVectorizer(analyzer='word',
                                 # Raw string: '\w' inside a plain string
                                 # literal is an invalid escape sequence.
                                 token_pattern=r"[a-zA-Z]\w+'?\w*",
                                 stop_words='english',
                                 ngram_range=ngram_range)
    tfidf_matrix = vectorizer.fit_transform(contents)
    # get_feature_names() was removed in scikit-learn 1.2.
    feature_names = vectorizer.get_feature_names_out()
    # Densify only the requested row instead of the whole matrix; tolist()
    # yields plain Python floats so the printed values keep their format.
    words = tfidf_matrix[character].toarray().ravel().tolist()
    toplist = sorted(enumerate(words), key=lambda t: t[1], reverse=True)[:topN]
    for word_id, score in toplist:
        print('{0: <30} {1}'.format(feature_names[word_id], score))

In [162]:
# Print the top-scoring terms for the document at index 1, passing (1, 1) as the n-gram range.
# NOTE(review): index 1 is presumably 'Eve' (second entry of `characters`); this relies on
# `contents` holding the character texts rather than the two toy sentences -- confirm.
getDistinctiveWords(contents, 1, (1,1))

thee                           0.4068069005874434
thy                            0.3894959686475522
thou                           0.32890770685793297
shall                          0.16445385342896648
death                          0.1471429214890753
adam                           0.1270425286171027
god                            0.12117652357923846
love                           0.12117652357923846
fair                           0.10386559163934725
fruit                          0.10386559163934725
hath                           0.10386559163934725
good                           0.09521012566940165
heaven                         0.09521012566940165
till                           0.09521012566940165
day                            0.08655465969945604
tree                           0.08655465969945604
serpent                        0.08469501907806846
night                          0.07789919372951044
sweet                          0.07410814169330991
fear                           0.06

In [102]:
# Peek at the first ten vocabulary entries.
# NOTE(review): the saved output shows three-word phrases, so `feature_names` here
# presumably came from a vectorizer fitted with a trigram range -- confirm.
feature_names[:10]

['abandon fear to',
 'abhor sight hateful',
 'abhor to dream',
 'abide that boast',
 'abide united as',
 'abject posture have',
 'abject thoughts and',
 'abjure when out',
 'able and as',
 'able to make']

In [150]:
def lookupWord(word, dense, feature_names, row=0):
    """Return the score stored for ``word`` in one row of a dense score matrix.

    Parameters
    ----------
    word : str
        Vocabulary entry to look up; it must equal an element of
        ``feature_names`` exactly (for an n-gram vocabulary, the whole phrase).
    dense : 2-D numpy matrix or array
        Scores, indexable as ``dense[row, column]``.
    feature_names : sequence of str
        Vocabulary entries in column order (a list or an array).
    row : int, optional
        Row of ``dense`` to read from; defaults to 0, the row that was
        previously hard-coded.

    Returns
    -------
    float
        The score at ``dense[row, column_of(word)]``.

    Raises
    ------
    ValueError
        If ``word`` is not an element of ``feature_names``.
    """
    # list() lets array-like vocabularies through; arrays have no .index().
    vocabulary = list(feature_names)
    try:
        idx = vocabulary.index(word)
    except ValueError:
        raise ValueError('{!r} is not in feature_names'.format(word)) from None
    # Two-index access returns a scalar for both np.matrix and ndarray input.
    return float(dense[row, idx])

In [151]:
# Look up the score of 'fruit' in the first row of `dense`.
# NOTE(review): the saved run raised ValueError because 'fruit' is not an element of
# `feature_names`; presumably the vocabulary held multi-word phrases at that point -- confirm.
lookupWord('fruit', dense, feature_names)

ValueError: 'fruit' is not in list

In [None]:
# Scores of the document in row 3, each paired with its column index.
words = dense.tolist()[3]
scores = list(enumerate(words))

In [81]:
# Keep the 20 highest-scoring (word_id, score) pairs; sorted() is stable, so
# pairs with equal scores stay in their original order.
toplist = sorted(scores, key=lambda t: t[1], reverse=True)[:20]

In [82]:
# Print up to 20 entries of `toplist` as "<phrase padded to 30 columns> <score>".
for word_id, score in toplist[:20]:
    phrase = feature_names[word_id]
    print('{0: <30} {1}'.format(phrase, score))

and in the                     0.03470451834031863
for whom all                   0.03470451834031863
this new world                 0.03470451834031863
to tell thee                   0.03470451834031863
not to be                      0.027361440879970535
all good to                    0.02313634556021242
all these his                  0.02313634556021242
all these shining              0.02313634556021242
and for him                    0.02313634556021242
and the more                   0.02313634556021242
and this imperial              0.02313634556021242
and to repair                  0.02313634556021242
can it be                      0.02313634556021242
consult how we                 0.02313634556021242
copies have me                 0.02313634556021242
ease to my                     0.02313634556021242
fame in heaven                 0.02313634556021242
for who can                    0.02313634556021242
from him who                   0.02313634556021242
gods of men                   