In [1]:

from nltk.corpus import reuters
 
# Display the ids of all files contained in the corpus.
print(reuters.fileids())


['test/14826', 'test/14828', 'test/14829', 'test/14832', 'test/14833', 'test/14839', 'test/14840', 'test/14841', 'test/14842', 'test/14843', 'test/14844', 'test/14849', 'test/14852', 'test/14854', 'test/14858', 'test/14859', 'test/14860', 'test/14861', 'test/14862', 'test/14863', 'test/14865', 'test/14867', 'test/14872', 'test/14873', 'test/14875', 'test/14876', 'test/14877', 'test/14881', 'test/14882', 'test/14885', 'test/14886', 'test/14888', 'test/14890', 'test/14891', 'test/14892', 'test/14899', 'test/14900', 'test/14903', 'test/14904', 'test/14907', 'test/14909', 'test/14911', 'test/14912', 'test/14913', 'test/14918', 'test/14919', 'test/14921', 'test/14922', 'test/14923', 'test/14926', 'test/14928', 'test/14930', 'test/14931', 'test/14932', 'test/14933', 'test/14934', 'test/14941', 'test/14943', 'test/14949', 'test/14951', 'test/14954', 'test/14957', 'test/14958', 'test/14959', 'test/14960', 'test/14962', 'test/14963', 'test/14964', 'test/14965', 'test/14967', 'test/14968', 'test

In [2]:
# Corpus size: how many files it holds (10788).
print(len(reuters.fileids()))

# Categories attached to one training document -> ['interest', 'money-fx']
print(reuters.categories('training/999'))

# Full raw text of one test document.
print(reuters.raw('test/14829'))

10788
['interest', 'money-fx']
JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS
  The Ministry of International Trade and
  Industry (MITI) will revise its long-term energy supply/demand
  outlook by August to meet a forecast downtrend in Japanese
  energy demand, ministry officials said.
      MITI is expected to lower the projection for primary energy
  supplies in the year 2000 to 550 mln kilolitres (kl) from 600
  mln, they said.
      The decision follows the emergence of structural changes in
  Japanese industry following the rise in the value of the yen
  and a decline in domestic electric power demand.
      MITI is planning to work out a revised energy supply/demand
  outlook through deliberations of committee meetings of the
  Agency of Natural Resources and Energy, the officials said.
      They said MITI will also review the breakdown of energy
  supply sources, including oil, nuclear, coal and natural gas.
      Nuclear energy provided the bulk of Japan's electric power
 

In [3]:
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
 
# Tokens to discard: NLTK's English stop words followed by each punctuation character.
stop_words = [*stopwords.words('english'), *punctuation]
 

In [4]:
def tokenize(text):
    """Split ``text`` into lower-cased tokens, dropping stop words and numbers.

    The text is tokenized with ``word_tokenize`` and every token is
    lower-cased. Tokens that appear in the module-level ``stop_words``
    list, or that consist solely of digits, are discarded.

    Returns the surviving tokens as a list, in their original order.
    """
    lowered = (token.lower() for token in word_tokenize(text))
    return [
        token
        for token in lowered
        if not (token in stop_words or token.isdigit())
    ]

In [5]:
# Gather every distinct token of the corpus in a single sweep over its files.
vocabulary = set()
for file_id in reuters.fileids():
    words = tokenize(reuters.raw(file_id))
    vocabulary |= set(words)


In [6]:

# Freeze the vocabulary into a *sorted* list so each term receives the same
# index on every run; iterating a set of strings directly yields an order that
# can change between interpreter sessions because of hash randomisation.
vocabulary = sorted(vocabulary)
word_index = {w: idx for idx, w in enumerate(vocabulary)}

VOCABULARY_SIZE = len(vocabulary)
DOCUMENTS_COUNT = len(reuters.fileids())

# Report only the sizes; printing the whole word_index mapping would flood the
# output with tens of thousands of entries.
print(VOCABULARY_SIZE, DOCUMENTS_COUNT)  # 51558 10788
 

51558 10788


In [7]:
import numpy as np
 
# First pass: accumulate, per vocabulary term, the number of files containing it.
word_idf = np.zeros(VOCABULARY_SIZE)
for file_id in reuters.fileids():
    # A set makes each term count once per file, however often it repeats.
    words = set(tokenize(reuters.raw(file_id)))
    indexes = [word_index[word] for word in words]
    word_idf[indexes] += 1.0

# Turn the counts into inverse document frequencies: log(N / (1 + count)).
word_idf = np.log(DOCUMENTS_COUNT / (word_idf + 1.0))
print(word_idf[word_index['deliberations']])  # 7.49443021503
print(word_idf[word_index['committee']])      # 3.61286641709
 

7.494430215031565
3.612866417088128


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
 
# `tokenize` already lower-cases its tokens and removes every entry of
# `stop_words`, so the vectorizer is not given `stop_words` again: doing so is
# redundant and makes scikit-learn warn that the stop words "may be
# inconsistent with your preprocessing". `token_pattern` is unused whenever a
# custom tokenizer is supplied, so it is set to None explicitly.
tfidf = TfidfVectorizer(tokenizer=tokenize, token_pattern=None, vocabulary=vocabulary)

# Fit the TfIdf model on the raw text of every file in the corpus
tfidf.fit([reuters.raw(file_id) for file_id in reuters.fileids()])

# Transform a document into TfIdf coordinates
X = tfidf.transform([reuters.raw('test/14829')])


# Check out some frequencies (weights of individual terms in that document)
print(X[0, tfidf.vocabulary_['year']])        # 0.0562524229373
print(X[0, tfidf.vocabulary_['following']])   # 0.057140265658
print(X[0, tfidf.vocabulary_['provided']])    # 0.0689364372666
print(X[0, tfidf.vocabulary_['structural']])  # 0.0900802810906
print(X[0, tfidf.vocabulary_['japanese']])    # 0.114492409303
print(X[0, tfidf.vocabulary_['downtrend']])   # 0.111137191743
 

  sorted(inconsistent))


0.056252422937258545
0.057140265657981984
0.06893643726660394
0.09008028109060313
0.1144924093027866
0.11113719174292074
