In [1]:
import nltk

In [2]:
from nltk.corpus import brown #import Brown corpus from nltk

In [3]:
# The Brown Corpus is organised by genre; categories() lists the genre labels.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [4]:
# words() returns the tokens of the requested genre; the [:4] slice keeps only
# the first four tokens of the adventure genre.
brown.words(categories = "adventure")[:4]

['Dan', 'Morgan', 'told', 'himself']

In [5]:
# All tokens of the lore genre. The displayed value is an abbreviated preview
# (note the trailing "..."), not the full token list.
brown.words(categories='lore')

['In', 'American', 'romance', ',', 'almost', 'nothing', ...]

In [6]:
# Text can also be selected by file instead of by genre: a fileid names one
# individual document of the corpus, not a genre. 'cg22' is a single document
# that belongs to the belles_lettres category.
brown.words(fileids=['cg22'])

['Does', 'our', 'society', 'have', 'a', 'runaway', ',', ...]

In [7]:
# sents() splits the selected text into sentences, each returned as a list of
# word tokens. Passing a list of categories selects several genres at once.
brown.sents(categories=['news', 'editorial', 'reviews'])

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [8]:
# Stylistics is the study of systematic differences between genres, and the
# Brown Corpus is a convenient resource for it. As a first step towards
# comparing how genres use modal verbs, count the modals in one genre (news).
news_text = brown.words(categories='news')

# Lower-case every token before counting, so capitalised and lower-case forms
# of the same word share a single count.
fdist = nltk.FreqDist(token.lower() for token in news_text)

modals = ['can', 'could', 'may', 'might', 'must', 'will']

# end=' ' replaces the default newline with a space, so all "<modal>: <count>"
# pairs are printed on a single line.
for modal in modals:
    print(modal + ':', fdist[modal], end=' ')

can: 94 could: 87 may: 93 might: 38 must: 53 will: 389 

In [9]:
# Extend the count to every genre with a conditional frequency distribution:
# the condition is the genre, the sample is the word token.
# Tokens are counted exactly as they appear (no lower-casing here), so these
# counts are case-sensitive, unlike the lower-cased news counts in fdist.
cfd = nltk.ConditionalFreqDist(
    (category, token)
    for category in brown.categories()
    for token in brown.words(categories=category)
)

In [10]:
# Genres to compare; each one becomes a row of the tabulated counts.
genres = [
    'news',
    'religion',
    'hobbies',
    'science_fiction',
    'romance',
    'humor',
]

In [11]:
# Modal verbs to tabulate; each one becomes a column of the comparison table.
# Identical to the list already defined for the news-only counts.
modals = [
    'can',
    'could',
    'may',
    'might',
    'must',
    'will',
]

In [12]:
# Tabulate the modal counts for the selected genres only
# (rows = genres, columns = modals).
# cfd was built from the raw tokens without lower-casing, so these counts are
# case-sensitive: the news row is slightly lower than the lower-cased news
# counts printed earlier (e.g. 'can' is 93 here versus 94 there).
# Observation: 'will' is the most frequent modal in news, whereas 'could' is
# the most frequent modal in romance -- a hint that simple word counts might
# help to distinguish genres.
cfd.tabulate(conditions=genres, samples=modals)

                  can could   may might  must  will 
           news    93    86    66    38    50   389 
       religion    82    59    78    12    54    71 
        hobbies   268    58   131    22    83   264 
science_fiction    16    49     4    12     8    16 
        romance    74   193    11    51    45    43 
          humor    16    30     8     8     9    13 
