In [None]:
# Make the NLTK functions available.
import nltk

Some features are not installed by default and need to be installed manually
by invoking the following commands via the command line:

To open a Python session:

`> python`
  
Make nltk available:

`> import nltk`
  
Start the NLTK downloader:

`> nltk.download()`
  
The NLTK downloader now starts. Make sure that the following packages are installed, and install any that are missing:
* 'Corpora', alpino - Alpino Dutch Treebank
* 'Corpora', stopwords - Stopwords Corpus
* 'Corpora', treebank - Penn Treebank Sample
* 'Models', averaged_perceptron_tagger - Averaged Perceptron Tagger
* 'Models', maxent_treebank_pos_tagger - Treebank Part of Speech Tagger (Maximum entropy)
* 'Models', tagset - Help on Tagsets

## Loading a corpus

You might need to replace the filename `"2016.txt"` with the name of one of your own files of interest.

In [None]:
# Build a corpus from your own plain-text files.
# Based on http://www.nltk.org/book/ch02.html, 1.9 Loading your own Corpus

from nltk.corpus import PlaintextCorpusReader

# Folder that holds the documents of interest.
corpusRoot = './per_year'
# The pattern '.*' matches every file name, so all files in the root folder
# become part of the corpus.
wordlists = PlaintextCorpusReader(root=corpusRoot, fileids='.*')

In [None]:
# List the names of the files that make up the corpus.
print("These filenames are in folder {}".format(corpusRoot),
      wordlists.fileids(),
      sep="\n")

In [None]:
# Show the words found in one file of the corpus.
print("\nThese words are in file 2016.txt")
print(wordlists.words(fileids='2016.txt'))

### Load the words and preprocess

In [None]:
# Read the words of the file that the rest of the walkthrough analyses.
words = wordlists.words(fileids='2016.txt')

In [None]:
# Lowercase every word so that 'De' and 'de' are counted as the same word.
words = [token.lower() for token in words]

In [None]:
# Load the Dutch stopword list that ships with NLTK.
# This needs the 'stopwords' corpus: if NLTK reports that the resource is
# missing, install it once with nltk.download('stopwords').
from nltk.corpus import stopwords
stopwordList = stopwords.words('dutch')

Luckily, NLTK provides a stopword list for Dutch.

If you want to use a language for which NLTK provides no stopwords, or a stopword list of your own, remove the `#` at the start of each line in the cell below and run it.
Your own stopword list should be a comma-separated list of words, such as:
`en,de,het`

In [None]:
#with open("stopwords.txt", encoding="utf-8") as f:
#    # Store the words in stopwordList, the name the stopword filter reads.
#    # strip() removes spaces and the trailing newline around each word.
#    stopwordList = [word.strip() for word in f.read().split(",")]

In [None]:
# Drop every word that occurs in the stopword list.
words = [token for token in words if token not in stopwordList]

## Word frequencies

In [None]:
# Build a frequency distribution: how often does each word occur?
fdist = nltk.FreqDist(samples=words)

In [None]:
# Show the ten most frequent words, one "word<TAB>frequency" pair per line.
print("\nThese are the 10 most common words")
for entry in fdist.most_common(10):
    # Each entry is a (word, frequency) pair.
    print(u'{}\t{}'.format(*entry))

The code above creates a frequency distribution object first, and then asks for the most common words with the function `most_common` with the parameter `10`.

Note that you can change this parameter to any number of most common words you want.

Stopwords will bloat the results of this method. If any such words slipped through, add them to the stopword list (`stopwordList`) and repeat the stopword removal step.

## Part of speech

In [None]:
# Based on http://www.nltk.org/book/ch05.html

# The tags come from the default Penn Treebank tagset; a complete overview is at
# http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# NLTK can print the definition and some examples of a tag:
print("\nThe tag 'NN' means:")
nltk.help.upenn_tagset(tagpattern='NN')

In [None]:
# Give every word a part-of-speech tag from the Penn Treebank tagset.
taggedWords = nltk.pos_tag(words)
# Each element of taggedWords pairs a word with its tag.
print("\nThe first 10 words are tagged as:", taggedWords[:10], sep="\n")

In [None]:
# Alternatively, tag with a Dutch tagger for more accurate results.
# (The tagger used above should in fact not be used for non-English texts.)
# The model was created using: https://github.com/evanmiltenburg/Dutch-tagger
from nltk.tag.perceptron import PerceptronTagger

# load=False presumably skips NLTK's default model, so that the Dutch model
# can be read from file instead - TODO confirm for your NLTK version.
# NOTE(review): judging by its name the model is a pickle file; only load
# pickle files that come from a source you trust.
tagger = PerceptronTagger(load=False)
tagger.load('model.perc.dutch_tagger_small.pickle')

# Replace the earlier tagging result with the Dutch tags.
taggedWords = tagger.tag(words)
print("\nThe first 10 words are tagged as:")
print(taggedWords[0:10])

`taggedWords` contains all the words and their tags.
You can explore it with `print(taggedWords)` to see all words, or with slicing, `print(taggedWords[x:y])`, to see only the words from position `x` up to (but not including) position `y`.

## Collocations

Collocations are multiple words that commonly occur together.
Some examples are:

- `(hundred, years)`
- `(living, creature)`

We focus on 2-word combinations (bigrams) for now.
Any kind of word combination can be used: 3-word (trigram), ..., `n`-word (`n`-gram).

If you're interested in the latter, use the reference materials below or search for NLTK `n`-gram methods yourself.
Adapting the code shouldn't be difficult if you can find NLTK's methods.

In [None]:
# Based on http://www.nltk.org/howto/collocations.html
import nltk.collocations
import collections

# Collect all bigrams of the tagged text; every bigram member is a (word, tag) pair.
bigramMeasures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(taggedWords)
# Score every bigram with the likelihood ratio. The result is a list of
# bigram tuples together with their score:
#   [((('word1', 'tag1'), ('word2', 'tag2')), likelihood ratio), ...]
# For example:
#   [((('de', 'IN'), ('president', 'NN')), 0.019015), ...]
scored = finder.score_ngrams(score_fn=bigramMeasures.likelihood_ratio)

The code below shows the first `n`-grams for some tag of interest.
By default it is set to singular nouns, but you can change this to any tag that is in the tagger that you used.

Note that each element `x` is a `((('word1', 'tag1'), ('word2', 'tag2')), likelihood ratio)`.
- So `x[0]` selects `(('word1', 'tag1'), ('word2', 'tag2'))`,
- then `x[0][0]` selects `('word1', 'tag1')`,
- and finally `x[0][0][1]` selects `'tag1'`.

If you're not interested in particular parts of speech, comment out the tag-line by placing a comment sign `#` in front of the line of code.
Also remove the comment sign `#` from the line below it to use that line of code.

*Commenting/uncommenting lines of code is not good coding practice, but it's easier than a proper workaround for now.*

In [None]:
# Keep only the bigrams whose first word is tagged as 'nounsg'.
# NOTE: this overwrites `scored`, so re-run the scoring cell first if you
# want to filter on a different tag.
scored = [(bigram, ratio) for bigram, ratio in scored if bigram[0][1] == 'nounsg']
#scored = [x for x in scored]

# Display the five bigrams at the front of the list.
print("\nThe first bigrams found:")
print(scored[0:5])

In [None]:
# Index the bigrams by their first (word, tag) pair:
#   {('word1', 'tag1'): [(('word2', 'tag2'), likelihood ratio), ...]}
prefixKeys = collections.defaultdict(list)
for (first, second), ratio in scored:
    prefixKeys[first].append((second, ratio))

In [None]:
# Order the collocates of every first word by strength of association,
# so that the most strongly associated bigrams come first.
for collocates in prefixKeys.values():
    collocates.sort(key=lambda pair: -pair[1])

# Note that `scored` itself is not changed by the sorting above.
print("\nThe best bigrams ordered by score:")
print(scored[0:5])

In [None]:
# Look up the collocations of these words where they are tagged as 'nounsg'.
print("\nThe top 5 collocations found:")
searches = (
    ('president:', ('president', 'nounsg')),
    ('zuid:', ('zuid', 'nounsg')),
)
for label, prefix in searches:
    # Only the five strongest collocates are shown.
    print(label, prefixKeys[prefix][:5])

You can change the parameters of the collocation search by replacing the search terms and the tag of interest.
Use list slicing `[x:y]` to show the results from position `x` up to (but not including) position `y`,
or `[:y]` for the first `y` results.

You can use this to gain an understanding of which words often occur together.

## TF-IDF

TF-IDF is not a native function of NLTK.

It is possible to use some other library instead, such as scikit-learn.
That library is installed together with Anaconda.

For examples, see
http://www.bogotobogo.com/python/NLTK/tf_idf_with_scikit-learn_NLTK.php

or the `Tf-Idf in Scikit-Learn` part of http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html

# Per year

All the code above is combined here to iterate over the data of every year.
You can query the results almost exactly as above; the only difference is that you first select a year.

Example:
`fdist.most_common(10)`
becomes
`fdists['2016'].most_common(10)`

In [None]:
# Build all per-year results in one pass over the corpus files.
#
# Every dictionary below is keyed by year, i.e. the part of a file name in
# front of its first '.' ('2016.txt' -> '2016'):
#   allWords    - the lowercased words with all stopwords removed
#   fdists      - the word frequency distribution
#   tagged      - the words with their tags, as produced by `tagger`
#   bigramScore - [((('word1', 'tag1'), ('word2', 'tag2')), likelihood ratio), ...]
#                 limited to bigrams whose first word is tagged as 'nounsg'
#   bigramKeys  - {('word1', 'tag1'): [(('word2', 'tag2'), likelihood ratio), ...]}
#
# The loop uses its own helper names (yearFinder, yearScored, yearKeys) so that
# `finder`, `scored` and `prefixKeys` from the single-file walkthrough above
# keep their values.

# Remember which file(s) belong to each year instead of assuming that every
# file is called '<year>.txt'.
yearFiles = {}
for fileid in wordlists.fileids():
    yearName = fileid.split('.')[0]
    if not yearName:
        # Hidden files such as '.DS_Store' have nothing in front of the dot.
        continue
    yearFiles.setdefault(yearName, []).append(fileid)
years = list(yearFiles)

# A set keeps the stopword test fast, however long the texts are.
stopwordSet = set(stopwordList)
# One measures object is enough for all years.
bigramMeasures = nltk.collocations.BigramAssocMeasures()

allWords = {}
fdists = {}
tagged = {}
bigramScore = {}
bigramKeys = {}
for year in years:
    # Load the words, make them lowercase and filter out all stopwords.
    lowered = (word.lower() for word in wordlists.words(yearFiles[year]))
    allWords[year] = [word for word in lowered if word not in stopwordSet]

    # Create the word frequency distribution.
    fdists[year] = nltk.FreqDist(allWords[year])

    # Tag the text.
    tagged[year] = tagger.tag(allWords[year])

    # Score the bigrams and keep those whose first word is tagged 'nounsg'.
    yearFinder = nltk.collocations.BigramCollocationFinder.from_words(tagged[year])
    yearScored = [
        x for x in yearFinder.score_ngrams(bigramMeasures.likelihood_ratio)
        if x[0][0][1] == 'nounsg'
    ]
    bigramScore[year] = yearScored

    # Group the bigrams by their first (word, tag) pair, strongest first.
    yearKeys = collections.defaultdict(list)
    for key, score in yearScored:
        yearKeys[key[0]].append((key[1], score))
    for key in yearKeys:
        yearKeys[key].sort(key=lambda x: -x[1])
    bigramKeys[year] = yearKeys

# Your research

Use the above functions to research some interesting facts.
Slight alterations might be necessary, but most of the code below should only be function calls and print statements.

In [None]:
# Insert your research code here.
# Change this value to inspect the results of another year.
selectedYear = '2016'

# The ten most common words of that year.
print(fdists[selectedYear].most_common(10))

# The first ten tagged words.
print(tagged[selectedYear][:10])

# The first five scored bigrams.
print(bigramScore[selectedYear][:5])

# The five strongest collocations of 'president' used as 'nounsg'.
print('president:', bigramKeys[selectedYear][('president', 'nounsg')][:5])