In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#### Overview of NLTK collection 
The NLTK collection is a set of document collections, possibly drawn from different sources. Each collection is called a corpus. Some popular corpora in NLTK are:

* gutenberg
* brown
* word_net (thesaurus)
* books
* movie_reviews
* chat_logs

Because they are collected from different sources, they may be stored in different formats such as txt or xml. It is therefore advisable to access them only through the access tools provided by NLTK.

On Windows these documents are stored in the following directory:
\users\username\AppData\Roaming\nltk_data\corpora

Each document is a text file. The following example shows how to access these files via NLTK.

In [None]:
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# Load the raw text of 'bible-kjv.txt' from the Gutenberg corpus
sample = gutenberg.raw("bible-kjv.txt")
# Split the same raw text into sentences and into word tokens
sent1 = sent_tokenize(sample)
word1 = word_tokenize(sample)

In [None]:
# Preview the first ten sentences; slicing (rather than indexing 0..9)
# cannot raise IndexError if fewer than ten sentences are available.
for sentence in sent1[:10]:
    print(sentence)

In [None]:
# Preview the first ten word tokens; slicing (rather than indexing 0..9)
# cannot raise IndexError if fewer than ten tokens are available.
for token in word1[:10]:
    print(token)

#### The brown corpus:
The Brown corpus is a very popular corpus for research and model building in text mining. We will use this corpus to demonstrate typical text mining tasks. These are:
* Listing files
* Listing categories
* Finding size of categories
* Finding length of files
* Studying lexical diversity
* Tabulating word occurrences

In [None]:
from nltk.corpus import brown

In [None]:
# List the file identifiers that make up the Brown corpus
brown.fileids()

In [None]:
# List the category labels available in the Brown corpus
brown.categories()

In [None]:
# View the words of the 'news' category only
brown.words(categories=['news'])

In [None]:

# Build a per-genre summary of the Brown corpus: number of tokens, number of
# distinct word types, and their ratio (tokens per type, rounded to 2 places).
genre = ['hobbies', 'humor', 'science_fiction', 'news', 'romance', 'religion']

# Collect one record per genre and create the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
rows = []
for categ in genre:
    # Read the category once and reuse it for both counts
    words = brown.words(categories=[categ])
    tokens = len(words)
    types = len(set(words))
    ldiv = round(tokens/types, 2)
    rows.append({'genre': categ, 'tokens': tokens, 'types': types, 'lex_diversity': ldiv})

# Every row keeps the blank ' ' index label so the rendered table is unchanged
df = pd.DataFrame(rows, index=[' '] * len(rows))
df


In [None]:
import nltk
# Count every word under its Brown category, then tabulate how often a few
# modal verbs occur in the selected genres.
category_word_pairs = (
    (category, token)
    for category in brown.categories()
    for token in brown.words(categories=category)
)
cfd = nltk.ConditionalFreqDist(category_word_pairs)

genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

#### Example of temporal analysis
In this example we visualize how the words are being used at different times in history.

In [None]:
from nltk.corpus import inaugural
import matplotlib.pyplot as plt
# Set a larger default matplotlib figure size
plt.rcParams['figure.figsize'] = [14, 8]
# Peek at the first five file identifiers of the inaugural corpus
inaugural.fileids()[:5]

In [None]:
def _matching_pairs(prefixes=('america', 'citizen')):
    """Yield (prefix, label) for each inaugural word starting with a prefix.

    The label is the first four characters of the file id (presumably the
    year of the address -- confirm against inaugural.fileids()).
    """
    for address_id in inaugural.fileids():
        label = address_id[:4]
        for token in inaugural.words(address_id):
            lowered = token.lower()
            for prefix in prefixes:
                if lowered.startswith(prefix):
                    yield prefix, label


# One frequency distribution per prefix, keyed by the file-id label
cfd = nltk.ConditionalFreqDist(_matching_pairs())
cfd.plot()

Another interesting example is the length of the words that different languages use to express the same idea. From the graph below you can see that English uses more short words than the other European languages shown. (udhr = Universal Declaration of Human Rights)

In [None]:
from nltk.corpus import udhr
languages = ['English', 'German_Deutsch', 'Hungarian_Magyar']

# Pair each word's length with its language; the UDHR file id is the
# language name followed by the '-Latin1' suffix.
length_by_language = (
    (language, len(token))
    for language in languages
    for token in udhr.words(language + '-Latin1')
)
cfd = nltk.ConditionalFreqDist(length_by_language)
cfd.plot(cumulative=True)

In [None]:
# List the UDHR file identifiers (names such as 'English-Latin1' used below)
udhr.fileids()

from nltk.corpus import udhr
languages = ['English-Latin1', 'German_Deutsch-Latin1', 'Hindi-UTF8']

# Tally word lengths per UDHR file with an explicit loop; each
# (file id, word length) observation increments one counter.
cfd = nltk.ConditionalFreqDist()
for lang_file in languages:
    for token in udhr.words(lang_file):
        cfd[lang_file][len(token)] += 1

cfd.plot(cumulative=True)
plt.show()

Can we guess the gender of a person from their name? See the illustration below:

In [None]:
names = nltk.corpus.names
name_files = names.fileids()

# For every file in the names corpus, count the final letter of each name
last_letter_pairs = (
    (name_file, entry[-1])
    for name_file in name_files
    for entry in names.words(name_file)
)
cfd = nltk.ConditionalFreqDist(last_letter_pairs)
cfd.plot()

In the following tasks, we will demonstrate how to get documents from different formats (or output from different software programs) like
* Web (.htm, .html, .xml)
* pdf (.pdf)
* word (.docx)
* text (.txt)

In [None]:
# Reading raw text files

from urllib import request
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
# Download the document and decode it as UTF-8. The context manager closes
# the HTTP response as soon as the body has been read.
with request.urlopen(url) as response:
    raw = response.read().decode('utf8')
print(type(raw))
print(len(raw))
print(raw[:75])

In [None]:
# Tokenise the downloaded text and report the container type, the number of
# tokens and the first fifteen tokens (one item per output line).
tokens = nltk.word_tokenize(raw)
print(type(tokens), len(tokens), tokens[:15], sep='\n')

In [None]:
# Wrap the token list in an nltk.Text and show its type plus a slice of it
# (one item per output line).
text101 = nltk.Text(tokens)
print(type(text101), text101[1020:1060], sep='\n')

In [None]:
# reading HTML pages

url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
# Download the page and decode it as UTF-8. The context manager closes the
# HTTP response instead of leaving the connection open.
with request.urlopen(url) as page:
    html = page.read().decode('utf8')
html[:60]

In [None]:
from bs4 import BeautifulSoup
# Parse the HTML, keep only its text content, and tokenise that text
soup = BeautifulSoup(html, 'html.parser')
raw = soup.get_text()
tokens = nltk.word_tokenize(raw)
tokens[125:160]

In [None]:
# Keep tokens 110..593 of the page text. The slice is taken from a fresh
# tokenisation of `raw`, so re-running this cell yields the same tokens
# instead of slicing an already-sliced list again.
tokens = nltk.word_tokenize(raw)[110:594]
text = nltk.Text(tokens)
# Show every occurrence of 'gene' with its surrounding context
text.concordance('gene')

In [None]:
# ' '.join(tokens)

In [None]:
# read local files
import os
# NOTE: hard-coded, machine-specific folder. The cells below open their files
# relative to this working directory, so adjust the path before running.
os.chdir("C:/Users/Sid/Desktop/Text Mining/Day1/Decks")
# The context manager closes the file handle once the text has been read
with open('wolf.txt') as f:
    raw = f.read()
raw

In [None]:
# Print the file line by line with surrounding whitespace removed; the
# context manager closes the file handle when the loop finishes.
with open('wolf.txt') as f:
    for line in f:
        print(line.strip())


In [None]:
# reading pdf files

# !pip install PyPDF2

In [None]:
import PyPDF2

In [None]:
# Open the PDF in binary mode; the handle stays open for the reader and is
# closed explicitly in a later cell.
pdfFileObj = open('boy and wolf.pdf', 'rb')
# NOTE(review): PdfFileReader / numPages look like legacy PyPDF2 names that
# newer releases replace with PdfReader / len(reader.pages) -- confirm
# against the installed PyPDF2 version.
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
print(pdfReader.numPages)

In [None]:
# Extract and print the text of the first page (index 0).
# NOTE(review): getPage / extractText look like legacy PyPDF2 names that
# newer releases replace with reader.pages[i] / extract_text() -- confirm
# against the installed PyPDF2 version.
pageObj = pdfReader.getPage(0)
# Note: `f` is rebound here from the earlier file handle to the page text
f = pageObj.extractText()
print(f)

In [None]:
# Release the PDF file handle opened above
pdfFileObj.close()

In [None]:
# !pip install python-docx

In [None]:
import docx
# Open the Word document and count its paragraphs
doc = docx.Document('boy and wolf.docx')
len(doc.paragraphs)

In [None]:
# Text of the first paragraph
doc.paragraphs[0].text

In [None]:
# Text of the third paragraph (index 2)
doc.paragraphs[2].text

In [None]:
###############  End of Lab ###################