In [1]:
import nltk
from nltk.corpus import gutenberg

In [2]:
milton_paradise = gutenberg.raw('milton-paradise.txt')

In [3]:
milton_paradise[-10:]

u'he End]\x1a\x1a\n'

In [4]:
def text_preprocess(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return(sentences)

In [5]:
milton_paradise_sents = text_preprocess(milton_paradise)

In [6]:
import pandas as pd

In [7]:
pdf = pd.DataFrame({
        'sentences': nltk.sent_tokenize(milton_paradise)
    })
df = spark.createDataFrame(pdf)

In [8]:
df.show(n=5)

+--------------------+
|           sentences|
+--------------------+
|[Paradise Lost by...|
|And chiefly thou,...|
|Say first--for He...|
|Who first seduced...|
|Th' infernal Serp...|
+--------------------+
only showing top 5 rows



In [87]:
sent_to_tag_words(df.head()['sentences'])[:5]

[(u'[', 'JJ'),
 (u'Paradise', 'NNP'),
 (u'Lost', 'VBN'),
 (u'by', 'IN'),
 (u'John', 'NNP')]

In [9]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [84]:
def sent_to_tag_words(sent):
    wordlist = nltk.word_tokenize(sent)
    tagged_words = nltk.pos_tag(wordlist)
    return(tagged_words)
schema = ArrayType(StructType([
            StructField('word', StringType()),
            StructField('pos_tag', StringType())
        ]))
sent_to_tag_words_udf = udf(sent_to_tag_words, schema)
# word_tokenize_udf = udf(lambda x: nltk.word_tokenize(x), ArrayType(StringType()))
# pos_tag_udf = udf(lambda x: nltk.pos_tag(x), ArrayType(ArrayType(StringType())))

In [85]:
df_tagged_words = df.select(sent_to_tag_words_udf(df.sentences).alias('tagged_words'))
df_tagged_words.show(5)

+--------------------+
|        tagged_words|
+--------------------+
|[[[,JJ], [Paradis...|
|[[And,CC], [chief...|
|[[Say,NNP], [firs...|
|[[Who,WP], [first...|
|[[Th,NNP], [',POS...|
+--------------------+
only showing top 5 rows



In [12]:
# df_tokenized_words = df.select(word_tokenize_udf(df.sentences).alias('tokenized_words'))
# df_pos_tagged_words = df_tokenized_words.select(pos_tag_udf(df_tokenized_words.tokenized_words).alias('pos_tagged_words'))

In [13]:
# nltk.pos_tag(df_tokenized_words.head()['tokenized_words'])
# df_pos_tagged_words.show(truncate=False)

## Chunking
**A simple noun phrase chunking**

In [105]:
import nltk
from pyspark.sql.functions import udf
from pyspark.sql.types import *

# define a udf function to chunk noun phrases from pos-tagged words
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunk_parser = nltk.RegexpParser(grammar)

# schema = ArrayType(StructType([
#             StructField('f1', StringType()),
#             StructField('f2', StringType())
#         ]))
chunk_parser_udf = udf(lambda x: str(chunk_parser.parse(x)), StringType())

In [106]:
tagged_word_sent = df_tagged_words.head()['tagged_words']
tagged_word_sent[:5]

[Row(word=u'[', pos_tag=u'JJ'),
 Row(word=u'Paradise', pos_tag=u'NNP'),
 Row(word=u'Lost', pos_tag=u'VBN'),
 Row(word=u'by', pos_tag=u'IN'),
 Row(word=u'John', pos_tag=u'NNP')]

In [134]:
chunk_sent = chunk_parser.parse(tagged_word_sent)
chunk_sent[:50]

[Row(word=u'[', pos_tag=u'JJ'),
 Row(word=u'Paradise', pos_tag=u'NNP'),
 Row(word=u'Lost', pos_tag=u'VBN'),
 Row(word=u'by', pos_tag=u'IN'),
 Row(word=u'John', pos_tag=u'NNP'),
 Row(word=u'Milton', pos_tag=u'NNP'),
 Row(word=u'1667', pos_tag=u'CD'),
 Row(word=u']', pos_tag=u'NNP'),
 Row(word=u'Book', pos_tag=u'NNP'),
 Row(word=u'I', pos_tag=u'PRP'),
 Row(word=u'Of', pos_tag=u'IN'),
 Row(word=u'Man', pos_tag=u'NNP'),
 Row(word=u"'s", pos_tag=u'POS'),
 Tree('NP', [Row(word=u'first', pos_tag=u'JJ'), Row(word=u'disobedience', pos_tag=u'NN')]),
 Row(word=u',', pos_tag=u','),
 Row(word=u'and', pos_tag=u'CC'),
 Tree('NP', [Row(word=u'the', pos_tag=u'DT'), Row(word=u'fruit', pos_tag=u'NN')]),
 Row(word=u'Of', pos_tag=u'IN'),
 Tree('NP', [Row(word=u'that', pos_tag=u'DT'), Row(word=u'forbidden', pos_tag=u'JJ'), Row(word=u'tree', pos_tag=u'NN')]),
 Row(word=u'whose', pos_tag=u'WP$'),
 Tree('NP', [Row(word=u'mortal', pos_tag=u'JJ'), Row(word=u'taste', pos_tag=u'NN')]),
 Row(word=u'Brought', pos_ta

In [108]:
df_NP_chunks = df_tagged_words.select(chunk_parser_udf(df_tagged_words.tagged_words).alias('NP_chunk'))

In [109]:
df_NP_chunks.show(2, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [112]:
NP_chunk_string = df_NP_chunks.head()['NP_chunk']

In [115]:
# nltk.ne_chunk(NP_chunk_string, binary=True)

In [117]:
NP_chunk_tree = nltk.chunk.tagstr2tree(NP_chunk_string)

In [129]:
type(NP_chunk_tree)

nltk.tree.Tree

In [131]:
NP_chunk_tree.leaves()

[(u'(S', None),
 (u'', u'JJ'),
 (u'Paradise', u'NNP'),
 (u'Lost', u'VBN'),
 (u'by', u'IN'),
 (u'John', u'NNP'),
 (u'Milton', u'NNP'),
 (u'1667', u'CD'),
 (u'', u'NNP'),
 (u'Book', u'NNP'),
 (u'I', u'PRP'),
 (u'Of', u'IN'),
 (u'Man', u'NNP'),
 (u"'s", u'POS'),
 (u'(NP', None),
 (u'first', u'JJ'),
 (u'disobedience', u'NN)'),
 (u',', u','),
 (u'and', u'CC'),
 (u'(NP', None),
 (u'the', u'DT'),
 (u'fruit', u'NN)'),
 (u'Of', u'IN'),
 (u'(NP', None),
 (u'that', u'DT'),
 (u'forbidden', u'JJ'),
 (u'tree', u'NN)'),
 (u'whose', u'WP$'),
 (u'(NP', None),
 (u'mortal', u'JJ'),
 (u'taste', u'NN)'),
 (u'Brought', u'NNP'),
 (u'(NP', None),
 (u'death', u'NN)'),
 (u'into', u'IN'),
 (u'the', u'DT'),
 (u'World', u'NNP'),
 (u',', u','),
 (u'and', u'CC'),
 (u'all', u'DT'),
 (u'our', u'PRP$'),
 (u'(NP', None),
 (u'woe', u'NN)'),
 (u',', u','),
 (u'With', u'IN'),
 (u'(NP', None),
 (u'loss', u'NN)'),
 (u'of', u'IN'),
 (u'Eden', u'NNP'),
 (u',', u','),
 (u'till', u'VB'),
 (u'one', u'CD'),
 (u'greater', u'JJR'),


In [133]:
print(nltk.ne_chunk(df_tagged_words.head()['tagged_words']))

(S
  [/JJ
  (ORGANIZATION Paradise/NNP)
  Lost/VBN
  by/IN
  (PERSON John/NNP Milton/NNP)
  1667/CD
  ]/NNP
  Book/NNP
  I/PRP
  Of/IN
  (PERSON Man/NNP)
  's/POS
  first/JJ
  disobedience/NN
  ,/,
  and/CC
  the/DT
  fruit/NN
  Of/IN
  that/DT
  forbidden/JJ
  tree/NN
  whose/WP$
  mortal/JJ
  taste/NN
  (PERSON Brought/NNP)
  death/NN
  into/IN
  the/DT
  World/NNP
  ,/,
  and/CC
  all/DT
  our/PRP$
  woe/NN
  ,/,
  With/IN
  loss/NN
  of/IN
  (GPE Eden/NNP)
  ,/,
  till/VB
  one/CD
  greater/JJR
  Man/NN
  Restore/NNP
  us/PRP
  ,/,
  and/CC
  regain/VB
  the/DT
  blissful/JJ
  seat/NN
  ,/,
  (GPE Sing/NNP)
  ,/,
  (PERSON Heavenly/NNP Muse/NNP)
  ,/,
  that/WDT
  ,/,
  on/IN
  the/DT
  secret/JJ
  top/NN
  Of/IN
  (GPE Oreb/NNP)
  ,/,
  or/CC
  of/IN
  (GPE Sinai/NNP)
  ,/,
  didst/NN
  inspire/NN
  That/WDT
  shepherd/NN
  who/WP
  first/RB
  taught/VBD
  the/DT
  chosen/NN
  seed/NN
  In/IN
  the/DT
  beginning/NN
  how/WRB
  the/DT
  heavens/NNS
  and/CC
  earth/NN
  Rose/NNP
 