In [1]:
import nltk

In [2]:
def Learn_Default_Tagger(simple_sentence):
    words_in_sentence = nltk.word_tokenize(simple_sentence)
    tagger = nltk.DefaultTagger("NN")
    pos_enable_tag = tagger.tag(words_in_sentence)
    print(pos_enable_tag)

In [3]:
def Learn_Re_Tagger(simple_sentence):
    customer_patterns = [
        (r".*ing$", "ADJECTIVE"),
        (r".*ly$", "ADVERB"),
        (r".*ion", "NOUN"),
        (r"(.*ate|.*en|is)$", "VERB"),
        (r"^an$", "INDEFINITE-ARTICLE"),
        (r"^(with|on|at)$", "PREPOSITION"),
        (r"^\-?[0-9]+(\.[0-9]+)$", "NUMBER"),
        (r".*$", None)
    ]
    tagger = nltk.RegexpTagger(customer_patterns)
    words_in_sentence = nltk.word_tokenize(simple_sentence)
    pos_enable_tags = tagger.tag(words_in_sentence)
    print(pos_enable_tags)

In [4]:
def Learn_LookUp_Tagger(simple_sentence):
    mapping = {
        ".": ".",
        "place": "NN",
        "on": "IN",
        "earth": "NN",
        "Mysore": "NNP",
        "is": "VBZ",
        "an": "DT",
        "amazing": "JJ"
    }
    tagger = nltk.UnigramTagger(model= mapping)
    words_in_sentence = nltk.word_tokenize(simple_sentence)
    pos_enable_tag = tagger.tag(words_in_sentence)
    print(pos_enable_tag)

In [5]:
if __name__ == "__main__":
    test_sentence = "Mysore is an amazing place on earth. I have visited Mysore 10 times."
    Learn_Default_Tagger(test_sentence)
    Learn_Re_Tagger(test_sentence)
    Learn_LookUp_Tagger(test_sentence)

[('Mysore', 'NN'), ('is', 'NN'), ('an', 'NN'), ('amazing', 'NN'), ('place', 'NN'), ('on', 'NN'), ('earth', 'NN'), ('.', 'NN'), ('I', 'NN'), ('have', 'NN'), ('visited', 'NN'), ('Mysore', 'NN'), ('10', 'NN'), ('times', 'NN'), ('.', 'NN')]
[('Mysore', None), ('is', 'VERB'), ('an', 'INDEFINITE-ARTICLE'), ('amazing', 'ADJECTIVE'), ('place', None), ('on', 'PREPOSITION'), ('earth', None), ('.', None), ('I', None), ('have', None), ('visited', None), ('Mysore', None), ('10', None), ('times', None), ('.', None)]
[('Mysore', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('amazing', 'JJ'), ('place', 'NN'), ('on', 'IN'), ('earth', 'NN'), ('.', '.'), ('I', None), ('have', None), ('visited', None), ('Mysore', 'NNP'), ('10', None), ('times', None), ('.', '.')]


In [6]:
import pickle

In [7]:
def Sample_Data():
    return [
        "Paris is the capital of France.",
        "Steve Jobs was the CEO of Apple.",
        "iPhone was Invented by Apple.",
        "Books can be purchased in Market."
    ]

In [8]:
def Build_Dictionary():
    dictionary = {}
    for sent in Sample_Data():
        parts_of_speech_tags = nltk.pos_tag(nltk.word_tokenize(sent))
        for tag in parts_of_speech_tags:
            value = tag[0]
            pos = tag[1]
            dictionary[value] = pos
    return dictionary

In [9]:
def Save_my_Tagger(tagger, file_name):
    file_handle = open(file_name, "wb")
    pickle.dump(tagger, file_handle)
    file_handle.close()

In [10]:
def Save_my_Traning(file_name):
    tagger = nltk.UnigramTagger(model= Build_Dictionary())
    Save_my_Tagger(tagger, file_name)

In [11]:
def Load_my_Tagger(file_name):
    return pickle.load(open(file_name, "rb"))

In [12]:
sentence = "Iphone is purchased by Steve Jobs in Paris Market"
file_name = "my_Tagger.pickle"

In [13]:
Save_my_Traning(file_name)

In [14]:
my_Tagger = Load_my_Tagger(file_name)

In [15]:
print(my_Tagger.tag(nltk.word_tokenize(sentence)))

[('Iphone', None), ('is', 'VBZ'), ('purchased', 'VBN'), ('by', 'IN'), ('Steve', 'NNP'), ('Jobs', 'NNP'), ('in', 'IN'), ('Paris', 'NNP'), ('Market', 'NNP')]


In [16]:
import string
from nltk.parse.generate import generate

In [17]:
productions = [
    "ROOt -> WORD",
    "WORd -> ' '",
    "WORD -> NUMBER LETTER",
    "WORD -> LETTER NUMBER"
]

In [18]:
digits = list(string.digits)

for digit in digits[:4]:
    productions.append("NUMBER -> '{w}'".format(w= digit))

In [19]:
letters = "' | '".join(list(string.ascii_lowercase)[:4])
productions.append("LETTER -> '{w}'".format(w= letters))

In [20]:
grammar_string = "\n".join(productions)

In [21]:
grammar = nltk.CFG.fromstring(grammar_string)
print(grammar)

Grammar with 12 productions (start state = ROOt)
    ROOt -> WORD
    WORd -> ' '
    WORD -> NUMBER LETTER
    WORD -> LETTER NUMBER
    NUMBER -> '0'
    NUMBER -> '1'
    NUMBER -> '2'
    NUMBER -> '3'
    LETTER -> 'a'
    LETTER -> 'b'
    LETTER -> 'c'
    LETTER -> 'd'


In [22]:
for sentence in generate(grammar, n= 5, depth= 5):
    palindrom = "".join(sentence).replace(" ", "")
    print("Generated Word: {}".format(palindrom, len(palindrom)))

Generated Word: 0a
Generated Word: 0b
Generated Word: 0c
Generated Word: 0d
Generated Word: 1a


In [23]:
productions = [
    "ROOT -> WORD [1.0]",
    "WORD -> P1 P2 [0.25]",
    "WORD -> P1 P2 [0.25]",
    "WORD -> P1 P2 P3 [0.25]",
    "WORD -> P1 P2 P3 P4 [0.25]",
    "P1 -> 'A' [1.0]",
    "P2 -> 'B' [0.5]",
    "P2 -> 'C' [0.5]",
    "P3 -> 'D' [0.3]",
    "P3 -> 'E' [0.3]",
    "P3 -> 'F' [0.4]",
    "P4 -> 'G' [0.9]",
    "P4 -> 'H' [0.1]"
]

In [24]:
grammar_string = "\n".join(productions)

In [25]:
grammar = nltk.PCFG.fromstring(grammar_string)
print(grammar)

Grammar with 13 productions (start state = ROOT)
    ROOT -> WORD [1.0]
    WORD -> P1 P2 [0.25]
    WORD -> P1 P2 [0.25]
    WORD -> P1 P2 P3 [0.25]
    WORD -> P1 P2 P3 P4 [0.25]
    P1 -> 'A' [1.0]
    P2 -> 'B' [0.5]
    P2 -> 'C' [0.5]
    P3 -> 'D' [0.3]
    P3 -> 'E' [0.3]
    P3 -> 'F' [0.4]
    P4 -> 'G' [0.9]
    P4 -> 'H' [0.1]


In [26]:
for sentence in generate(grammar, n= 10, depth= 5):
    palindrom = "".join(sentence).replace(" ", "")
    print("String: {}, Size: {}".format(palindrom, len(palindrom)))

String: AB, Size: 2
String: AC, Size: 2
String: AB, Size: 2
String: AC, Size: 2
String: ABD, Size: 3
String: ABE, Size: 3
String: ABF, Size: 3
String: ACD, Size: 3
String: ACE, Size: 3
String: ACF, Size: 3
