In [1]:
import nltk

# Tokenization

In [2]:
# NLTK Word Tokenization
# Splits a short Somali sample text into tokens. The bare last expression is
# what the notebook displays; in the recorded output each '.' is its own token.

from nltk.tokenize import word_tokenize

text = "Waxaan ahay Arday. Waxaan dhigtaa Jaamacadda."
word_tokenize(text)

['Waxaan', 'ahay', 'Arday', '.', 'Waxaan', 'dhigtaa', 'Jaamacadda', '.']

In [3]:
# NLTK Sentence Tokenization
# Splits a Somali sample text into sentences. Note that `text` is rebound here,
# replacing the value assigned in the word-tokenization cell.

from nltk.tokenize import sent_tokenize

text = "Waxaan ahay arday. Waxaan dhigtaa Jaamacadda. Waxaan daganahay Xaafadda Xawaadle"
sent_tokenize(text)

['Waxaan ahay arday.',
 'Waxaan dhigtaa Jaamacadda.',
 'Waxaan daganahay Xaafadda Xawaadle']

# Stemming & Lemmatization

In [4]:
# Stemming in NLTK
# Reduces each English word to its Porter stem (e.g. 'definitely' -> 'definit').

from nltk.stem.porter import PorterStemmer

# `words` is reused by the lemmatization cell, so it must be defined first.
words = ['first', 'time', 'sees', 'seconds', 'looking', 'look', 'definitely', 'watches', 'change', 'ones', 'started']

# Build the stemmer once instead of constructing a new one for every word.
porter_stemmer = PorterStemmer()
stemmed_words = [porter_stemmer.stem(w) for w in words]
print(stemmed_words)

['first', 'time', 'see', 'second', 'look', 'look', 'definit', 'watch', 'chang', 'one', 'start']


In [5]:
# Lemmatization in NLTK
# Depends on `words` defined in the stemming cell; run that cell first.

from nltk.stem.wordnet import WordNetLemmatizer

# Build the lemmatizer once instead of constructing a new one for every word.
# lemmatize() is called without a part-of-speech argument, so words are treated
# as nouns by default; that is why verb forms such as 'looking' and 'started'
# come back unchanged in the output.
wordnet_lemmatizer = WordNetLemmatizer()
lemmed_words = [wordnet_lemmatizer.lemmatize(w) for w in words]
print(lemmed_words)

['first', 'time', 'see', 'second', 'looking', 'look', 'definitely', 'watch', 'change', 'one', 'started']


# Context-Free Grammar

In [3]:
# Context-Free Grammar
# Toy grammar for the phrase "smart girls and boys". The phrase is deliberately
# ambiguous under these rules:
#   S -> NP Conj N  with NP -> Adj N      gives  [smart girls] and boys
#   S -> Adj NP     with NP -> N Conj N   gives  smart [girls and boys]
# `grammar` and `sent` are consumed by the parser cell that follows.

grammar = nltk.CFG.fromstring("""

    S -> NP Conj N | Adj NP
    NP -> N Conj N | Adj N
    Adj -> 'smart'
    N -> 'girls' | 'boys'
    Conj -> 'and'

""")
# Whitespace-split the phrase into the token list the parser expects.
sent = "smart girls and boys".split()
sent

['smart', 'girls', 'and', 'boys']

In [6]:
# Parse the sample phrase with a top-down recursive-descent parser and draw
# every parse tree the grammar allows.
parser = nltk.RecursiveDescentParser(grammar)

# Parse once and keep the trees. The original built an unused parse iterator
# in `parsed_sent` and then parsed the sentence a second time for the loop.
parsed_sent = list(parser.parse(sent))
for tree in parsed_sent:
    # draw() opens a GUI window for the tree, so it needs a display.
    tree.draw()

In [8]:
# Probabilistic version of the toy grammar: the same rules as the CFG above,
# each annotated with a probability in square brackets. For every left-hand
# side the probabilities of its alternatives add up to 1
# (S: 0.6 + 0.4, NP: 0.5 + 0.5, N: 0.5 + 0.5, Adj: 1, Conj: 1).
# `mygrammar` and `mysent` are consumed by the Viterbi parser cell.
mygrammar = nltk.PCFG.fromstring("""

    S -> NP Conj N [0.6]
    S -> Adj NP [0.4]
    NP -> N Conj N [0.5]
    NP -> Adj N [0.5]
    Adj -> 'smart' [1]
    N -> 'girls' [0.5]
    N -> 'boys' [0.5]
    Conj -> 'and' [1]

""")
# Whitespace-split the phrase into the token list the parser expects.
mysent = "smart girls and boys".split()
mysent

['smart', 'girls', 'and', 'boys']

In [5]:
# Parse the sample phrase with the probabilistic grammar. This cell needs
# `mygrammar` and `mysent`, so the PCFG cell must be run first.
myparser = nltk.ViterbiParser(mygrammar)

# Use `myparser` here. The original called `parser.parse(mysent)`, i.e. the
# non-probabilistic parser from another cell, and then discarded the result.
myparsed_sent = list(myparser.parse(mysent))
for tree in myparsed_sent:
    # draw() opens a GUI window for the tree, so it needs a display.
    tree.draw()

NameError: name 'mygrammar' is not defined

In [9]:
import nltk

                         ##############################################
                         #    J = Jumlad       = Sentence             #
                         #    M = Magac        = Noun                 #
                         #    Y = Magac u yaal = Pronoun              #
                         #    S = Sifo         = Adjective            #
                         #    F = Ficil        = Verb                 #
                         #    E = Meeleeye     = Preposition          #
                         #    Q = Qodob        = Determiner (Article) #
                         #    W = Weedh        = Phrase               #
                         # 1. iyadu waxay aragtay suuqa weyn          #
                         # 2. isagu wuxuu ku socday gaadhiga yar      #
                         # 3. annigu suuqa waan tagay                 #
                         # 4. iyadu wey socotay                       #
                         # 5. casho da kahor annigu waan seexday      #
                         ##############################################

# Context-free grammar for a small fragment of Somali.
# J (sentence) is the start symbol; MW, EW, FW and SW are phrase-level symbols
# built on M, E, F and S, and the quoted words are the lexicon.
# SW is defined but is not referenced by any other rule in this grammar.
# NOTE: this rebinds `grammar` (and `parser` below), which earlier cells also
# define for the English toy grammar; later cells see these Somali versions.
grammar = nltk.CFG.fromstring("""
    J -> MW FW | EW FW | MW MW EW FW | EW EW FW | MW Q EW FW 

    SW -> S | S SW
    MW -> M | M Q | M S | EW M
    EW -> E | E MW
    FW -> F | E F | E E F M S | E F M S 

    S -> "weyn" | "cagaar" | "yar" | "qalayl" | "balaadhan"
    Q -> "ka" | "ga" | "ha" | "a" | "ta" | "da" | "sha"
    M -> "iyadu" | "annigu" | "suuqa" | "gaadhiga" | "wado" | "ey" | "isagu" | "casho"
    E -> "waxay" | "wuxuu" | "wey" | "kadib" | "kahor" | "hoose" | "la" | "waan" | "wuu" | "ku"
    F -> "aragtay" | "socday" | "maqashay" | "tagay" | "seexday"
""")

# Chart parser over the Somali grammar; used by the interactive code below.
parser = nltk.ChartParser(grammar)

# Ask the user for a sentence and split it into whitespace-separated tokens.
sentence = input("Geli Jumladda aad doonayso in aad ogaato: ").split()

try:
    # Only the first parse is shown. `None` marks "the grammar produced no
    # tree", which the original loop silently ignored.
    tree = next(iter(parser.parse(sentence)), None)
except ValueError:
    # parse() raises ValueError when a word is missing from the grammar's
    # lexicon (NLTK's coverage check).
    tree = None

if tree is None:
    print("No parse tree possible.")
else:
    tree.pretty_print()
    # draw() opens a GUI window for the tree, so it needs a display.
    tree.draw()

Geli Jumladda aad doonayso in aad ogaato: casho da kahor annigu waan seexday
            J                              
   _________|____________________           
  |    |         EW              |         
  |    |     ____|____           |          
  MW   |    |         MW         FW        
  |    |    |         |      ____|_____     
  M    Q    E         M     E          F   
  |    |    |         |     |          |    
casho  da kahor     annigu waan     seexday



# POS Tagging

In [7]:
# NLTK Tagger
# Tokenize an English sentence, then label every token with its part-of-speech
# tag; the tagged list is the cell's displayed result.

text = "This is a text to test part of speech tagging in NLTK"
token = nltk.word_tokenize(text)
nltk.pos_tag(token)

[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('text', 'NN'),
 ('to', 'TO'),
 ('test', 'VB'),
 ('part', 'NN'),
 ('of', 'IN'),
 ('speech', 'NN'),
 ('tagging', 'NN'),
 ('in', 'IN'),
 ('NLTK', 'NNP')]

In [9]:
# Default Tagger
# Baseline tagging for a Somali sentence: DefaultTagger gives every token the
# same tag ('NN'), as the recorded output shows.

text1 = " Annigu shalay waxaan tagay Burco oo waxaan booqday saaxiibbaday "
token1 = nltk.word_tokenize(text1)

som_default_tagger = nltk.DefaultTagger('NN')
# The original, misspelled name is kept as an alias so any cell that still
# refers to it keeps working.
som_defalut_tagger = som_default_tagger
som_default_tagger.tag(token1)

[('Annigu', 'NN'),
 ('shalay', 'NN'),
 ('waxaan', 'NN'),
 ('tagay', 'NN'),
 ('Burco', 'NN'),
 ('oo', 'NN'),
 ('waxaan', 'NN'),
 ('booqday', 'NN'),
 ('saaxiibbaday', 'NN')]

# NLTK N-grams

In [14]:
# Unigram
# Emit every single token of the sentence as a 1-tuple.

from nltk.util import ngrams

n = 1
sentence = 'You will face many defeats in life, but never let yourself be defeated.'

# Tokens come from a plain whitespace split, so punctuation stays attached
# to the neighbouring word (e.g. 'life,').
unigrams = ngrams(sentence.split(), n)
for gram in unigrams:
    print(gram)

('You',)
('will',)
('face',)
('many',)
('defeats',)
('in',)
('life,',)
('but',)
('never',)
('let',)
('yourself',)
('be',)
('defeated.',)


In [15]:
# Bigram
# Emit every pair of adjacent tokens in the sentence.

from nltk.util import ngrams

n = 2
sentence = 'The purpose of our life is to happy'

# Tokens come from a plain whitespace split of the sentence.
bigrams = ngrams(sentence.split(), n)
for gram in bigrams:
    print(gram)

('The', 'purpose')
('purpose', 'of')
('of', 'our')
('our', 'life')
('life', 'is')
('is', 'to')
('to', 'happy')


In [16]:
# Trigram
# Emit every run of three adjacent tokens in the sentence.

from nltk.util import ngrams

n = 3
sentence = 'Whoever is happy will make others happy too'

# Tokens come from a plain whitespace split of the sentence.
trigrams = ngrams(sentence.split(), n)
for gram in trigrams:
    print(gram)

('Whoever', 'is', 'happy')
('is', 'happy', 'will')
('happy', 'will', 'make')
('will', 'make', 'others')
('make', 'others', 'happy')
('others', 'happy', 'too')


In [11]:
from collections import Counter

import math
import nltk
import os
import sys


def main(directory='Sheekooyin', n=1, top=10):
    """Print the most frequent word n-grams in a directory of text files.

    Args:
        directory: Folder whose files form the corpus. Defaults to
            'Sheekooyin', the folder the original code hard-coded.
        n: Size of the n-grams to count (1 counts single words).
        top: Number of most common n-grams to print.

    Prints "Loading data..." followed by one "<frequency>: <ngram tuple>"
    line per n-gram, most frequent first.
    """

    print("Loading data...")

    corpus = load_data(directory)

    # Count n-grams over the token list. The counter gets its own name so it
    # does not shadow the `ngrams` function imported elsewhere in the notebook.
    ngram_counts = Counter(nltk.ngrams(corpus, n))

    # Print most common n-grams
    for ngram, freq in ngram_counts.most_common(top):
        print(f"{freq}: {ngram}")


def load_data(directory, encoding="utf-8"):
    """Return the lower-cased word tokens of every regular file in `directory`.

    Args:
        directory: Path of the folder holding the corpus files.
        encoding: Text encoding used to read the files. It is explicit so the
            result does not depend on the platform's default encoding.

    Returns:
        A flat list of lower-cased tokens, keeping only tokens that contain at
        least one alphabetic character. Files are read in sorted name order so
        the token sequence is the same on every run.
    """
    contents = []

    # Read all files and extract words
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints); passing
        # one to open() would raise an error.
        if not os.path.isfile(path):
            continue
        with open(path, encoding=encoding) as f:
            contents.extend(
                word.lower()
                for word in nltk.word_tokenize(f.read())
                if any(c.isalpha() for c in word)
            )
    return contents


# Run the analysis when this code is executed directly (as a notebook cell or
# a script) rather than imported as a module.
if __name__ == "__main__":
    main()

Loading data...
136: ('oo',)
105: ('ku',)
90: ('uu',)
67: ('ka',)
66: ('soo',)
63: ('u',)
41: ('yidhi',)
41: ('ayuu',)
39: ('ee',)
38: ('dheeg',)
