<a href="https://colab.research.google.com/github/GianFederico/MD-repo-Natural_Language_Processing/blob/main/NLP_lab2_NLTK_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import nltk
import re
import string
from nltk.tokenize import sent_tokenize, word_tokenize


# split the input on anything other than a word character
def onlywords(text):
    cleaned_tokens = re.split(r'\W+', text)
    return cleaned_tokens

# split on whitespace and then remove punct
def wordsnopunct(text):
    tokens=text.split()
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    return stripped

def wordmatch(text):
    cleaned_tokens = re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", text)
    return cleaned_tokens

def NLTKTokenize(text):
    nltk_words = word_tokenize(text)
    return nltk_words

def NLTKregtokenize(text):
    pattern = r''' (?x) (?:[A-Z]\.)+  |  \w+(?:-\w+)*  |  \$?\d+(?:\.\d+)?%?  |  \.\.\.  |  [][.,;"'?():-_`] '''
    tokens=nltk.regexp_tokenize(text, pattern)
    return tokens

def onlypunct(text):
    waste=re.findall(r"[!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]+", text)
    return waste



nltk.download('inaugural')
nltk.download('punkt')
from nltk.corpus import inaugural

text=inaugural.raw("2009-Obama.txt")

# splitting text into tokens & cleaning#
# cleaning =  removing all undesirable content (e.g. puntcuation)

print ("PUNCTUATION: ", string.punctuation)

#1: only words with regex
print ("__________")
cleaned_tokens1=onlywords(text)
print ("STRATEGY 1 split the input on anything other than a word character:", sorted(cleaned_tokens1))
print ("# tokens: ", len(cleaned_tokens1))

#2: split on whitespace + remove punct
print ("__________")
cleaned_tokens2=wordsnopunct(text)
print ("STRATEGY 2 # split on whitespace and then remove punct:", sorted(cleaned_tokens2))
print ("# tokens: ", len(cleaned_tokens2))

diff1 = [t for t in cleaned_tokens1 if t not in cleaned_tokens2]
print ("tokens in 1 but not in 2: ", sorted(diff1))
diff2 = [t for t in cleaned_tokens2 if t not in cleaned_tokens1]
print ("tokens in 2 but not in 1", sorted(diff2))
print ("__________")
#3 matching a specific regex

cleaned_tokens3=wordmatch(text)
print ("STRATEGY 3 # matching specific regex:", sorted(cleaned_tokens3))
print ("# tokens: ", len(cleaned_tokens3))
diff31 = [t for t in cleaned_tokens3 if t not in cleaned_tokens1]
print ("tokens in 3 but not in 1: ", sorted(diff31))
diff13 = [t for t in cleaned_tokens1 if t not in cleaned_tokens3]
print ("tokens in 1 but not in 3", sorted(diff13))
print ("__________")
#4: NLTK tokenizer
tokens4=NLTKTokenize(text)
print ("STRATEGY 4 # NLTK Tokenizer:", sorted(tokens4))
print ("# tokens: ", len(tokens4))
diff42 = [t for t in tokens4 if t not in cleaned_tokens2]
print ("tokens in 4 but not in 2", sorted(diff42))
print ("__________")
#5: NLTK tokenizer
tokens5=NLTKregtokenize(text)
print ("STRATEGY 5 # NLTK regex Tokenizer:", sorted(tokens5))
print ("# tokens: ", len(tokens5))
diff54 = [t for t in tokens5 if t not in tokens4]
print ("tokens in 5 but not in 4", sorted(diff54))


print ("____________________________________")
print ("________________or__________________")
print ("____________________________________")
nltk.download('stopwords')
porter = nltk.PorterStemmer()

print ("Punctuation: ",string.punctuation)

#tokens=cleaning.wordsnopunct(text)
tokens=wordmatch(text)
print ("# tokens: ", len(tokens))
print ("tokens: ", sorted(tokens))
print ("__________")

# a little bit of cleaning --> remove specific tokens
waste = onlypunct(text)
cleaned_tokens = [t for t in tokens if not t in waste]
print ("# cleaned tokens: ", len(cleaned_tokens))
print ("CLEANED tokens: ", sorted(cleaned_tokens))
print ("__________")

# filter out stopwords
stop_words = nltk.corpus.stopwords.words('english')
nostop_tokens = [t for t in cleaned_tokens if not t in stop_words]

# normalization
words = [word.lower() for word in nostop_tokens]

# stemming / lemmatization
stemmed = [porter.stem(word) for word in words]

print ("# final stemmed tokens: ", len(stemmed))
print ("FINAL STEMMED tokens: ", sorted(stemmed))
print ("__________")

print ("# vocabulary: ", len(set(stemmed)))
print ("Vocabulary: ", sorted(set(stemmed)))

[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


PUNCTUATION:  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
__________
STRATEGY 1 split the input on anything other than a word character: ['', 'Afghanistan', 'All', 'All', 'America', 'America', 'America', 'America', 'America', 'America', 'America', 'America', 'America', 'America', 'American', 'American', 'Americans', 'Americans', 'Americans', 'And', 'And', 'And', 'And', 'And', 'And', 'Arlington', 'As', 'As', 'At', 'At', 'Bush', 'But', 'But', 'But', 'Christians', 'Concord', 'Domestic', 'Earth', 'Earth', 'Fathers', 'For', 'For', 'For', 'For', 'For', 'For', 'For', 'For', 'Forty', 'Founding', 'Gettysburg', 'God', 'God', 'God', 'God', 'God', 'Gross', 'Guided', 'Hindus', 'Homes', 'I', 'I', 'I', 'In', 'In', 'In', 'Instead', 'Iraq', 'It', 'It', 'It', 'It', 'Its', 'Jews', 'Khe', 'Less', 'Let', 'Let', 'Muslim', 'Muslims', 'My', 'Nor', 'Normandy', 'Now', 'On', 'On', 'Our', 'Our', 'Our', 'Our', 'Our', 'Our', 'Our', 'Our', 'Our', 'People', 'President', 'Product', 'Rather', 'Recall', 'Sahn', 'Scripture', 'So', 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
