In [1]:
# Load all items from NLTK book module
from nltk.book import *



*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
# Exploring brown corpus
from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [3]:
# extracting a list of words from the review category of the Brown Corpus 
review_words = brown.words(categories='reviews')
review_words

['It', 'is', 'not', 'news', 'that', 'Nathan', ...]

In [4]:
# find the length of the list
len(review_words)

40704

# Text Extration and Preprocessing

### Tokenization: 
It the process of removing sensative data and placing unique symbols of identification in it's place to retain all the essential information

### N-grams:
N-grams are a fundamental concept in natural language processing and computational linguistics. An n-gram is a contiguous sequence of n items from a given sample of text or speech. The items can be phonemes, syllables, letters, words, or base pairs according to the application. 

N-grams are used in various applications including text mining, spell checking, speech recognition, and statistical machine learning models for language.

### Here's a breakdown of how n-grams work:

- Unigrams (1-grams): These are single units (like words or characters) in the text. 
##### For example, in the sentence "The cat sat on the mat," unigrams would include 'The', 'cat', 'sat', 'on', 'the', 'mat'.

- Bigrams (2-grams): These are sequences of two contiguous items. 
##### In the same sentence, bigrams would include 'The cat', 'cat sat', 'sat on', 'on the', 'the mat'.

- Trigrams (3-grams): These are sequences of three contiguous items. 
##### In our example, trigrams would be 'The cat sat', 'cat sat on', 'sat on the', 'on the mat'.

- Higher-order n-grams: Similarly, you can have 4-grams, 5-grams, etc., which involve longer sequences of words or characters.

In [5]:
from nltk.util import ngrams
from nltk import word_tokenize

# Example text
text = "The quick brown fox jumps over the lazy dog"

# Tokenize the text into words
tokens = word_tokenize(text)

# Generate bigrams,
bigrams = list(ngrams(tokens, 2))
bigrams

[('The', 'quick'),
 ('quick', 'brown'),
 ('brown', 'fox'),
 ('fox', 'jumps'),
 ('jumps', 'over'),
 ('over', 'the'),
 ('the', 'lazy'),
 ('lazy', 'dog')]

In [6]:
# Generate bigrams, trigrams, and 4-grams
trigrams = list(ngrams(tokens, 3))
trigrams

[('The', 'quick', 'brown'),
 ('quick', 'brown', 'fox'),
 ('brown', 'fox', 'jumps'),
 ('fox', 'jumps', 'over'),
 ('jumps', 'over', 'the'),
 ('over', 'the', 'lazy'),
 ('the', 'lazy', 'dog')]

In [7]:
fourgrams = list(ngrams(tokens, 4))
fourgrams

[('The', 'quick', 'brown', 'fox'),
 ('quick', 'brown', 'fox', 'jumps'),
 ('brown', 'fox', 'jumps', 'over'),
 ('fox', 'jumps', 'over', 'the'),
 ('jumps', 'over', 'the', 'lazy'),
 ('over', 'the', 'lazy', 'dog')]

### Stop word removal: 
It is a common preprocessing step in natural language processing (NLP) and text mining. "Stop words" are words that are filtered out before or after processing text. 

They are usually common words in a language (such as "the", "is", "in", "on", etc.) that do not add much meaning to a sentence and are thus omitted from the analysis to reduce noise and computational cost.

In [8]:
import nltk
from nltk.corpus import stopwords

# Set of English stop words
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [9]:
# After stop word removal we can see:
from nltk.tokenize import word_tokenize

# Example sentence
text = "This is an example showing off stop word filtration."

# Tokenize the text
word_tokens = word_tokenize(text)

# Filter out the stop words
filtered_sentence = [word for word in word_tokens if not word.lower() in stop_words]

print(filtered_sentence)

['example', 'showing', 'stop', 'word', 'filtration', '.']


### Stemming: 

It is a process in natural language processing and text mining that involves reducing words to their word stem, base or root form. 

Stemming is often used in search engines, text mining, and information retrieval to improve query matching. 

By reducing a word to its stem, different forms of the same word (like "running", "ran", "runs") are treated as the same, which can simplify text processing.

### Popular Stemming Algorithms:
- Porter Stemmer: One of the most widely used and oldest stemmers, developed by Martin Porter in 1980. It's known for its simplicity and speed.

- Lancaster Stemmer: Developed at Lancaster University, it is more aggressive than the Porter stemmer. It iterates over the word more times and has more reduction rules.

- Snowball Stemmer: Also known as the "Porter2" stemmer, it's a slightly improved version of the Porter stemmer and part of a framework of stemming algorithms.

In [10]:
from nltk.stem import PorterStemmer

# Example text
text = "The runner likes running and runs fast"

# Tokenize the text
tokens = word_tokenize(text)

# Create a stemmer
porter = PorterStemmer()

# Stem each word in the tokenized text
stemmed_words = [porter.stem(word) for word in tokens]

print(stemmed_words)

['the', 'runner', 'like', 'run', 'and', 'run', 'fast']


### Lemmatization: 

This is the method of grouping the various inflected of word so that they can be analyzed as one item. It uses a vocabulary list and morphological analysis ( POS of a word) to get the root word.

In [11]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize('feet'))
print(lemmatizer.lemmatize('cacti'))
print(lemmatizer.lemmatize('geese'))

foot
cactus
goose


In [12]:
# without a POS tag, lemmatizer assumes everything as a noun
print(lemmatizer.lemmatize('loving'))
print(lemmatizer.lemmatize('loving', 'v')) #with POS tag

loving
love


### Part of Speech(POS) Tagging

Part-of-Speech (POS) tagging is an important process in natural language processing (NLP) where each word in a sentence is assigned a part-of-speech tag, such as noun, verb, adjective, adverb, etc. POS tagging is a fundamental step in text analysis and helps in understanding the grammar and structure of sentences, which is crucial for various NLP tasks like parsing, named entity recognition, question answering, and machine translation.

### Types of POS Tags:

- Noun (NN): Names of things, places, people, etc.
- Verb (VB): Actions or states of being.
- Adjective (JJ): Describes or modifies a noun.
- Adverb (RB): Modifies a verb, an adjective, or another adverb.
- Pronoun (PRP): Stands in for a noun or noun phrase.
- Preposition (IN): Shows relationships between nouns (or pronouns) and other words.
- Article (DT): The articles of a sentence
- VBZ: This tag denotes a verb that is in the third person singular present tense.

### Applications : POS tagging is widely used in:

- Syntax Parsing: Understanding the grammatical structure of sentences.
- Word Sense Disambiguation: Determining the meaning of a word based on context.
- Information Retrieval: Enhancing search algorithms.
- Machine Translation: Translating text from one language to another.

### Challenges:
- Ambiguity: A word may have multiple POS tags depending on the context (e.g., "book" can be a noun or a verb).
- Language Complexity: Different languages have different grammatical rules and structures, making POS tagging more challenging.

In [13]:
# Example text
text = "The quick brown fox jumps over the lazy dog"

# Tokenize the text
tokens = word_tokenize(text)

# POS Tagging
tagged = nltk.pos_tag(tokens)

print(tagged)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


## Name Entity Recognition ( NER )

Named Entity Recognition (NER) is a subtask of information extraction in natural language processing (NLP) that involves identifying and classifying named entities in text into predefined categories. Named entities are real-world objects that have a name, such as people, organizations, locations, dates, products, etc. The purpose of NER is to assign these entities to categories such as PERSON, ORGANIZATION, LOCATION, DATE, TIME, PRODUCT, etc.

### Techniques Used in NER
- Rule-Based Approaches: Use hand-crafted linguistic rules. For example, a rule might say that anything that looks like a proper name and is preceded by a title (like Mr., Mrs., Dr., etc.) should be classified as a PERSON.

- Statistical Approaches: Use algorithms like Hidden Markov Models (HMMs), Support Vector Machines (SVMs), or Conditional Random Fields (CRFs). These models are trained on large datasets of annotated text.

- Deep Learning Approaches: Employ neural network models, which can learn complex patterns from large amounts of data. Models like LSTM (Long Short-Term Memory), BiLSTM (Bidirectional LSTM), or BERT (Bidirectional Encoder Representations from Transformers) are popular.

### Applications of NER
- Information Retrieval: Improves the search for specific information in large datasets.
- Content Classification: Helps in categorizing content and understanding the main topics in text.
- Customer Support: Automates the extraction of relevant information from customer inquiries.
- Sentiment Analysis: Identifies entities in text to understand sentiments towards specific products, services, or brands.
- Machine Translation: Improves the accuracy of translation by understanding the role of named entities in sentences

In [15]:
# Importing necessary libraries from NLTK
import nltk

# Sample text for Named Entity Recognition
doc = """Google is an American multinational technology company that specializes in
internet-related services and products, which include online advertising technologies,
search engine, cloud computing, and hardware. It was founded in 1998 by Larry Page and Sergey Brin
while they were Ph.D. students at Stanford University in California."""

# Tokenize the doc into sentences
tokenized_doc = sent_tokenize(doc)

# Tokenize each sentence into words and then POS-tagging
tagged_sentences = [pos_tag(word_tokenize(sentence)) for sentence in tokenized_doc]

# Chunk sentences to NE (Named Entity) chunks
ne_chunked_sents = [ne_chunk(tagged) for tagged in tagged_sentences]

# Extract all named entities
named_entities = []

for ne_tagged_sentence in ne_chunked_sents:
    for tagged_tree in ne_tagged_sentence:
        # Extracting NEs here
        if hasattr(tagged_tree, 'label'):
            entity_name = ' '.join(c[0] for c in tagged_tree.leaves())  # Get the NE name
            entity_type = tagged_tree.label()  # Get the NE label
            named_entities.append((entity_name, entity_type))

named_entities

NameError: name 'sent_tokenize' is not defined

### Working with Brown Corpus 