## Using NLTK

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the tokenizer data that word_tokenize / sent_tokenize load at call time.
nltk.download('punkt_tab')

# Sample text reused by the following cells.
text = "Hello world! NLP is amazing. Let's tokenize this sentence."

# Split into word-level tokens (punctuation and clitics such as "'s" become
# separate tokens), then into sentences.
word_tokens = word_tokenize(text)
sentence_tokens = sent_tokenize(text)

print("Word Tokens:", word_tokens)
print("Sentence Tokens:", sentence_tokens)

Word Tokens: ['Hello', 'world', '!', 'NLP', 'is', 'amazing', '.', 'Let', "'s", 'tokenize', 'this', 'sentence', '.']
Sentence Tokens: ['Hello world!', 'NLP is amazing.', "Let's tokenize this sentence."]


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Using spaCy

In [None]:
import spacy

# Load the small English pipeline and run it over the sample text.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# The processed document yields tokens when iterated and sentence spans
# through `doc.sents`.
print("Word Tokens:", [tok.text for tok in doc])
print("Sentence Tokens:", [span.text for span in doc.sents])

Word Tokens: ['Hello', 'world', '!', 'NLP', 'is', 'amazing', '.', 'Let', "'s", 'tokenize', 'this', 'sentence', '.']
Sentence Tokens: ['Hello world!', 'NLP is amazing.', "Let's tokenize this sentence."]


## NLP Tasks

### Basic Word Count & Frequency Distribution

In [None]:
from collections import Counter

# Lowercase before tokenizing so differently-cased forms share one count.
# Punctuation tokens are kept, so they are included in the totals below.
words = word_tokenize(text.lower())

word_freq = Counter()
word_freq.update(words)

print("Total Word Count:", len(words))
print("Word Frequency Distribution:", word_freq.most_common(5))

Total Word Count: 13
Word Frequency Distribution: [('.', 2), ('hello', 1), ('world', 1), ('!', 1), ('nlp', 1)]


### Stopwords removal with frequency analysis

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))

# Keep every token that is not an English stop word. Punctuation tokens are
# not stop words, so they remain in the filtered list.
filtered_words = list(filter(lambda tok: tok not in stop_words, words))
filtered_freq = Counter(filtered_words)

print("Filtered Word Frequency Distribution:", filtered_freq.most_common(5))

Filtered Word Frequency Distribution: [('.', 2), ('hello', 1), ('world', 1), ('!', 1), ('nlp', 1)]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### NER with Context Analysis

In [None]:
import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")  # Load spaCy model
# NOTE: this rebinds `text` and `doc`; later cells that read them (TF-IDF,
# POS tagging, summarization) operate on this sentence, not the first example.
text = "Apple is looking at buying U.K. startup for $1 billion."
doc = nlp(text)  # Process the text

# Count how often each entity surface form occurs in the document.
entity_freq = Counter(ent.text for ent in doc.ents)

# Take each label from the entity span recognised in context. Re-running the
# pipeline on the bare entity string and reading its first token costs an
# extra model pass per entity and can yield a different or empty label.
entity_labels = {}
for ent in doc.ents:
    entity_labels.setdefault(ent.text, ent.label_)

for ent, freq in entity_freq.most_common():
    print(f"Entity: {ent}, Count: {freq}, Label: {entity_labels[ent]}")

Entity: Apple, Count: 1, Label: ORG
Entity: U.K., Count: 1, Label: GPE
Entity: $1 billion, Count: 1, Label: MONEY


### N-Grams with frequency analysis

In [None]:
from nltk.util import ngrams

# `words` is the lowercased token list built in the word-count cell, so when
# the notebook runs top to bottom these n-grams describe the first example
# text rather than the sentence assigned to `text` in the NER cell.
bigrams, trigrams = (list(ngrams(words, size)) for size in (2, 3))

bigram_freq = Counter(bigrams)
trigram_freq = Counter(trigrams)

print("Most Common Bigrams:", bigram_freq.most_common(5))
print("Most Common Trigrams:", trigram_freq.most_common(5))

Most Common Bigrams: [(('hello', 'world'), 1), (('world', '!'), 1), (('!', 'nlp'), 1), (('nlp', 'is'), 1), (('is', 'amazing'), 1)]
Most Common Trigrams: [(('hello', 'world', '!'), 1), (('world', '!', 'nlp'), 1), (('!', 'nlp', 'is'), 1), (('nlp', 'is', 'amazing'), 1), (('is', 'amazing', '.'), 1)]


### Keyword Extraction using TF-IDF with Tokenization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on a one-document corpus made of the current `text`.
# NOTE(review): with a single document the IDF term is identical for every
# word, so the weights reflect term frequency only. Ties keep the vocabulary
# (alphabetical) order, which is why the "top" keywords can look arbitrary.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([text])

feature_array = vectorizer.get_feature_names_out()
importance = X.toarray().flatten()

# Pair each vocabulary term with its weight and keep the five highest. The
# sort is stable, so equally weighted terms stay in vocabulary order.
scored_terms = zip(feature_array, importance)
important_words = sorted(scored_terms, key=lambda pair: pair[1], reverse=True)[:5]
print("Top Keywords:", [term for term, _weight in important_words])

Top Keywords: ['apple', 'at', 'billion', 'buying', 'for']


### POS Tagging

In [None]:
# Print the coarse part-of-speech tag assigned to every token of the
# current `doc`.
for tok in doc:
    surface, pos_tag = tok.text, tok.pos_
    print(f"Token: {surface}, POS: {pos_tag}")

Token: Apple, POS: PROPN
Token: is, POS: AUX
Token: looking, POS: VERB
Token: at, POS: ADP
Token: buying, POS: VERB
Token: U.K., POS: PROPN
Token: startup, POS: VERB
Token: for, POS: ADP
Token: $, POS: SYM
Token: 1, POS: NUM
Token: billion, POS: NUM
Token: ., POS: PUNCT


### Sentence Similarity

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

sentences = ["I love NLP", "NLP is great for text processing", "Machine learning is amazing"]

# Bag-of-words counts: one row per sentence, one column per vocabulary term.
# The default token pattern ignores single-character words such as "I".
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences)

# Entry (i, j) is the dot product of the count vectors of sentences i and j:
# the number of shared token occurrences, with squared vector norms on the
# diagonal. These are raw overlaps, not normalised cosine similarities.
# Use `@` for the matrix product: `*` multiplies element-wise on SciPy sparse
# arrays and is a matrix product only for the legacy sparse matrix classes.
print("Sentence Similarity Matrix:")
print((X @ X.T).toarray())

Sentence Similarity Matrix:
[[2 1 0]
 [1 6 1]
 [0 1 4]]


### Text Summarization

In [None]:
# Frequency-based extractive summary of the current `doc`.
#
# Term frequencies are computed from `doc` itself. Scoring against
# `filtered_freq` from the stop-word cell would use counts from a different
# text, where the only token in common with this sentence is ".".
content_freq = Counter(
    tok.text.lower()
    for tok in doc
    if not (tok.is_stop or tok.is_punct or tok.is_space)
)

# A sentence's score is the summed document frequency of its content words.
# Sentences without any content word are left out, as before.
sentence_scores = {}
for sent in doc.sents:
    score = sum(
        content_freq[tok.text.lower()]
        for tok in sent
        if not (tok.is_stop or tok.is_punct or tok.is_space)
    )
    if score:
        sentence_scores[sent.text] = sentence_scores.get(sent.text, 0) + score

# Highest-scoring sentences first; the summary is the top two joined together.
sorted_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
summary = " ".join(sorted_sentences[:2])
print("Summary:", summary)

Summary: Apple is looking at buying U.K. startup for $1 billion.


In [None]:
import spacy
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Download the tokenizer data used by word_tokenize. 'punkt_tab' is the
# resource the first cell of this notebook fetches and the one recent NLTK
# releases load; the older pickled 'punkt' package does not satisfy them.
nltk.download('punkt_tab')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Input text
text = """Natural Language Processing (NLP) is a field of AI that enables machines to understand human language.
It is used in chatbots, sentiment analysis, and language translation.
One of the key challenges in NLP is understanding context and ambiguity in sentences.
Deep learning models like transformers have significantly improved NLP applications."""

# Process text with spaCy (used for sentence segmentation below)
doc = nlp(text)

# Lowercased alphanumeric tokens of the whole text and their frequencies.
# NOTE(review): stop words are not removed here, so frequent function words
# contribute to sentence scores; filter them if that is not intended.
words = [word.lower() for word in word_tokenize(text) if word.isalnum()]
filtered_freq = Counter(words)

# Score each sentence by summing the document-level frequency of its tokens.
# Tokens absent from `filtered_freq` (e.g. punctuation) contribute nothing.
sentence_scores = {}
for sent in doc.sents:
    # Collapse line-break whitespace so sentences cut from the multi-line
    # string read cleanly when joined into the summary.
    sentence = " ".join(sent.text.split())
    for word in word_tokenize(sentence.lower()):
        if word in filtered_freq:
            sentence_scores[sentence] = sentence_scores.get(sentence, 0) + filtered_freq[word]

# Sort sentences by score, highest first
sorted_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)

# Select top-ranked sentences for summary
summary = " ".join(sorted_sentences[:2])

# Print summary
print("Summary:", summary)
