# Text Corpora & Preprocessing

In [None]:
import nltk
from nltk.corpus import gutenberg

# Make sure the Gutenberg corpus is available locally before reading from it.
nltk.download('gutenberg')

# Read the raw text of "Emma" and keep only its first 500 characters as a preview.
emma_raw = gutenberg.raw('austen-emma.txt')
sample = emma_raw[:500]
print(sample)

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died t


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [None]:
# Task 1: print the first 300 characters of a different book, then try a
# basic cleanup pass (lowercase + punctuation removal).
import re
from nltk.corpus import gutenberg

macbeth_raw = gutenberg.raw('shakespeare-macbeth.txt')
sample = macbeth_raw[:300]
print(sample)
print('---------------------------------------------------------------------')

# Lowercase first, then delete every character that is neither whitespace
# nor a word character (letters, digits, underscore).
cleaned = sample.lower()
without_punctuation = re.sub(r'[^\s\w]', '', cleaned)
print(without_punctuation)

[The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus. Scoena Prima.

Thunder and Lightning. Enter three Witches.

  1. When shall we three meet againe?
In Thunder, Lightning, or in Raine?
  2. When the Hurley-burley's done,
When the Battaile's lost, and wonne

   3. That will be ere 
---------------------------------------------------------------------
the tragedie of macbeth by william shakespeare 1603


actus primus scoena prima

thunder and lightning enter three witches

  1 when shall we three meet againe
in thunder lightning or in raine
  2 when the hurleyburleys done
when the battailes lost and wonne

   3 that will be ere 


# Bag of Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Two toy documents to vectorize.
docs = [
    "I love NLP",
    "NLP is fun and powerful",
]

# Learn the vocabulary and build the document-term count matrix in one call.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

# Show the learned vocabulary, then the dense count matrix (one row per document).
vocabulary = vectorizer.get_feature_names_out()
counts = X.toarray()
print(vocabulary)
print(counts)

['and' 'fun' 'is' 'love' 'nlp' 'powerful']
[[0 0 0 1 1 0]
 [1 1 1 0 1 1]]


In [None]:
# Task: add a third document and check how the BoW vectors change.
docs = [
    "I love NLP",
    "NLP is fun and powerful",
    "NLP is part of AI",
]

# A fresh vectorizer so the vocabulary is rebuilt from all three documents.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

vocabulary = vectorizer.get_feature_names_out()
counts = X.toarray()
print(vocabulary)
print(counts)


['ai' 'and' 'fun' 'is' 'love' 'nlp' 'of' 'part' 'powerful']
[[0 0 0 0 1 1 0 0 0]
 [0 1 1 1 0 1 0 0 1]
 [1 0 0 1 0 1 1 1 0]]


# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize `docs` (defined in an earlier cell) with TF-IDF weights
# instead of raw counts.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(docs)

feature_names = tfidf.get_feature_names_out()
tfidf_weights = X.toarray()
print(feature_names)
print(tfidf_weights)


['ai' 'and' 'fun' 'is' 'love' 'nlp' 'of' 'part' 'powerful']
[[0.         0.         0.         0.         0.861037   0.50854232
  0.         0.         0.        ]
 [0.         0.50461134 0.50461134 0.38376993 0.         0.29803159
  0.         0.         0.50461134]
 [0.50461134 0.         0.         0.38376993 0.         0.29803159
  0.50461134 0.50461134 0.        ]]


In [None]:
# Task: compare BoW and TF-IDF outputs side by side.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

docs = [
    "I love NLP",
    "NLP is fun and powerful",
    "NLP is part of AI",
]

# Fit both vectorizers on the same documents so the two matrices are
# directly comparable.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(docs)

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(docs)

# Print each representation in the same format: heading, vocabulary, dense matrix.
for heading, fitted_vectorizer, matrix in (
    ("Bag of Words:", vectorizer, X_bow),
    ("\nTF-IDF:", tfidf, X_tfidf),
):
    print(heading)
    print(fitted_vectorizer.get_feature_names_out())
    print(matrix.toarray())

Bag of Words:
['ai' 'and' 'fun' 'is' 'love' 'nlp' 'of' 'part' 'powerful']
[[0 0 0 0 1 1 0 0 0]
 [0 1 1 1 0 1 0 0 1]
 [1 0 0 1 0 1 1 1 0]]

TF-IDF:
['ai' 'and' 'fun' 'is' 'love' 'nlp' 'of' 'part' 'powerful']
[[0.         0.         0.         0.         0.861037   0.50854232
  0.         0.         0.        ]
 [0.         0.50461134 0.50461134 0.38376993 0.         0.29803159
  0.         0.         0.50461134]
 [0.50461134 0.         0.         0.38376993 0.         0.29803159
  0.50461134 0.50461134 0.        ]]


# Word Embeddings (Pre-trained)

In [None]:
pip install gensim



In [None]:
import gensim.downloader as api

# Load the pre-trained "glove-wiki-gigaword-100" vectors through gensim's downloader.
model = api.load("glove-wiki-gigaword-100")

# Embedding vector stored for the word 'king'.
king_vector = model['king']
print(king_vector)

# Similarity score between 'king' and 'queen'.
king_queen_similarity = model.similarity('king', 'queen')
print(king_queen_similarity)


[-0.32307  -0.87616   0.21977   0.25268   0.22976   0.7388   -0.37954
 -0.35307  -0.84369  -1.1113   -0.30266   0.33178  -0.25113   0.30448
 -0.077491 -0.89815   0.092496 -1.1407   -0.58324   0.66869  -0.23122
 -0.95855   0.28262  -0.078848  0.75315   0.26584   0.3422   -0.33949
  0.95608   0.065641  0.45747   0.39835   0.57965   0.39267  -0.21851
  0.58795  -0.55999   0.63368  -0.043983 -0.68731  -0.37841   0.38026
  0.61641  -0.88269  -0.12346  -0.37928  -0.38318   0.23868   0.6685
 -0.43321  -0.11065   0.081723  1.1569    0.78958  -0.21223  -2.3211
 -0.67806   0.44561   0.65707   0.1045    0.46217   0.19912   0.25802
  0.057194  0.53443  -0.43133  -0.34311   0.59789  -0.58417   0.068995
  0.23944  -0.85181   0.30379  -0.34177  -0.25746  -0.031101 -0.16285
  0.45169  -0.91627   0.64521   0.73281  -0.22752   0.30226   0.044801
 -0.83741   0.55006  -0.52506  -1.7357    0.4751   -0.70487   0.056939
 -0.7132    0.089623  0.41394  -1.3363   -0.61915  -0.33089  -0.52881
  0.16483  -0.98878

In [None]:
# Task: try similarity between different word pairs.
# NOTE(review): 'women' is plural while 'man' is singular — confirm that
# 'woman' was not the intended word for the first pair.
word_pairs = (
    ('man', 'women'),
    ('doctor', 'hospital'),
    ('cat', 'dog'),
)
for first_word, second_word in word_pairs:
    print(model.similarity(first_word, second_word))

0.5303662
0.69009304
0.8798075


# Cosine Similarity Between Texts

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): `X` is not defined in this cell; it is whatever matrix the most
# recently executed vectorizer cell assigned (the TF-IDF matrix when the
# notebook is run top to bottom).
first_doc_vector = X[0]
second_doc_vector = X[1]

# The result is a 1x1 array holding the similarity of the two selected rows.
similarity = cosine_similarity(first_doc_vector, second_doc_vector)
print("Cosine similarity between doc 1 and doc 2:", similarity)


Cosine similarity between doc 1 and doc 2: [[0.15156167]]


In [None]:
# Task: try with 3+ documents and find the most similar pair.
# This cell prints the full pairwise matrix; the most similar pair is read
# off the largest off-diagonal entry.
docs = [
    "I love NLP",
    "NLP is fun and powerful",
    "NLP is part of AI",
]

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

# Called with a single matrix, cosine_similarity compares every row with
# every row, giving a square matrix with one row/column per document.
similarity_matrix = cosine_similarity(tfidf_matrix)
print(similarity_matrix)

[[1.         0.15156167 0.15156167]
 [0.15156167 1.         0.23610219]
 [0.15156167 0.23610219 1.        ]]


# Precision, Recall, F1-Score & Confusion Matrix

In [None]:
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Example data: ground-truth labels and the predictions to score against them.
y_true = [1, 0, 1, 1, 0, 1, 0]
y_pred = [1, 0, 1, 0, 0, 1, 1]

# Report the three scalar metrics in a fixed order, then the confusion matrix.
for label, metric in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_true, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


Precision: 0.75
Recall: 0.75
F1 Score: 0.75
Confusion Matrix:
 [[2 1]
 [1 3]]


In [None]:
# Task: change the predictions and observe how the metrics change.
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Same ground truth as the example; only the predictions differ.
y_true = [1, 0, 1, 1, 0, 1, 0]
y_pred = [1, 0, 1, 1, 0, 0, 0]

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
matrix = confusion_matrix(y_true, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", matrix)


Precision: 1.0
Recall: 0.75
F1 Score: 0.8571428571428571
Confusion Matrix:
 [[3 0]
 [1 3]]


# Word Co-occurrence (Intro)

In [None]:
import nltk

# Download the 'punkt' tokenizer data. The call is left as the cell's last
# expression, so its return value is displayed as the cell output.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from collections import Counter
import nltk
from nltk.corpus import reuters
from nltk.tokenize import TreebankWordTokenizer

nltk.download('reuters')

# Use Treebank tokenizer (no 'punkt' needed)
tokenizer = TreebankWordTokenizer()

# Load the first 1000 characters of one Reuters article and tokenize them.
text = reuters.raw(fileids=['test/14826'])[:1000]
tokens = tokenizer.tokenize(text)

# Build co-occurrence pairs: each token (the "head") is paired with up to
# `window_size` tokens that follow it.
window_size = 2

# Every token is visited as a head. Slicing clamps at the end of the list, so
# the last tokens still pair with whatever follows them instead of being
# skipped (the previous range(len(tokens) - window_size) loop dropped those
# trailing pairs and produced nothing when len(tokens) <= window_size).
pairs = [
    (head, neighbour)
    for position, head in enumerate(tokens)
    for neighbour in tokens[position + 1:position + 1 + window_size]
]

# Count and print top word pairs
co_occurrence = Counter(pairs)
print(co_occurrence.most_common(10))


[nltk_data] Downloading package reuters to /root/nltk_data...


[(('in', 'the'), 3), (('the', 'U.S.'), 2), (('the', 'And'), 2), (('U.S.', 'And'), 2), (('that', 'the'), 2), (('on', 'imports'), 2), (('imports', 'of'), 2), ((',', 'in'), 2), (('of', 'tariffs'), 2), (('ASIAN', 'EXPORTERS'), 1)]


In [None]:
# Task: try changing window_size and observe the new top pairs.
# Relies on `tokens` and `Counter` from the previous co-occurrence cell.
window_size = 3

# Pair each token (the "head") with up to `window_size` tokens that follow it.
# Every token is visited as a head. Slicing clamps at the end of the list, so
# the last tokens still pair with whatever follows them instead of being
# skipped (the previous range(len(tokens) - window_size) loop dropped those
# trailing pairs and produced nothing when len(tokens) <= window_size).
pairs = [
    (head, neighbour)
    for position, head in enumerate(tokens)
    for neighbour in tokens[position + 1:position + 1 + window_size]
]

# Count and print top word pairs
co_occurrence = Counter(pairs)
print(co_occurrence.most_common(10))



[(('in', 'the'), 3), (('the', 'U.S.'), 2), (('the', 'And'), 2), (('U.S.', 'And'), 2), (('U.S.', 'Japan'), 2), (('that', 'the'), 2), (('to', 'on'), 2), (('on', 'imports'), 2), (('on', 'of'), 2), (('imports', 'of'), 2)]
