## Build and Evaluate a Sentiment Classifier (Naive Bayes + Feature Engineering)

In [None]:
import nltk
from nltk.corpus import movie_reviews
import random

nltk.download('movie_reviews')

# One (token_list, category) pair per review file, grouped by category.
docs = [(list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]

# Shuffle with a dedicated seeded RNG: the positional train/test split that
# follows depends on this order, so an unseeded shuffle makes the reported
# accuracy change on every run. A private Random instance leaves the global
# `random` state untouched.
random.Random(42).shuffle(docs)

def extract_features(words):
    """Return a presence-feature dict that maps every distinct word to True.

    Duplicate words collapse into a single key; key order follows the first
    occurrence of each word in `words`.
    """
    return dict.fromkeys(words, True)

# Turn every (tokens, label) document into a (feature dict, label) pair.
featuresets = [(extract_features(tokens), label) for tokens, label in docs]

# Positional split: the first 1500 documents train the model, the rest are held out.
train_set = featuresets[:1500]
test_set = featuresets[1500:]

classifier = nltk.NaiveBayesClassifier.train(train_set)
held_out_accuracy = nltk.classify.accuracy(classifier, test_set)
print("Accuracy:", held_out_accuracy)
classifier.show_most_informative_features(10)


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Accuracy: 0.666
Most Informative Features
               insulting = True              neg : pos    =     13.3 : 1.0
             outstanding = True              pos : neg    =     13.0 : 1.0
            effortlessly = True              pos : neg    =     11.4 : 1.0
                 freddie = True              neg : pos    =     11.2 : 1.0
               ludicrous = True              neg : pos    =     11.2 : 1.0
                 idiotic = True              neg : pos    =     10.8 : 1.0
                  smooth = True              pos : neg    =     10.8 : 1.0
                  verbal = True              pos : neg    =     10.8 : 1.0
                  prinze = True              neg : pos    =     10.6 : 1.0
              unbearable = True              neg : pos    =     10.6 : 1.0


## N-gram Phrase Extraction with Frequency Filtering

In [None]:
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# Build bigram statistics over every word in the movie_reviews corpus.
words = movie_reviews.words()
finder = BigramCollocationFinder.from_words(words)

# Apply the frequency filter (threshold 20) before ranking by PMI.
finder.apply_freq_filter(20)

top_bigrams = finder.nbest(BigramAssocMeasures.pmi, 10)
print(top_bigrams)  # Ten best bigrams by PMI after filtering

[('del', 'toro'), ('salma', 'hayek'), ('san', 'francisco'), ('mortal', 'kombat'), ('charlize', 'theron'), ('ace', 'ventura'), ('natalie', 'portman'), ('ewan', 'mcgregor'), ('los', 'angeles'), ('darth', 'vader')]


## Word2Vec-based Similarity Search using NLTK Corpus

In [None]:
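# gensim is required by the cells below (the recorded output installed gensim 4.3.3).
# Uncomment to install it into this kernel's environment:
# %pip install -q gensim==4.3.3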

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [None]:
import nltk

# Fetch the Brown corpus, which the Word2Vec cell below reads sentences from.
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [None]:
import gensim
from nltk.corpus import brown

# Sentences of the Brown corpus serve as the training data.
sentences = brown.sents()

# Train Word2Vec with the hyperparameters spelled out one per line.
model = gensim.models.Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=5,
    sg=1,
)

# Query the trained vectors for the words most similar to "money".
neighbours = model.wv.most_similar("money")
print("Similar to 'money':", neighbours)

Similar to 'money': [('care', 0.8275145888328552), ('job', 0.827122688293457), ('friendship', 0.8178503513336182), ('risk', 0.8029019236564636), ('joy', 0.7983232140541077), ('permission', 0.7982925772666931), ('anywhere', 0.7979304790496826), ('part-time', 0.7957543134689331), ('chances', 0.7951022386550903), ('bringing', 0.7930358648300171)]


## Doc2Vec for Document Classification

In [None]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Resolve the file list once so documents, vectors and labels are all built
# from the same sequence and stay aligned index-for-index.
fileids = movie_reviews.fileids()

# Tag each review with its file id. Tokens are materialised with list(), as
# the Naive Bayes cell does, instead of handing the corpus view object to gensim.
docs = [TaggedDocument(words=list(movie_reviews.words(fileid)), tags=[fileid])
        for fileid in fileids]
model = Doc2Vec(docs, vector_size=50, epochs=30)

# One learned document vector per review, with the review's first category as label.
X = [model.dv[fileid] for fileid in fileids]
y = [movie_reviews.categories(fileid)[0] for fileid in fileids]

# random_state makes the split repeatable; stratify keeps the class balance
# identical in the train and test portions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

clf = LogisticRegression()
clf.fit(X_train, y_train)
print("Doc2Vec Classifier Accuracy:", accuracy_score(y_test, clf.predict(X_test)))

Doc2Vec Classifier Accuracy: 0.665


## PPMI Co-occurrence Vector Construction

In [None]:
from collections import Counter
import numpy as np
from nltk.util import bigrams

text = "dog cat bark meow dog bark meow cat".split()
vocab = sorted(set(text))  # sorted so the vocabulary order is stable across runs

# Adjacent-token co-occurrence counts: (w1, w2) -> number of times w2 directly follows w1.
co_matrix = Counter(zip(text, text[1:]))
word_counts = Counter(text)

# Word and pair probabilities are normalised by their own sample sizes: there
# are len(text) tokens but only len(text) - 1 adjacent pairs. Dividing pair
# counts by the token count would make the joint distribution sum to less than 1.
total_tokens = len(text)
total_pairs = sum(co_matrix.values())

ppmi_matrix = {}
for (w1, w2), count in co_matrix.items():
    p_w1 = word_counts[w1] / total_tokens
    p_w2 = word_counts[w2] / total_tokens
    p_w1_w2 = count / total_pairs
    pmi = np.log2(p_w1_w2 / (p_w1 * p_w2))
    # PPMI clips negative associations to zero; float() stores plain Python
    # numbers so the printed dict looks the same under every NumPy version.
    ppmi_matrix[(w1, w2)] = float(max(pmi, 0.0))

print("PPMI Matrix:", ppmi_matrix)

PPMI Matrix: {('dog', 'cat'): 1.0, ('cat', 'bark'): 1.0, ('bark', 'meow'): 2.0, ('meow', 'dog'): 1.0, ('dog', 'bark'): 1.0, ('meow', 'cat'): 1.0}


## NER Visualization with Displacy for Legal Contracts

In [None]:
import spacy
# Load the en_core_web_sm pipeline once; the cells below call `nlp` directly.
nlp = spacy.load("en_core_web_sm")

In [None]:
from spacy import displacy

# Sample contract sentence to run through the pipeline.
contract_text = "This Agreement is made on 4th April 2023 between Apple Inc. and John Doe."
doc = nlp(contract_text)

# Render the document's entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)


## Custom Pattern Matching (e.g., Date + Entity)

In [None]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

# Each dict in a Matcher pattern describes exactly one token unless it carries
# an "OP". Dates ("5th March 2024") and organisation names often span several
# tokens, so "+" lets the DATE and ORG parts cover one or more tokens each.
# Without it only the last date token next to "between" would be matched.
pattern = [
    {"ENT_TYPE": "DATE", "OP": "+"},
    {"LOWER": "between"},
    {"ENT_TYPE": "ORG", "OP": "+"},
]

# "+" yields several overlapping candidates for the same phrase;
# greedy="LONGEST" keeps only the longest one so each phrase prints once.
matcher.add("DATE_CONTRACT_PATTERN", [pattern], greedy="LONGEST")

doc = nlp("Signed on 5th March 2024 between Microsoft and John.")
matches = matcher(doc)

for match_id, start, end in matches:
    print(doc[start:end].text)

NameError: name 'nlp' is not defined

## Text Clustering using spaCy Vectors

In [None]:
from sklearn.cluster import KMeans
import numpy as np

sentences = ["The cat sits on the mat", "Dogs bark loudly", "The dog is in the yard"]
docs = [nlp(sent) for sent in sentences]

# One row per sentence: the document-level vector produced by the pipeline.
X = np.array([doc.vector for doc in docs])

# random_state pins the centroid initialisation so cluster ids are the same
# on every run; n_init is spelled out because its default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X)

for sentence, label in zip(sentences, kmeans.labels_):
    print(f"Cluster {label}: {sentence}")

Cluster 0: The cat sits on the mat
Cluster 1: Dogs bark loudly
Cluster 0: The dog is in the yard


## Fine-Grained NER Training (Custom Labels)

In [None]:
import spacy
from spacy.training.example import Example
import warnings

# Silence every UserWarning from here on.
# NOTE(review): a blanket filter like this may also hide library warnings about
# the training annotations below -- confirm it is still wanted.
warnings.filterwarnings("ignore", category=UserWarning)

# Fresh English pipeline with no pretrained components.
nlp = spacy.blank("en")

# Attach an NER component and register the labels it should predict.
ner = nlp.add_pipe("ner")
for label in ("ORG", "PRODUCT"):
    ner.add_label(label)

# Create a sample training set with diverse examples
def _annotate(text, *entities):
    """Build one (text, {"entities": [...]}) training tuple.

    Character offsets are derived from the phrase itself instead of being
    counted by hand, so a span can never drift off its phrase.

    Args:
        text: The training sentence.
        *entities: (phrase, label) pairs; each phrase must occur in `text`.
            The first occurrence of the phrase is the one annotated.

    Returns:
        A tuple (text, {"entities": [(start, end, label), ...]}) with
        end-exclusive character offsets.

    Raises:
        ValueError: If a phrase does not occur in `text`.
    """
    spans = []
    for phrase, label in entities:
        start = text.index(phrase)
        spans.append((start, start + len(phrase), label))
    return (text, {"entities": spans})


TRAIN_DATA = [
    _annotate("Apple released the new Vision Pro.", ("Apple", "ORG"), ("Vision Pro", "PRODUCT")),
    _annotate("Microsoft launched Surface Laptop.", ("Microsoft", "ORG"), ("Surface Laptop", "PRODUCT")),
    _annotate("Google unveiled the Pixel 8 phone.", ("Google", "ORG"), ("Pixel 8", "PRODUCT")),
    _annotate("Apple Vision Pro is a mixed reality headset.", ("Apple", "ORG"), ("Vision Pro", "PRODUCT")),
    _annotate("Amazon presented the new Echo Show 10.", ("Amazon", "ORG"), ("Echo Show 10", "PRODUCT")),
]

# Wrap each (text, annotations) pair in an Example built on a tokenised Doc.
examples = [
    Example.from_dict(nlp.make_doc(text), annots)
    for text, annots in TRAIN_DATA
]

# Training loop
# initialize() is the spaCy v3 replacement for the deprecated begin_training();
# it receives a callable returning the examples so the pipeline can be set up
# from the training data, and it returns the optimizer used by update().
optimizer = nlp.initialize(lambda: examples)
for epoch in range(30):  # More iterations for better training
    nlp.update(examples, sgd=optimizer)

# Run the trained pipeline on an unseen sentence and list what it tagged.
test_doc = nlp("Apple introduced the Vision Pro headset today.")
predicted_entities = [(ent.text, ent.label_) for ent in test_doc.ents]
print(predicted_entities)

[('Apple', 'ORG'), ('Vision Pro', 'PRODUCT')]
