In [3]:
# CADL1: Preprocessing in Colab
# ---------------------------------

# Install necessary libraries (only once per Colab session)
!pip install nltk spacy

# Download required NLTK datasets
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download("punkt_tab")  # punkt tables (new requirement in recent NLTK versions)
nltk.download('wordnet')

# Import libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy

# Load SpaCy English model
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Sample text corpus: two sentences, enough to show every preprocessing step.
text = """Artificial Intelligence and Machine Learning are transforming the world.
Natural Language Processing helps computers understand human language."""

print("Original Text:\n", text)

# 1. Tokenization (NLTK) - splits the raw string into word and punctuation tokens.
tokens = word_tokenize(text)
print("\nTokenization:\n", tokens)

# 2. Stopword Removal - keep purely alphabetic tokens whose lowercase form
#    is not an English stopword (this also drops punctuation tokens).
stop_words = set(stopwords.words('english'))
filtered_tokens = [
    token
    for token in tokens
    if token.isalpha() and token.lower() not in stop_words
]
print("\nAfter Stopword Removal:\n", filtered_tokens)

# 3. Stemming - Porter stemmer applied to every surviving token.
ps = PorterStemmer()
stemmed = list(map(ps.stem, filtered_tokens))
print("\nStemming:\n", stemmed)

# 4. Lemmatization - WordNet lemmatizer called with its default arguments.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, filtered_tokens))
print("\nLemmatization:\n", lemmatized)

# 5. Using SpaCy for Lemmatization + POS tagging - runs on the raw text,
#    so stopwords and punctuation are included in this listing.
doc = nlp(text)
spacy_lemmatized = [(tok.text, tok.lemma_, tok.pos_) for tok in doc]
print("\nSpaCy Lemmatization with POS:\n", spacy_lemmatized)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Original Text:
 Artificial Intelligence and Machine Learning are transforming the world.
Natural Language Processing helps computers understand human language.

Tokenization:
 ['Artificial', 'Intelligence', 'and', 'Machine', 'Learning', 'are', 'transforming', 'the', 'world', '.', 'Natural', 'Language', 'Processing', 'helps', 'computers', 'understand', 'human', 'language', '.']

After Stopword Removal:
 ['Artificial', 'Intelligence', 'Machine', 'Learning', 'transforming', 'world', 'Natural', 'Language', 'Processing', 'helps', 'computers', 'understand', 'human', 'language']

Stemming:
 ['artifici', 'intellig', 'machin', 'learn', 'transform', 'world', 'natur', 'languag', 'process', 'help', 'comput', 'un

In [4]:
# CADL2: Feature Extraction
# -----------------------------

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def _fit_and_report(model, docs, vocab_heading, matrix_heading):
    """Fit `model` on `docs`, print its vocabulary and dense matrix, return the fitted matrix.

    `model` is expected to offer `fit_transform` and `get_feature_names_out`
    (as CountVectorizer and TfidfVectorizer do); the two headings are printed
    verbatim in front of the vocabulary and the matrix respectively.
    """
    matrix = model.fit_transform(docs)
    print(vocab_heading, model.get_feature_names_out())
    print(matrix_heading, matrix.toarray())
    return matrix


# Sample dataset (movie reviews)
corpus = [
    "The movie was fantastic and I loved it",
    "Absolutely terrible movie, I hated it",
    "The plot was great, but the acting was average",
    "I enjoyed the film, the story was excellent"
]

print("Original Corpus:\n", corpus)

# 1. Bag of Words (BoW): raw term counts per document.
vectorizer = CountVectorizer()
X_bow = _fit_and_report(
    vectorizer, corpus, "\nBag of Words Vocabulary:\n", "\nBoW Matrix:\n"
)

# 2. TF-IDF: same documents, TF-IDF weighted.
tfidf = TfidfVectorizer()
X_tfidf = _fit_and_report(
    tfidf, corpus, "\nTF-IDF Vocabulary:\n", "\nTF-IDF Matrix:\n"
)


Original Corpus:
 ['The movie was fantastic and I loved it', 'Absolutely terrible movie, I hated it', 'The plot was great, but the acting was average', 'I enjoyed the film, the story was excellent']

Bag of Words Vocabulary:
 ['absolutely' 'acting' 'and' 'average' 'but' 'enjoyed' 'excellent'
 'fantastic' 'film' 'great' 'hated' 'it' 'loved' 'movie' 'plot' 'story'
 'terrible' 'the' 'was']

BoW Matrix:
 [[0 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 1]
 [1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 0]
 [0 1 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 2 2]
 [0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 2 1]]

TF-IDF Vocabulary:
 ['absolutely' 'acting' 'and' 'average' 'but' 'enjoyed' 'excellent'
 'fantastic' 'film' 'great' 'hated' 'it' 'loved' 'movie' 'plot' 'story'
 'terrible' 'the' 'was']

TF-IDF Matrix:
 [[0.         0.         0.44464184 0.         0.         0.
  0.         0.44464184 0.         0.         0.         0.35056073
  0.44464184 0.35056073 0.         0.         0.         0.28380913
  0.28380913]
 [0.48546061 0.    

In [5]:
# CADL3: NER and Structured Info Extraction
# -------------------------------------------

import pandas as pd

text = """Elon Musk is the CEO of Tesla and SpaceX.
Sundar Pichai works at Google.
Satya Nadella is the CEO of Microsoft."""

# `nlp` is the spaCy pipeline loaded in the preprocessing cell.
doc = nlp(text)

print("Named Entities:\n")
for ent in doc.ents:
    print(ent.text, "->", ent.label_)

# Flat lists of every person / organization found, in document order.
persons = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
organizations = [ent.text for ent in doc.ents if ent.label_ == "ORG"]

# Build the table sentence by sentence. Zipping the two flat lists by position
# misaligns every later row as soon as one sentence yields a different number
# of persons and organizations (e.g. if "SpaceX" is also tagged as ORG).
# Pairing inside each sentence keeps a person next to the organizations that
# are mentioned with them.
rows = []
for sent in doc.sents:
    sent_persons = [ent.text for ent in sent.ents if ent.label_ == "PERSON"]
    sent_orgs = [ent.text for ent in sent.ents if ent.label_ == "ORG"]
    if not sent_persons and not sent_orgs:
        continue  # nothing to record for this sentence
    # `or [None]` keeps an unmatched person/organization as a row with a gap
    # instead of silently dropping it.
    for person in sent_persons or [None]:
        for org in sent_orgs or [None]:
            rows.append((person, org))

# Create structured table (same column names as before, rows now aligned)
data = {
    "Person": [person for person, _ in rows],
    "Organization": [org for _, org in rows],
}
df = pd.DataFrame(data)

print("\nStructured Information (Persons & Orgs):\n")
print(df)


Named Entities:

Elon Musk -> PERSON
Tesla -> ORG
Sundar Pichai -> PERSON
Google -> ORG
Satya Nadella -> PERSON
Microsoft -> ORG

Structured Information (Persons & Orgs):

          Person Organization
0      Elon Musk        Tesla
1  Sundar Pichai       Google
2  Satya Nadella    Microsoft


In [6]:
# CADL4: Topic Modeling (LDA)
# -------------------------------

!pip install gensim pyLDAvis

from gensim import corpora, models
import pyLDAvis.gensim_models
import pyLDAvis

# Sample corpus
documents = [
    "Artificial Intelligence is revolutionizing healthcare and finance",
    "Machine Learning algorithms improve data analysis",
    "Natural Language Processing is used in chatbots and translators",
    "Deep Learning techniques power self-driving cars and image recognition"
]

# Preprocessing: lowercase, split on whitespace, drop stopwords.
# NOTE: `stop_words` is the NLTK English stopword set built in the
# preprocessing cell; that cell must have been run first.
texts = [
    [word for word in document.lower().split() if word not in stop_words]
    for document in documents
]

# Create dictionary (token -> id) and bag-of-words corpus
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in texts]

# Apply LDA.
# A fixed random_state makes training deterministic, so the topics printed
# below are the same on every Restart & Run All. 42 is an arbitrary seed.
lda_model = models.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Print topics (-1 asks for all topics)
print("\nTopics found by LDA:\n")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

# Visualization: the bare `vis` on the last line renders the interactive view.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis



Topics found by LDA:

Topic 0: 0.062*"algorithms" + 0.062*"improve" + 0.062*"data" + 0.062*"machine" + 0.062*"analysis" + 0.062*"learning" + 0.062*"used" + 0.062*"chatbots" + 0.062*"translators" + 0.062*"language"
Topic 1: 0.060*"learning" + 0.060*"image" + 0.060*"techniques" + 0.060*"deep" + 0.060*"self-driving" + 0.060*"power" + 0.060*"cars" + 0.060*"recognition" + 0.060*"revolutionizing" + 0.060*"intelligence"


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
