# Text as Data â€“ Tokenization & Cleaning

In [2]:
text = "Hello!!! This is my FIRST GenAI & NLP class... #excited"

import re

# 1) Basic cleaning
text_lower = text.lower()
text_clean = re.sub(r"[^a-z0-9\s]", " ", text_lower)  # keep letters, digits, spaces
text_clean = re.sub(r"\s+", " ", text_clean).strip()

print("Original:", text)
print("Lower + cleaned:", text_clean)

# 2) Tokenization (simple split)
tokens = text_clean.split()
print("Tokens:", tokens)


Original: Hello!!! This is my FIRST GenAI & NLP class... #excited
Lower + cleaned: hello this is my first genai nlp class excited
Tokens: ['hello', 'this', 'is', 'my', 'first', 'genai', 'nlp', 'class', 'excited']


# Bag of Words & TF-IDF

In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

docs = [
    "Amazon sells beautiful sarees online",
    "Sarees and kurtis are available in our online store",
    "We sell ethnic wear online and offline"
]

# Bag of Words
bow_vec = CountVectorizer()
bow_matrix = bow_vec.fit_transform(docs)

print("Vocabulary (BoW):", bow_vec.get_feature_names_out())
pd.DataFrame(bow_matrix.toarray(), columns=bow_vec.get_feature_names_out())


Vocabulary (BoW): ['amazon' 'and' 'are' 'available' 'beautiful' 'ethnic' 'in' 'kurtis'
 'offline' 'online' 'our' 'sarees' 'sell' 'sells' 'store' 'we' 'wear']


Unnamed: 0,amazon,and,are,available,beautiful,ethnic,in,kurtis,offline,online,our,sarees,sell,sells,store,we,wear
0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0
1,0,1,1,1,0,0,1,1,0,1,1,1,0,0,1,0,0
2,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,1,1


In [5]:
# TF-IDF
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(docs)

print("Vocabulary (TF-IDF):", tfidf_vec.get_feature_names_out())
pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vec.get_feature_names_out()).round(3)

Vocabulary (TF-IDF): ['amazon' 'and' 'are' 'available' 'beautiful' 'ethnic' 'in' 'kurtis'
 'offline' 'online' 'our' 'sarees' 'sell' 'sells' 'store' 'we' 'wear']


Unnamed: 0,amazon,and,are,available,beautiful,ethnic,in,kurtis,offline,online,our,sarees,sell,sells,store,we,wear
0,0.529,0.0,0.0,0.0,0.529,0.0,0.0,0.0,0.0,0.0,0.0,0.402,0.0,0.529,0.0,0.0,0.0
1,0.0,0.284,0.374,0.374,0.0,0.0,0.374,0.374,0.0,0.0,0.374,0.284,0.0,0.0,0.374,0.0,0.0
2,0.0,0.297,0.0,0.0,0.0,0.39,0.0,0.0,0.39,0.39,0.0,0.0,0.39,0.0,0.0,0.39,0.39


# Word/Sentence Embeddings + Cosine Similarity

In [7]:
# !pip install sentence-transformers   # if not already installed

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # small, fast model

sentences = [
    "Amazon sells sarees and ethnic wear.",
    "We offer traditional sarees in our shop.",
    "I love eating pizza on weekends."
]

embs = model.encode(sentences, convert_to_tensor=True)

# Similarity between sentence 0 and others
cos_sim = util.cos_sim(embs[0], embs)

for i, s in enumerate(sentences):
    print(f"Similarity with: {s!r} = {cos_sim[0][i].item():.3f}")


  if not hasattr(np, "object"):



Similarity with: 'Amazon sells sarees and ethnic wear.' = 1.000
Similarity with: 'We offer traditional sarees in our shop.' = 0.741
Similarity with: 'I love eating pizza on weekends.' = -0.001


# Simple Text Classification (Sentiment-ish) with TF-IDF + Logistic Regression

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

texts = [
    "I love this product",        # 1
    "This saree is amazing",      # 1
    "Worst experience ever",      # 0
    "I hate this quality",        # 0
    "Very good material",         # 1
    "Terrible service",           # 0
]
labels = [1, 1, 0, 0, 1, 0]   # 1 = positive, 0 = negative

X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)

vec = TfidfVectorizer()
X_train_vec = vec.fit_transform(X_train)
X_test_vec  = vec.transform(X_test)

clf = LogisticRegression()
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)
print("Test texts:", X_test)
print("Predictions:", y_pred)
print("Accuracy:", accuracy_score(y_test, y_pred))


Test texts: ['I love this product', 'This saree is amazing']
Predictions: [0 0]
Accuracy: 0.0


# NER and POS Tagging with spaCy

In [23]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.3/12.8 MB 6.7 MB/s eta 0:00:02
     -------- ------------------------------- 2.6/12.8 MB 6.6 MB/s eta 0:00:02
     ------------ --------------------------- 3.9/12.8 MB 6.5 MB/s eta 0:00:02
     ----------------- ---------------------- 5.5/12.8 MB 6.8 MB/s eta 0:00:02
     ---------------------- ----------------- 7.3/12.8 MB 7.2 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 7.5 MB/s eta 0:00:01
     --------------------------------- ------ 10.7/12.8 MB 7.5 MB/s eta 0:00:01
     -------------------------------------- - 12.3/12.8 MB 7.6 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 7.6 MB/s eta 0:00:00
Installing collected packages: en-core

In [25]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Keerthi visited Dubai last week and bought sarees from branded shop."

doc = nlp(text)

print("=== Tokens & POS ===")
for token in doc:
    print(token.text, "->", token.pos_)

print("\n=== Named Entities ===")
for ent in doc.ents:
    print(ent.text, "->", ent.label_)


=== Tokens & POS ===
Keerthi -> PROPN
visited -> VERB
Dubai -> PROPN
last -> ADJ
week -> NOUN
and -> CCONJ
bought -> VERB
sarees -> NOUN
from -> ADP
branded -> ADJ
shop -> NOUN
. -> PUNCT

=== Named Entities ===
Keerthi -> ORG
Dubai -> GPE
last week -> DATE
