In [None]:
# ASSIGNMENT 18 — TEXT VECTORIZATION TECHNIQUES
# NLP Feature Extraction Only (NO ML MODELS)


# PART 0 — IMPORT LIBRARIES


# Allowed libraries only
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# ------------------------------
# Step 1: Load Dataset
# ------------------------------

# Download spam.csv from Kaggle dataset
df = pd.read_csv("spam.csv", encoding="latin-1")

# Keep required columns only
df = df[['v1','v2']]
df.columns = ['label','text']

# Use cleaned column (simulate previous assignment output)
df['final_clean_text'] = df['text'].str.lower()

print(df.head())
print("Dataset size:", df.shape)


  label                                               text  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                    final_clean_text  
0  go until jurong point, crazy.. available only ...  
1                      ok lar... joking wif u oni...  
2  free entry in 2 a wkly comp to win fa cup fina...  
3  u dun say so early hor... u c already then say...  
4  nah i don't think he goes to usf, he lives aro...  
Dataset size: (5572, 3)


In [None]:

# PART 1 — TASK 1
# Manual One-Hot Encoding


# Step 1: Select 5 sentences
sentences = df['final_clean_text'].iloc[:5].tolist()

# Step 2: Build vocabulary manually
vocab = sorted(set(" ".join(sentences).split()))
print("Vocabulary:\n", vocab)

# Step 3: Create one-hot matrix manually
one_hot_matrix = []

for sentence in sentences:
    words = sentence.split()
    vector = [1 if word in words else 0 for word in vocab]
    one_hot_matrix.append(vector)

# Step 4: Display
one_hot_df = pd.DataFrame(one_hot_matrix, columns=vocab)
print("\nOne Hot Encoded Matrix:")
print(one_hot_df)


Vocabulary:
 ["08452810075over18's", '2', '2005.', '21st', '87121', 'a', 'already', 'amore', 'apply', 'around', 'available', 'buffet...', 'bugis', 'c', 'cine', 'comp', 'crazy..', 'cup', "don't", 'dun', 'e', 'early', 'entry', 'fa', 'final', 'free', 'go', 'goes', 'got', 'great', 'he', 'here', 'hor...', 'i', 'in', 'joking', 'jurong', 'la', 'lar...', 'lives', 'may', 'n', 'nah', 'ok', 'oni...', 'only', 'point,', 'question(std', "rate)t&c's", 'receive', 'say', 'say...', 'so', 'text', 'then', 'there', 'think', 'though', 'tkts', 'to', 'txt', 'u', 'until', 'usf,', 'wat...', 'wif', 'win', 'wkly', 'world']

One Hot Encoded Matrix:
   08452810075over18's  2  2005.  21st  87121  a  already  amore  apply  \
0                    0  0      0     0      0  0        0      1      0   
1                    0  0      0     0      0  0        0      0      0   
2                    1  1      1     1      1  1        0      0      1   
3                    0  0      0     0      0  0        1      0      0 

In [None]:

# PART 1 — TASK 2
# One-Hot Encoding using CountVectorizer


# Step 1: Initialize vectorizer
vectorizer = CountVectorizer(binary=True)

# Step 2: Fit & Transform
encoded = vectorizer.fit_transform(sentences)

# Step 3: Display vocabulary
print("Vocabulary:\n", vectorizer.vocabulary_)

# Step 4: Display matrix
encoded_df = pd.DataFrame(
    encoded.toarray(),
    columns=vectorizer.get_feature_names_out()
)

print("\nEncoded Matrix:")
print(encoded_df)


Vocabulary:
 {'go': 22, 'until': 55, 'jurong': 31, 'point': 40, 'crazy': 13, 'available': 8, 'only': 39, 'in': 29, 'bugis': 10, 'great': 25, 'world': 61, 'la': 32, 'buffet': 9, 'cine': 11, 'there': 49, 'got': 24, 'amore': 5, 'wat': 57, 'ok': 37, 'lar': 33, 'joking': 30, 'wif': 58, 'oni': 38, 'free': 21, 'entry': 18, 'wkly': 60, 'comp': 12, 'to': 53, 'win': 59, 'fa': 19, 'cup': 14, 'final': 20, 'tkts': 52, '21st': 2, 'may': 35, '2005': 1, 'text': 47, '87121': 3, 'receive': 43, 'question': 41, 'std': 46, 'txt': 54, 'rate': 42, 'apply': 6, '08452810075over18': 0, 'dun': 16, 'say': 44, 'so': 45, 'early': 17, 'hor': 28, 'already': 4, 'then': 48, 'nah': 36, 'don': 15, 'think': 50, 'he': 26, 'goes': 23, 'usf': 56, 'lives': 34, 'around': 7, 'here': 27, 'though': 51}

Encoded Matrix:
   08452810075over18  2005  21st  87121  already  amore  apply  around  \
0                  0     0     0      0        0      1      0       0   
1                  0     0     0      0        0      0      0    

In [None]:

# PART 2 — TASK 3
# Bag of Words


# Step 1: Initialize BoW
bow = CountVectorizer()

# Step 2: Fit entire dataset
bow_matrix = bow.fit_transform(df['final_clean_text'])

# Step 3: Vocabulary size
print("Vocabulary Size:", len(bow.vocabulary_))

# Step 4: Sample feature vectors
print("\nSample BoW vectors:")
print(bow_matrix[:5].toarray())


Vocabulary Size: 8672

Sample BoW vectors:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [6]:
############################################################
# PART 2 — TASK 4
# Word Frequency Analysis
############################################################

# Step 1: Total frequency
word_counts = np.array(bow_matrix.sum(axis=0)).flatten()

words = bow.get_feature_names_out()
freq_df = pd.DataFrame({'word': words, 'count': word_counts})

# Step 2: Top frequent words
print("\nTop 10 frequent words:")
print(freq_df.sort_values(by='count', ascending=False).head(10))

# Step 3: Least frequent words
print("\nLeast frequent words:")
print(freq_df.sort_values(by='count').head(10))

# Explanation:
print("\nBoW captures frequency by counting occurrences of words across documents.")



Top 10 frequent words:
     word  count
7756   to   2242
8609  you   2240
7627  the   1328
1084  and    979
4087   in    898
4206   is    890
4939   me    802
5223   my    762
4218   it    744
3308  for    704

Least frequent words:
             word  count
3591         gosh      1
33    07753741225      1
3592         goss      1
4846  maintaining      1
4842       mailed      1
37         077xxx      1
38            078      1
39    07801543489      1
40          07808      1
41    07808247860      1

BoW captures frequency by counting occurrences of words across documents.


In [None]:

# PART 3 — TASK 5
# N-Gram Comparison


# Unigrams
uni = CountVectorizer(ngram_range=(1,1))
uni_mat = uni.fit_transform(df['final_clean_text'])

# Bigrams
bi = CountVectorizer(ngram_range=(2,2))
bi_mat = bi.fit_transform(df['final_clean_text'])

# Trigrams
tri = CountVectorizer(ngram_range=(3,3))
tri_mat = tri.fit_transform(df['final_clean_text'])

print("Unigram vocab size:", len(uni.vocabulary_))
print("Bigram vocab size:", len(bi.vocabulary_))
print("Trigram vocab size:", len(tri.vocabulary_))


Unigram vocab size: 8672
Bigram vocab size: 41654
Trigram vocab size: 54238


In [None]:

# PART 3 — TASK 6
# Combined N-grams


combined = CountVectorizer(ngram_range=(1,2))
combined_matrix = combined.fit_transform(df['final_clean_text'])

print("Combined vocab size:", len(combined.vocabulary_))

print("\nContext improves because phrases like 'free offer' are captured.")


Combined vocab size: 50326

Context improves because phrases like 'free offer' are captured.


In [None]:

# PART 4 — TASK 7
# TF-IDF Vectorization


tfidf = TfidfVectorizer()

tfidf_matrix = tfidf.fit_transform(df['final_clean_text'])

print("Vocabulary Size:", len(tfidf.vocabulary_))
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)


Vocabulary Size: 8672
TF-IDF Matrix Shape: (5572, 8672)


In [None]:

# PART 4 — TASK 8
# BoW vs TF-IDF Comparison


tfidf_scores = np.array(tfidf_matrix.mean(axis=0)).flatten()

tfidf_df = pd.DataFrame({
    'word': tfidf.get_feature_names_out(),
    'tfidf_score': tfidf_scores
})

print("\nHigh TF-IDF words:")
print(tfidf_df.sort_values(by='tfidf_score', ascending=False).head(10))

print("\nLow TF-IDF words:")
print(tfidf_df.sort_values(by='tfidf_score').head(10))

print("\nTF-IDF down-weights very common words appearing in many documents.")



High TF-IDF words:
      word  tfidf_score
8609   you     0.044145
7756    to     0.036946
7627   the     0.026384
4087    in     0.021946
4939    me     0.021159
1084   and     0.020217
4206    is     0.019703
4218    it     0.018706
5223    my     0.018602
1813  call     0.017207

Low TF-IDF words:
              word  tfidf_score
4850       makiing     0.000013
6002       praises     0.000013
7075       sorrows     0.000013
6115        proove     0.000013
6597        sambar     0.000013
1279    attraction     0.000013
2566    determined     0.000014
7331       stylist     0.000014
6146       pudunga     0.000014
6138  psychologist     0.000014

TF-IDF down-weights very common words appearing in many documents.


In [None]:

# PART 5 — TASK 9
# Parameter Exploration

vec1 = CountVectorizer(max_features=1000)
vec2 = CountVectorizer(min_df=5)
vec3 = CountVectorizer(max_df=0.8)

vec1.fit(df['final_clean_text'])
vec2.fit(df['final_clean_text'])
vec3.fit(df['final_clean_text'])

print("Vocab size (max_features=1000):", len(vec1.vocabulary_))
print("Vocab size (min_df=5):", len(vec2.vocabulary_))
print("Vocab size (max_df=0.8):", len(vec3.vocabulary_))


Vocab size (max_features=1000): 1000
Vocab size (min_df=5): 1812
Vocab size (max_df=0.8): 8672


### **1. Difference between One-Hot Encoding and Bag of Words (BoW)**

* **One-Hot Encoding** represents whether a word is present or absent in a sentence (0 or 1).
* **Bag of Words (BoW)** represents how many times each word appears (frequency count).
* One-Hot ignores repetition, while BoW captures word importance through frequency.



### **2. Why N-grams increase dimensionality**

* N-grams create features using combinations of words (e.g., bigrams, trigrams).
* Each unique word combination becomes a new feature.
* As combinations grow, the vocabulary size increases rapidly, leading to higher dimensional vectors.



### **3. When to prefer TF-IDF over BoW**

* TF-IDF is preferred when common words (like *the*, *is*, *and*) should have less importance.
* It highlights words that are unique or meaningful in specific documents.
* Useful in search, document similarity, and text ranking tasks.



### **4. Limitations of count-based vectorization methods**

* They do not understand semantic meaning or context.
* Produce very sparse and high-dimensional matrices.
* Words with similar meanings are treated as completely different features.
* Cannot capture word order effectively (especially BoW).


