# Lab Assignment 7 – NLP on Trending Product Titles
Spring 2025 – COSC 482

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import nltk
import spacy

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [2]:
# Load Dataset
df = pd.read_csv("cleaned_ebay_deals.csv")

In [5]:
# Task 1: Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens, lemmas

# Print for 5 samples
print("Task 1: Preprocessing Sample\n")
for i in range(5):
    original = df.iloc[i]['title']
    tokens, lemmas = preprocess(original)
    print(f"\nOriginal: {original}")
    print(f"Tokens: {tokens}")
    print(f"Lemmatized: {lemmas}")

Task 1: Preprocessing Sample


Original: Klipsch R-120SWi 12" High-Performance Wireless Subwoofer
Tokens: ['klipsch', 'r120swi', '12', 'highperformance', 'wireless', 'subwoofer']
Lemmatized: ['klipsch', 'r120swi', '12', 'highperformance', 'wireless', 'subwoofer']

Original: Apple iPhone 14 Pro Max 128GB Network Unlocked Very Good Condition
Tokens: ['apple', 'iphone', '14', 'pro', 'max', '128gb', 'network', 'unlocked', 'good', 'condition']
Lemmatized: ['apple', 'iphone', '14', 'pro', 'max', '128gb', 'network', 'unlocked', 'good', 'condition']

Original: Apple iPhone 14 Pro Max 256GB Unlocked Very Good Condition
Tokens: ['apple', 'iphone', '14', 'pro', 'max', '256gb', 'unlocked', 'good', 'condition']
Lemmatized: ['apple', 'iphone', '14', 'pro', 'max', '256gb', 'unlocked', 'good', 'condition']

Original: Apple iPhone 14 Pro A2650 128GB Network Unlocked Very Good Condition
Tokens: ['apple', 'iphone', '14', 'pro', 'a2650', '128gb', 'network', 'unlocked', 'good', 'condition']
Lemmatized: ['a

In [6]:
# Task 2: Keyword Extraction (TF-IDF)
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_tfidf = tfidf.fit_transform(df['title'])

# Get top 10 keywords
sum_words = X_tfidf.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in tfidf.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
print("\nTask 2: Top 10 Keywords")
for word, freq in words_freq[:10]:
    print(f"{word}: {freq:.4f}")


Task 2: Top 10 Keywords
unlocked: 388.7619
apple: 369.5410
iphone: 369.3617
128gb: 301.7780
pro: 299.0688
condition: 294.5750
good: 277.6170
excellent: 262.7766
14: 261.7762
256gb: 212.7358


In [7]:
# Task 3: POS Tagging & Phrase Analysis
print("\nTask 3: POS Tagging & Adjective–Noun Pairs")
adj_noun_pairs = []

for i in range(10):
    doc = nlp(df.iloc[i]['title'])
    print(f"\nTitle {i+1}: {df.iloc[i]['title']}")
    for token in doc:
        print(f"{token.text}: {token.pos_}")

# All titles for common adjective–noun pairs
for title in df['title']:
    doc = nlp(title)
    for i in range(len(doc) - 1):
        if doc[i].pos_ == "ADJ" and doc[i+1].pos_ == "NOUN":
            adj_noun_pairs.append((doc[i].text.lower(), doc[i+1].text.lower()))

# Show most common pairs
from collections import Counter
pair_counts = Counter(adj_noun_pairs)
print("\nMost Common Adjective–Noun Pairs:")
for pair, count in pair_counts.most_common(10):
    print(f"{pair[0]} {pair[1]}: {count}")


Task 3: POS Tagging & Adjective–Noun Pairs

Title 1: Klipsch R-120SWi 12" High-Performance Wireless Subwoofer
Klipsch: PROPN
R-120SWi: VERB
12: NUM
": PUNCT
High: ADJ
-: PUNCT
Performance: NOUN
Wireless: PROPN
Subwoofer: NOUN

Title 2: Apple iPhone 14 Pro Max 128GB Network Unlocked Very Good Condition
Apple: PROPN
iPhone: PROPN
14: NUM
Pro: PROPN
Max: PROPN
128: NUM
GB: PROPN
Network: PROPN
Unlocked: VERB
Very: ADV
Good: PROPN
Condition: NOUN

Title 3: Apple iPhone 14 Pro Max 256GB Unlocked Very Good Condition
Apple: PROPN
iPhone: PROPN
14: NUM
Pro: PROPN
Max: PROPN
256: NUM
GB: PROPN
Unlocked: VERB
Very: ADV
Good: ADJ
Condition: NOUN

Title 4: Apple iPhone 14 Pro A2650 128GB Network Unlocked Very Good Condition
Apple: PROPN
iPhone: PROPN
14: NUM
Pro: PROPN
A2650: PROPN
128: NUM
GB: PROPN
Network: PROPN
Unlocked: VERB
Very: ADV
Good: PROPN
Condition: NOUN

Title 5: Apple iPhone 14 128GB Network Unlocked Very Good Condition
Apple: PROPN
iPhone: PROPN
14: NUM
128: NUM
GB: PROPN
Network:

In [8]:
# Task 4: Product Clustering
kmeans = KMeans(n_clusters=4, random_state=42)
df['cluster'] = kmeans.fit_predict(X_tfidf)

print("\nTask 4: Top 5 Words per Cluster:")
terms = tfidf.get_feature_names_out()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
for i in range(4):
    print(f"\nCluster {i}:")
    for ind in order_centroids[i, :5]:
        print(f"{terms[ind]}")


Task 4: Top 5 Words per Cluster:

Cluster 0:
smart
4k
sony
black
wireless

Cluster 1:
galaxy
samsung
16gb
ssd
laptop

Cluster 2:
series
lg
inch
oled
tv

Cluster 3:
iphone
unlocked
apple
condition
128gb


In [9]:
# Task 5: Discount Classification
def categorize_discount(d):
    if d >= 50:
        return "High"
    elif 20 <= d < 50:
        return "Medium"
    else:
        return "Low"

df['discount_category'] = df['discount_percentage'].apply(categorize_discount)

X = X_tfidf
y = df['discount_category']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("\nTask 5: Classification Report")
print(classification_report(y_test, y_pred))


Task 5: Classification Report
              precision    recall  f1-score   support

        High       0.95      0.99      0.97       411
         Low       0.98      0.87      0.93        71
      Medium       0.97      0.94      0.95       313

    accuracy                           0.96       795
   macro avg       0.97      0.94      0.95       795
weighted avg       0.96      0.96      0.96       795



In [10]:
# Task 6: Fusion Modeling
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['discount_flag'] = df['discount_percentage'] >= 50

num_features = df[['price', 'discount_percentage']].fillna(0)
scaler = StandardScaler()
scaled_num = scaler.fit_transform(num_features)

from scipy.sparse import hstack
X_fusion = hstack([X_tfidf, scaled_num])
y_fusion = df['discount_flag']

X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(X_fusion, y_fusion, test_size=0.2, random_state=42)
fusion_model = LogisticRegression(max_iter=1000)
fusion_model.fit(X_train_f, y_train_f)
y_pred_f = fusion_model.predict(X_test_f)

print("\nTask 6: Fusion Model Results")
print("Accuracy:", accuracy_score(y_test_f, y_pred_f))
print("F1 Score:", f1_score(y_test_f, y_pred_f))


Task 6: Fusion Model Results
Accuracy: 0.9987421383647799
F1 Score: 0.9987849331713244


In [11]:
# Bonus: Product Search Tool
def search_products(query, top_n=3):
    query_vec = tfidf.transform([query])
    similarities = cosine_similarity(query_vec, X_tfidf).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]
    print(f"\nTop {top_n} Results for Query: '{query}'")
    for i in top_indices:
        print(f"- {df.iloc[i]['title']} (Similarity: {similarities[i]:.4f})")

# Example search
search_products("iphone 14")


Top 3 Results for Query: 'iphone 14'
- Apple iPhone 14 128GB Unlocked - Excellent (Similarity: 0.6169)
- Apple iPhone 14 128GB Unlocked - Excellent (Similarity: 0.6169)
- Apple iPhone 14 128GB Unlocked - Excellent (Similarity: 0.6169)
