<a href="https://colab.research.google.com/github/Melissa2001/USTAssignments/blob/main/week2assign.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install transformers accelerate sentencepiece --upgrade

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import json, re, random
from typing import Dict
from transformers import AutoTokenizer, pipeline

random.seed(42)

reviews = [
    # Review 1: Positive
    "I absolutely love the new QuantumX Pro camera! The picture quality is stellar and the battery life is amazing. Shipped super fast too. A++!",

    # Review 2: Negative with specific issue
    "The SonicWave earbuds have a serious design flaw. The left earbud stopped charging after just one week. I expected better for the price. Very disappointed.",

    # Review 3: Mixed with a question
    "The Titan smartwatch is decent. The screen is bright and the features are good, but the step counter seems inaccurate. It's off by at least 20%. Is there a way to calibrate it?",

    # Review 4: Negative with multiple issues
    "My order for the AeroDrone was a disaster. It arrived with a broken propeller and the battery was completely dead on arrival. Customer service has been unresponsive for 3 days.",

    # Review 5: Positive but mentions a minor issue
    "Overall, I'm happy with the PureGlow Air Purifier. It's quiet and effective. My only complaint is that the replacement filters are a bit expensive."
]
print("Loaded", len(reviews), "reviews.")

Loaded 5 reviews.


In [3]:
# ============ 2) Part 1: Understanding Tokenization ============
# We compare GPT-2 and BERT tokenizers on reviews[2] (the Titan smartwatch).

text = reviews[2]

tok_gpt2 = AutoTokenizer.from_pretrained("gpt2")
tok_bert = AutoTokenizer.from_pretrained("bert-base-uncased")

tokens_gpt2 = tok_gpt2.tokenize(text)
tokens_bert = tok_bert.tokenize(text)

print('--- Review (reviews[2]) ---')
print(text)

print('\n--- GPT-2 tokens ---')
print(tokens_gpt2)
print('Token count (GPT-2):', len(tokens_gpt2))

print('\n--- BERT (bert-base-uncased) tokens ---')
print(tokens_bert)
print('Token count (BERT):', len(tokens_bert))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

--- Review (reviews[2]) ---
The Titan smartwatch is decent. The screen is bright and the features are good, but the step counter seems inaccurate. It's off by at least 20%. Is there a way to calibrate it?

--- GPT-2 tokens ---
['The', 'ĠTitan', 'Ġsmart', 'watch', 'Ġis', 'Ġdecent', '.', 'ĠThe', 'Ġscreen', 'Ġis', 'Ġbright', 'Ġand', 'Ġthe', 'Ġfeatures', 'Ġare', 'Ġgood', ',', 'Ġbut', 'Ġthe', 'Ġstep', 'Ġcounter', 'Ġseems', 'Ġinaccurate', '.', 'ĠIt', "'s", 'Ġoff', 'Ġby', 'Ġat', 'Ġleast', 'Ġ20', '%.', 'ĠIs', 'Ġthere', 'Ġa', 'Ġway', 'Ġto', 'Ġcalibr', 'ate', 'Ġit', '?']
Token count (GPT-2): 41

--- BERT (bert-base-uncased) tokens ---
['the', 'titan', 'smart', '##watch', 'is', 'decent', '.', 'the', 'screen', 'is', 'bright', 'and', 'the', 'features', 'are', 'good', ',', 'but', 'the', 'step', 'counter', 'seems', 'inaccurate', '.', 'it', "'", 's', 'off', 'by', 'at', 'least', '20', '%', '.', 'is', 'there', 'a', 'way', 'to', 'cal', '##ib', '##rate', 'it', '?']
Token count (BERT): 44


Are token lists identical?: No
Differences:
      BERT lowercases text and uses WordPiece subwords with '##'.,
      GPT-2 uses byte-level BPE, keeps casing, splits punctuation/spaces differently.
Why different tokenizers?: Models use tokenizers tailored to their pretraining and architecture.

In [9]:
# ============ 3) Part 2: Advanced Prompt Engineering ============

def build_pipeline():
    from transformers import pipeline
    try:
        name = "google/flan-t5-large"
        p = pipeline("text2text-generation", model=name)
    except Exception as e:
        print("Falling back because:", e)
        name = "google/flan-t5-base"
        p = pipeline("text2text-generation", model=name)
    return p

gen = build_pipeline()

def generate(prompt: str, max_new_tokens: int = 128) -> str:
    out = gen(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_beams=1
    )
    return out[0]["generated_text"].strip()


def grab_json_block(text: str, default: dict) -> dict:
    m = re.search(r"\{[\s\S]*\}", text)
    if not m:
        return default
    block = m.group(0)
    try:
        return json.loads(block)
    except Exception:
        cleaned = block.replace("\n", " ")
        cleaned = re.sub(r"\s+", " ", cleaned).strip()
        cleaned = cleaned.replace("’", "'").replace("“", '"').replace("”", '"').replace("'", '"')
        try:
            return json.loads(cleaned)
        except Exception:
            return default


Device set to use cpu


In [10]:
# ============ Task A: Sentiment Classification (Few-Shot) ============
few_shot = (
    "You are a sentiment classifier. Only answer with exactly one of: Positive, Negative, Mixed.\n\n"
    "Example 1\n"
    "Review: 'I love this blender. It’s powerful and easy to clean.'\n"
    "Label: Positive\n\n"
    "Example 2\n"
    "Review: 'The sound is muffled and the mic cuts out frequently.'\n"
    "Label: Negative\n\n"
    "Now classify the following review.\n"
    "Review: {text}\n"
    "Label:"
)

sentiment_results = []
for i, rv in enumerate(reviews, start=1):
    prompt = few_shot.format(text=rv)
    label = generate(prompt, max_new_tokens=8).splitlines()[0].strip()
    label = label.split()[0]
    sentiment_results.append({"review_id": i, "sentiment": label})
    print(f"Review {i} → {label}")

print("\nAll sentiments:", sentiment_results)


Review 1 → Positive
Review 2 → Negative
Review 3 → Negative
Review 4 → Negative
Review 5 → Positive

All sentiments: [{'review_id': 1, 'sentiment': 'Positive'}, {'review_id': 2, 'sentiment': 'Negative'}, {'review_id': 3, 'sentiment': 'Negative'}, {'review_id': 4, 'sentiment': 'Negative'}, {'review_id': 5, 'sentiment': 'Positive'}]


In [13]:
# ============ Task B: Structured Data Extraction (JSON) ============
extract_tpl = (
  "Extract fields from the review and return ONLY valid JSON with keys exactly:\n"
  "product_name, issue_summary, sentiment.\n"
  "If a value is missing, set it to 'N/A'. Sentiment must be one of: Positive, Negative, Mixed.\n\n"
  "Format example:\n"
  "{{\n"
  '  "product_name": "Example Product",\n'
  '  "issue_summary": "Short issue text or N/A",\n'
  '  "sentiment": "Positive"\n'
  "}}\n\n"
  "Review: {text}\n\n"
  "JSON:"
)


extracted = []
for i, rv in enumerate(reviews, start=1):
    prompt = extract_tpl.format(text=rv)
    raw = generate(prompt, max_new_tokens=160)
    data = grab_json_block(
        raw,
        default={"product_name": "N/A", "issue_summary": "N/A", "sentiment": "N/A"}
    )
    extracted.append({"review_id": i, **data})
    print(f"Review {i} → {json.dumps(data, ensure_ascii=False)}")

print("\nAll extracted rows:")
for row in extracted:
    print(row)


Review 1 → {"product_name": "N/A", "issue_summary": "N/A", "sentiment": "N/A"}
Review 2 → {"product_name": "N/A", "issue_summary": "N/A", "sentiment": "N/A"}
Review 3 → {"product_name": "N/A", "issue_summary": "N/A", "sentiment": "N/A"}
Review 4 → {"product_name": "N/A", "issue_summary": "N/A", "sentiment": "N/A"}
Review 5 → {"product_name": "N/A", "issue_summary": "N/A", "sentiment": "N/A"}

All extracted rows:
{'review_id': 1, 'product_name': 'N/A', 'issue_summary': 'N/A', 'sentiment': 'N/A'}
{'review_id': 2, 'product_name': 'N/A', 'issue_summary': 'N/A', 'sentiment': 'N/A'}
{'review_id': 3, 'product_name': 'N/A', 'issue_summary': 'N/A', 'sentiment': 'N/A'}
{'review_id': 4, 'product_name': 'N/A', 'issue_summary': 'N/A', 'sentiment': 'N/A'}
{'review_id': 5, 'product_name': 'N/A', 'issue_summary': 'N/A', 'sentiment': 'N/A'}


In [15]:
# ============ Task C: Root Cause Analysis (concise rationale) ============
rca_tpl = (
  "Analyze the customer review to identify the root cause.\n"
  "Return ONLY valid JSON with keys exactly: main_problem, brief_rationale.\n"
  "brief_rationale must be a single concise sentence.\n\n"
  "Format example:\n"
  "{{\n"
  '  "main_problem": "Battery won’t charge",\n'
  '  "brief_rationale": "The user states the left earbud stopped charging after one week."\n'
  "}}\n\n"
  "Review: {text}\n\n"
  "JSON:"
)


targets = [2, 3, 4, 5]
rca_rows = []
for idx in targets:
    rv = reviews[idx-1]
    prompt = rca_tpl.format(text=rv)
    raw = generate(prompt, max_new_tokens=160)
    data = grab_json_block(
        raw,
        default={"main_problem": "N/A", "brief_rationale": "N/A"}
    )
    rca_rows.append({"review_id": idx, **data})
    print(f"Review {idx} → {json.dumps(data, ensure_ascii=False)}")

print("\nAll RCA rows:")
for row in rca_rows:
    print(row)


Review 2 → {"main_problem": "N/A", "brief_rationale": "N/A"}
Review 3 → {"main_problem": "N/A", "brief_rationale": "N/A"}
Review 4 → {"main_problem": "N/A", "brief_rationale": "N/A"}
Review 5 → {"main_problem": "N/A", "brief_rationale": "N/A"}

All RCA rows:
{'review_id': 2, 'main_problem': 'N/A', 'brief_rationale': 'N/A'}
{'review_id': 3, 'main_problem': 'N/A', 'brief_rationale': 'N/A'}
{'review_id': 4, 'main_problem': 'N/A', 'brief_rationale': 'N/A'}
{'review_id': 5, 'main_problem': 'N/A', 'brief_rationale': 'N/A'}
