In [1]:
!pip install transformers accelerate sentencepiece -q


In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint identifier shared by the tokenizer and the model.
model_name = "google/flan-t5-large"

# Load both pieces from the same checkpoint: tokenizer first, then the model.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSeq2SeqLM.from_pretrained(model_name),
)


  from .autonotebook import tqdm as notebook_tqdm
2025-12-04 14:03:41.560245: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def run_llm(prompt, max_new_tokens=300):
    """Generate a text completion for ``prompt`` with the notebook's seq2seq model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        prompt: Input text to encode and send to the model.
        max_new_tokens: Upper bound on the number of tokens to generate.
            Defaults to 300, the value that was previously hard-coded here.

    Returns:
        The first generated sequence, decoded to a string with special
        tokens removed.
    """
    # Keep the whole encoding (not just input_ids) so that the attention mask
    # produced by the tokenizer is forwarded to generate() as well.
    encoded = tokenizer(prompt, return_tensors="pt")
    generated = model.generate(**encoded, max_new_tokens=max_new_tokens)
    # The result is indexed as a batch; a single prompt yields one sequence at index 0.
    return tokenizer.decode(generated[0], skip_special_tokens=True)


## 1. Task Definition

**Task:** Extract structured data (Name, Price, Date) from messy text.

**Input Text:**

"I finally bought the new GalaxyBuds Ultra from TechWorld last Thursday. 
They had it on sale — normally $249, but I got it for just $199! Best decision ever."


In [4]:
# Sample input: two sentences separated by a single newline.
text = (
    "I finally bought the new GalaxyBuds Ultra from TechWorld last Thursday.\n"
    "They had it on sale — normally $249, but I got it for just $199! Best decision ever."
)

# Baseline: a bare instruction with no role, output format, or date guidance.
baseline_prompt = f"Extract the data from this text:\n\n{text}"

baseline_output = run_llm(baseline_prompt)
baseline_output


'The new GalaxyBuds Ultra was on sale at TechWorld on Thursday.'

### Baseline Output (Expected: low-quality)

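As expected, the baseline returns a free-text paraphrase rather than fields: there is no structure, the price is missing, and the date is left as the relative "Thursday".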


In [5]:
# Technique 1 - role prompting: a persona line followed by an explicit list
# of the fields to extract. The triple-quoted f-string starts and ends with a
# newline, so the prompt sent to the model does too.
prompt_role = f"""
You are a Senior Data Analyst.
Extract the product name, price paid, and purchase date from this text:

{text}
"""

# Run the prompt; the bare name on the last line displays the result as the cell output.
output_role = run_llm(prompt_role)
output_role


'GalaxyBuds Ultra, price paid, purchase date'

### Technique 1 Results — Role Prompting  
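The product name is now extracted, but the model echoes the labels "price paid" and "purchase date" instead of their values, and the output still has no structure.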


In [6]:
# Technique 2 - output-format constraint: the prompt embeds a JSON template
# with the three expected keys. The doubled braces {{ }} are f-string escapes
# and render as literal { } in the prompt text.
prompt_format = f"""
Extract the product name, price paid, and purchase date from the text below.
Return the answer in EXACTLY this JSON format:

{{
 "product_name": "",
 "price_paid": "",
 "purchase_date": ""
}}

Text:
{text}
"""

# Run the prompt; the bare name on the last line displays the result as the cell output.
output_format = run_llm(prompt_format)
output_format


' "product_name": "GalaxyBuds Ultra", "price_paid": "199", "purchase_date": "" '

### Technique 2 Results  
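The JSON keys appear and the name and price are filled in, but the enclosing braces are missing (so the output is not valid JSON) and `purchase_date` is empty: the relative date "last Thursday" is not converted.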


In [7]:
# Technique 3 - step-by-step instruction plus an explicit reference date for
# resolving "last Thursday", followed by the same JSON template as before.
# NOTE(review): the reference date is hard-coded as 2025-12-04 inside the
# prompt text - confirm whether it should be derived from the run date.
prompt_cot = f"""
Think step-by-step to determine the product, price, and exact date.
Use today's date as 2025-12-04 and convert "last Thursday" to YYYY-MM-DD.

Then output ONLY this JSON:
{{
 "product_name": "",
 "price_paid": "",
 "purchase_date": ""
}}

Text:
{text}
"""

# Run the prompt; the bare name on the last line displays the result as the cell output.
output_cot = run_llm(prompt_cot)
output_cot


' "product_name": "GalaxyBuds Ultra", "price_paid": "199", "purchase_date": "" '

### Technique 3 Results  
The output is identical to Technique 2: `purchase_date` is still empty, so the step-by-step instruction did not get this model to normalize the date. No reasoning text appears in the output.


In [8]:
# Combined prompt: persona line, bulleted field list, a date-normalization
# instruction with an explicit reference date, and a JSON-only output template.
# The doubled braces {{ }} are f-string escapes for literal { }.
final_prompt = f"""
You are a Senior Data Analyst.
Extract these fields from the text:
- product name
- price paid
- purchase date

Convert any relative dates into exact YYYY-MM-DD using today's date: 2025-12-04.

Return ONLY this JSON (no explanation):
{{
 "product_name": "",
 "price_paid": "",
 "purchase_date": ""
}}

Text:
{text}
"""

# Run the prompt; the bare name on the last line displays the result as the cell output.
final_output = run_llm(final_prompt)
final_output


'- product_name: "GalaxyBuds Ultra" - price paid: $199 - purchase date: "" '

# Final Evaluation

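Judged against the recorded output above, the final prompt produces:
- correct product name  
- correct price ($199)  
- no normalized date: `purchase_date` is still empty  
- no clean JSON: the model returns a dash-separated list instead of the requested object  
- no chain-of-thought  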

This completes the assignment.
