<a href="https://colab.research.google.com/github/Kristina-26/DEEP-LEARNING-TASK-2/blob/main/DeepLearningTask2_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Kristina Kazlauskaitė
# LSP: S2416112
# full-time studies
# task: implement a semantic search tool for the selected context, using a transformer-based natural language model as a feature
#       extractor and saving the representations to a vector database

In [None]:
# Install required packages
!pip install -q beautifulsoup4 requests transformers sentence-transformers faiss-cpu pandas numpy matplotlib seaborn parsel torch
!pip install datasets accelerate scikit-learn

In [None]:
import httpx
from parsel import Selector
import pandas as pd
import numpy as np
import random
import time
import json
from datetime import datetime
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoModel, AutoConfig
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import faiss
from transformers import BertModel, BertTokenizer

In [None]:
# Shared httpx client used for every request made by the scraper.
# It sends browser-like headers (the User-Agent string is the one of Edge 113 on Windows) so that
# requests resemble those of a real web browser.
# NOTE: per the httpx documentation, http2=True requires the optional `h2` dependency
# (pip install "httpx[http2]"), and "br" responses can only be decoded when a Brotli package is installed.
session = httpx.Client(
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br", # ask the server for compressed responses
    },
    http2=True, # enable HTTP/2 support on the client
    follow_redirects=True, # follow 3xx responses automatically
    timeout=30.0 # seconds; exceeding it raises httpx.TimeoutException, which scrape_ebay_search retries
)

# eBay keyword-search URLs, one per product category, so the dataset covers diverse listings
search_urls = [
    f"https://www.ebay.com/sch/i.html?_nkw={keyword}"
    for keyword in (
        "electronics",
        "laptops",
        "cameras",
        "home+garden",
        "sporting+goods",
        "clothing",
        "jewelry",
        "video+games",
        "books",
        "collectibles",
    )
]

In [None]:
#  extract product info from a listing page with parsel - return a list of product dictionaries
def extract_product_info_httpx(selector):
    """Pull product records out of a parsed search-results page.

    Args:
        selector: object exposing ``.css(query)``; ``selector.css('li.s-item')`` must return
            the listing nodes, and each node's ``.css(query).get(default)`` must return text.

    Returns:
        list[dict]: one dictionary per listing with the keys ``title``, ``price``,
        ``condition``, ``link``, ``image_url``, ``location``, ``description`` and
        ``timestamp`` (ISO-formatted extraction time). Missing fields are empty strings.
        A listing whose extraction raises is skipped.
    """
    def stamp():
        # wall-clock prefix used by every progress message
        return datetime.now().strftime('%H:%M:%S')

    # (output key, CSS query, whether surrounding whitespace is stripped)
    field_specs = (
        ('title', '.s-item__title span::text', True),
        ('price', '.s-item__price::text', True),
        ('condition', '.SECONDARY_INFO::text', True),
        ('link', '.s-item__link::attr(href)', False),
        ('image_url', '.s-item__image img::attr(src)', False),
        ('location', '.s-item__location::text', True),
        ('description', '.s-item__subtitle::text', True),
    )
    verbose_limit = 5  # only the first 5 listings are reported in detail

    print(f"[{stamp()}] Looking for product listings...")
    listings = selector.css('li.s-item')
    print(f"[{stamp()}] Found {len(listings)} listings")

    products = []
    for position, node in enumerate(listings, start=1):
        verbose = position <= verbose_limit
        try:
            if verbose:
                print(f"[{stamp()}] Processing product {position}")

            record = {}
            for key, query, strip_text in field_specs:
                value = node.css(query).get('')
                record[key] = value.strip() if strip_text else value

            if verbose:
                print(f"  Title: {record['title'][:50]}...")
                print(f"  Price: {record['price']}")

            record['timestamp'] = datetime.now().isoformat()
            products.append(record)
        except Exception as e:
            # a broken listing is skipped; only early failures are reported
            if verbose:
                print(f"Error extracting product {position}: {e}")

    print(f"[{stamp()}] Successfully extracted {len(products)} products")
    return products # a list of dictionaries

# scrape products from a search URL using httpx from pages 1 to max_pages
def scrape_ebay_search(base_url, max_pages=5):
    """Scrape result pages 1..max_pages of one search URL and collect their products.

    Args:
        base_url: search URL that already contains a query string; ``&_pgn=<page>`` is appended.
        max_pages: number of result pages to request.

    Returns:
        list[dict]: products from every page that could be fetched and parsed, in page order.
        A page that times out (after retries), returns an HTTP error, or fails for any other
        reason is reported and skipped.
    """
    def stamp():
        # wall-clock prefix used by the progress messages
        return datetime.now().strftime('%H:%M:%S')

    def fetch_with_retries(url, max_retries=3):
        # retry up to 3 times if it times out; the last timeout is re-raised to the caller
        attempt = 0
        while True:
            try:
                reply = session.get(url)
                reply.raise_for_status()
                return reply
            except httpx.TimeoutException:
                if attempt >= max_retries - 1:
                    raise
                wait_time = 5 * (attempt + 1)
                print(f"Timeout, retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            attempt += 1

    collected = []
    stars = '*' * 20

    for page in range(1, max_pages + 1):
        try:
            # pagination parameter
            page_url = f"{base_url}&_pgn={page}"

            print(f"\n{stars} PAGE {page} {stars}")
            print(f"Scraping page {page} of {max_pages}")
            print(f"URL: {page_url[:80]}...")

            print(f"[{stamp()}] Sending request...")
            response = fetch_with_retries(page_url)
            print(f"[{stamp()}] Response received. Status code: {response.status_code}")

            print(f"[{stamp()}] Parsing HTML...")
            selector = Selector(response.text)

            print(f"[{stamp()}] Extracting products...")
            page_products = extract_product_info_httpx(selector)
            collected.extend(page_products)

            print(f"[{stamp()}] Extracted {len(page_products)} products from page {page}")
            print(f"[{stamp()}] Total products so far: {len(collected)}")

            # random pause between page requests
            pause = random.uniform(3, 7)
            print(f"[{stamp()}] Waiting {pause:.1f} seconds before next request...")
            time.sleep(pause)

        except httpx.TimeoutException:
            # give up on this page and move on to the next one
            print(f"[{stamp()}] Timeout error on page {page}")
        except httpx.HTTPStatusError as e:
            if e.response.status_code != 429:
                print(f"[{stamp()}] HTTP error on page {page}: {e}")
            else:
                print(f"[{stamp()}] Rate limited. Waiting 30 seconds...")
                time.sleep(30)
        except Exception as e:
            print(f"[{stamp()}] ERROR on page {page}: {e}")
            import traceback
            traceback.print_exc()

    return collected

In [None]:
# collect at least 5000 entries
all_products = []
target_products = 5000

# loop through each category
for idx, search_url in enumerate(search_urls):
    print(f"\n{'='*50}")
    print(f"Scraping search query {idx+1}/{len(search_urls)}")
    print(f"URL: {search_url}")
    print(f"{'='*50}")

    # scrape products from this category
    products = scrape_ebay_search(search_url, max_pages=10)
    all_products.extend(products)

    print(f"Total products collected so far: {len(all_products)}")

    # stop after reaching the target
    if len(all_products) >= target_products:
        print(f"\nReached target of {target_products} products!")
        break

    # save intermediate results in a CSV file (each file holds everything collected so far)
    df_intermediate = pd.DataFrame(all_products)
    df_intermediate.to_csv(f'ebay_products_intermediate_{idx+1}.csv', index=False)

    # longer random delay between different searches; skipped after the last query,
    # where there is no following request to space out
    if idx < len(search_urls) - 1:
        wait_time = random.uniform(10, 15)
        print(f"Waiting {wait_time:.1f} seconds before next search query...")
        time.sleep(wait_time)

# final DataFrame
df = pd.DataFrame(all_products)
print(f"\nFinal dataset shape: {df.shape}")
print(f"Total products collected: {len(df)}")

In [None]:
# remove duplicates: rows are considered identical when title, price and link all match
# (the result is rebound to `df`, so re-running this cell works on the already de-duplicated frame)
df = df.drop_duplicates(subset=['title', 'price', 'link'])
print(f"After removing duplicates: {len(df)} products")

# create a combined text field for better search: "<title> <description> <condition>";
# this is the text that is embedded and indexed further below
df['combined_text'] = df.apply(lambda row: f"{row['title']} {row['description']} {row['condition']}", axis=1)

# first few entries
df.head()

In [None]:
# persist the cleaned dataset as CSV (without the DataFrame index)
df.to_csv('ebay_products_dataset.csv', index=False)
print("Dataset saved to 'ebay_products_dataset.csv'")

# additionally write only the combined text, one product per line, for semantic search
with open('ebay_products_text.txt', 'w', encoding='utf-8') as text_file:
    text_file.writelines(entry + '\n' for entry in df['combined_text'])

In [None]:
# load a pre-trained sentence transformer model for embedding;
# `model` is reused below to embed the products and, inside the search system, the queries
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Sentence transformer model loaded successfully")

# test the model: encode one sentence and print the shape of the resulting vector
test_embedding = model.encode("test sentence")
print(f"Embedding dimension: {test_embedding.shape}")

In [None]:
# create embeddings for all products: one vector per row of df['combined_text'], in row order
# (the search code later maps index positions back to rows with df.iloc, so this order matters)
print("Creating embeddings for all products...")
embeddings = model.encode(df['combined_text'].tolist(), show_progress_bar=True)
print(f"Embeddings shape: {embeddings.shape}")

# save embeddings (NumPy array) to reload later
np.save('ebay_product_embeddings.npy', embeddings)
print("Embeddings saved to 'ebay_product_embeddings.npy'")

In [None]:
# create FAISS index that uses L2 distance (Euclidean) to measure similarity between vectors
dimension = embeddings.shape[1]  # vector size = number of columns of the embedding matrix
print(dimension) # for all-MiniLM-L6-v2 this is 384
index = faiss.IndexFlatL2(dimension) # embeddings are normalized, so L2 distance can still reflect semantic similarity

# add item embeddings to the index (FAISS expects float32); vector i corresponds to row i of df
index.add(embeddings.astype('float32'))

# save the index to a binary file for reloading later
faiss.write_index(index, "ebay_product_index.bin")
print(f"FAISS index created with {index.ntotal} vectors")

# RAG

In [None]:
# search system that uses RAG for query enhancement and result refinement
class RAGEnhancedSearchSystem:
    """Vector search over the product index, wrapped with a generative model.

    The generative model is used three times per query: to propose extra search terms,
    to rerank the retrieved products, and to write a short recommendation based on them.

    Attributes:
        embedding_model: object whose ``encode(list_of_texts)`` returns the query vectors.
        index: vector index whose ``search(vectors, k)`` returns ``(distances, indices)``.
        df: product table; row ``i`` (positional) must correspond to vector ``i`` of ``index``.
        generator_tokenizer / generator_model: tokenizer and seq2seq model used for generation.
        device: torch device the generator runs on.
    """

    def __init__(self, embedding_model, index, df, generator_model_name="google/flan-t5-large"):
        """Store the retrieval components and load the generative model.

        Args:
            embedding_model: model used to embed queries.
            index: vector index holding the product embeddings.
            df: DataFrame with the columns ``title``, ``price``, ``condition``,
                ``description`` and ``link``.
            generator_model_name: name of the T5 checkpoint to load.
        """
        self.embedding_model = embedding_model
        self.index = index
        self.df = df

        # initialize generative model for RAG-based search enhancement
        print(f"Loading generative model for RAG search: {generator_model_name}")
        self.generator_tokenizer = T5Tokenizer.from_pretrained(generator_model_name)
        self.generator_model = T5ForConditionalGeneration.from_pretrained(generator_model_name)

        # move to GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.generator_model = self.generator_model.to(self.device)
        print(f"Model loaded on {self.device}")

    def _generate_text(self, prompt, max_input_length, **generate_kwargs):
        """Tokenize ``prompt``, run the generator and return the decoded first sequence.

        The prompt is truncated to ``max_input_length`` tokens but not padded: a single
        sequence needs no padding, and the attention mask is passed to ``generate`` so the
        model never attends to filler tokens.
        """
        inputs = self.generator_tokenizer(
            prompt,
            max_length=max_input_length,
            truncation=True,
            return_tensors="pt"
        ).to(self.device)

        outputs = self.generator_model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            **generate_kwargs
        )
        return self.generator_tokenizer.decode(outputs[0], skip_special_tokens=True)

    def _vector_search(self, query_text, k):
        """Embed ``query_text`` and return up to ``k`` nearest products as result dicts.

        ``rank`` is the 1-based position in the raw index output; ``similarity_score`` maps
        the returned distance d to 1 / (1 + d). Entries with index -1 are skipped.
        """
        query_embedding = self.embedding_model.encode([query_text])
        distances, indices = self.index.search(query_embedding.astype('float32'), k)

        results = []
        for i, idx in enumerate(indices[0]):
            if idx == -1:
                continue
            product = self.df.iloc[idx]
            results.append({
                'rank': i + 1,
                'title': product['title'],
                'price': product['price'],
                'condition': product['condition'],
                'description': product['description'],
                'link': product['link'],
                'similarity_score': 1 / (1 + distances[0][i])
            })
        return results

    # use RAG to expand the search query for better retrieval
    def rag_query_expansion(self, original_query):
        """Ask the generator for related search terms and append up to three of them.

        Returns:
            tuple[str, str]: ``(combined_query, expansion_text)``. When the generator returns
            nothing, or just echoes the query, ``combined_query`` is the original query and
            ``expansion_text`` is empty.
        """
        expansion_terms = []

        prompt = f"""Generate 3-5 related search terms for this eBay search query.
Only output the new terms, separated by commas. Do not repeat the original query.

Query: {original_query}

Related terms:"""

        enhanced_terms = self._generate_text(
            prompt,
            128,
            max_length=50,
            num_beams=4,
            early_stopping=True
        )
        if enhanced_terms and enhanced_terms.lower() != original_query.lower():
            expansion_terms = enhanced_terms.split(',')

        # clean and format expansion terms
        cleaned_terms = [term.strip() for term in expansion_terms if term.strip()]
        expansion_text = ' '.join(cleaned_terms[:3])  # use top 3 terms

        # combine original query with expansion terms
        if expansion_text:
            combined_query = f"{original_query} {expansion_text}"
        else:
            combined_query = original_query

        return combined_query, expansion_text

    # use RAG to rerank search results based on relevance
    def rag_result_reranking(self, original_query, search_results):
        """Reorder ``search_results`` according to a ranking produced by the generator.

        The generator is expected to answer with comma-separated 1-based result numbers.
        Out-of-range and repeated numbers are ignored; results the generator did not mention
        are appended in their original order, so every input result appears exactly once.

        Returns:
            list[dict]: copies of the results with an added ``rag_rank`` key, or the original
            list unchanged (without ``rag_rank``) when the answer cannot be parsed.
        """
        # prepare context with search results
        context = "Search results to analyze:\n\n"
        for i, result in enumerate(search_results, 1):
            context += f"Result {i}:\n"
            context += f"Title: {result['title']}\n"
            context += f"Price: {result['price']}\n"
            if 'condition' in result:
                context += f"Condition: {result['condition']}\n"
            if 'description' in result and result['description']:
                context += f"Description: {result['description'][:100]}...\n"
            context += "\n"

        prompt = f"""You are a search result analyzer. Rerank these eBay search results based on relevance to the query.

Query: {original_query}

{context}

Instructions:
1. Analyze each result's relevance to the query
2. Consider product specifics, price range, and condition
3. Output the ranking as numbers 1,2,3... in order of relevance

Result ranking (most relevant first):"""

        ranking_text = self._generate_text(
            prompt,
            512,
            max_length=50,
            num_beams=4,
            early_stopping=True
        )

        # parse ranking; only a malformed answer triggers the fallback
        try:
            rankings = [int(x) for x in ranking_text.split(',')]
        except ValueError:
            # fallback to original order if parsing fails
            print("Failed to parse RAG ranking, using original order")
            return search_results

        # positions are tracked by index (not by title), so products sharing a title are
        # all kept and a number repeated by the generator is used only once
        ordered_positions = []
        seen_positions = set()
        for rank in rankings:
            position = rank - 1
            if 0 <= position < len(search_results) and position not in seen_positions:
                seen_positions.add(position)
                ordered_positions.append(position)

        # add any missing results
        ordered_positions.extend(
            position for position in range(len(search_results))
            if position not in seen_positions
        )

        reranked_results = []
        for new_rank, position in enumerate(ordered_positions, 1):
            result = search_results[position].copy()
            result['rag_rank'] = new_rank
            reranked_results.append(result)

        return reranked_results

    # perform search using RAG for query enhancement and result reranking
    def rag_enhanced_search(self, query, top_k=5):
        """Expand the query, retrieve ``top_k * 2`` candidates, rerank and keep ``top_k``."""
        print(f"\nRAG-ENHANCED SEARCH PROCESS:")
        print(f"Original query: '{query}'")

        # 1. RAG Query Expansion
        expanded_query, enhancement_terms = self.rag_query_expansion(query)
        print(f"Enhanced query: '{expanded_query}'")
        print(f"Enhancement terms: '{enhancement_terms}'")

        # 2.-3. search with expanded query (extra candidates give the reranker room to reorder)
        initial_results = self._vector_search(expanded_query, top_k * 2)

        # 4. RAG-based reranking
        reranked_results = self.rag_result_reranking(query, initial_results)

        # return top_k results
        return reranked_results[:top_k]

    # complete RAG pipeline: enhanced search + context-aware generation
    def full_rag_pipeline(self, query, top_k=5):
        """Run the enhanced search and generate a recommendation from its results.

        Returns:
            dict: keys ``query``, ``search_results`` and ``generated_response``.
        """
        # 1. RAG-enhanced search
        search_results = self.rag_enhanced_search(query, top_k=top_k)

        # 2. generate response using retrieved context
        generated_response = self.generate_response(query, search_results)

        return {
            'query': query,
            'search_results': search_results,
            'generated_response': generated_response
        }

    # generate response using retrieved context
    def generate_response(self, query, search_results):
        """Generate a helpful, concrete response using retrieved search results"""

        # create context from search results
        context = "Top search results for your query:\n\n"
        for i, result in enumerate(search_results, 1):
            context += f"Product {i} (RAG Rank: {result.get('rag_rank', i)}):\n"
            context += f"- Title: {result['title']}\n"
            context += f"- Price: {result['price']}\n"
            if 'condition' in result:
                context += f"- Condition: {result['condition']}\n"
            if 'description' in result and result['description']:
                context += f"- Description: {result['description'][:150]}...\n"
            context += f"- Match quality: {result['similarity_score']:.2f}\n\n"

        # prompt with instructions
        prompt = f"""Recommend the best product for: {query}

Products available:
{context}

Write a brief recommendation. Begin with "Based on your search for {query}," then mention the best product by name and price. Explain why it's good. Compare to one alternative if relevant.
"""

        # Decoding is deterministic beam search. No `temperature` is passed: it only takes
        # effect together with do_sample=True and would otherwise be silently ignored.
        response = self._generate_text(
            prompt,
            768,                  # to give model more context
            max_length=400,
            min_length=100,
            length_penalty=1.5,   # encourage slightly longer responses
            num_beams=5,          # beam search paths
            early_stopping=True,
            no_repeat_ngram_size=3  # prevent repetitive text
        )

        return response

    # demonstrate the RAG-enhanced search process step by step
    def demonstrate_rag_search_process(self, query):
        """Run the pipeline for ``query`` printing every stage; return the reranked results.

        Uses the 5 nearest products as candidates and passes them to the reranker with the
        same fields as ``rag_enhanced_search``.
        """
        print("\n" + "="*60)
        print(f"DEMONSTRATING RAG-ENHANCED SEARCH PROCESS")
        print(f"Query: '{query}'")
        print("="*60)

        # 1. query enhancement
        expanded_query, enhancement_terms = self.rag_query_expansion(query)
        print("\n1. RAG QUERY ENHANCEMENT:")
        print(f"   Original: '{query}'")
        print(f"   Enhanced: '{expanded_query}'")
        print(f"   Added terms: '{enhancement_terms}'")

        # 2. initial search
        print("\n2. VECTOR SEARCH WITH ENHANCED QUERY:")
        initial_results = self._vector_search(expanded_query, 5)
        for result in initial_results:
            print(f"   {result['rank']}. {result['title'][:50]}...")
            print(f"      Price: {result['price']} | Similarity: {result['similarity_score']:.3f}")

        # 3.  RAG reranking
        print("\n3. RAG-BASED RERANKING:")
        reranked_results = self.rag_result_reranking(query, initial_results)
        for i, result in enumerate(reranked_results, 1):
            print(f"   {i}. (Original rank: {result['rank']}, New RAG rank: {result.get('rag_rank', i)})")
            print(f"      {result['title'][:50]}...")
            print(f"      Price: {result['price']}")

        # 4. generate final response
        print("\n4. GENERATE FINAL RESPONSE:")
        response = self.generate_response(query, reranked_results)
        print("\nAI Response:")
        print("-" * 50)
        print(response)
        print("-" * 50)

        return reranked_results

# create the RAG-enhanced search system from the embedding model, FAISS index and product table
# built in the cells above (this loads the flan-t5-large generator inside __init__)
rag_search_system = RAGEnhancedSearchSystem(model, index, df, generator_model_name="google/flan-t5-large")

In [None]:
# test the system with demonstration: each query is run through the step-by-step demo,
# which prints the expansion, the retrieved products, the reranking and the generated answer
test_queries = [
    "I want to buy a gaming laptop",
    "Can you help me find the best basketball shoes?",
    "Help me pick golden earrings"
]

print("\nTESTING RAG-ENHANCED SEARCH SYSTEM")
print("="*60)

for query in test_queries:
    # `results` holds the reranked results of the most recent query only
    results = rag_search_system.demonstrate_rag_search_process(query)
    print("\n" + "="*60)

In [None]:
# interactive RAG-enhanced search
def interactive_rag_search_demo():
    """Prompt for search queries in a loop and run the step-by-step RAG demo on each one.

    Blank input is ignored; entering 'quit', 'exit' or 'q' (any letter case) ends the loop.
    """
    exit_words = ('quit', 'exit', 'q')

    print("\nINTERACTIVE RAG-ENHANCED SEARCH DEMO")
    print("Type 'quit' to exit")
    print("="*50)

    while True:
        query = input("\nEnter your search query: ").strip()

        if query.lower() in exit_words:
            return

        if not query:
            # nothing typed: ask again
            continue

        rag_search_system.demonstrate_rag_search_process(query)

In [None]:
# run interactive demo (blocks on input() until 'quit', 'exit' or 'q' is entered)
interactive_rag_search_demo()