In [None]:
import sys
import os
# Add the project root to the path so we can import our modules.
# `__file__` is not defined when this code runs as a notebook cell, so fall
# back to the current working directory in that case.
# NOTE(review): the fallback assumes the kernel's working directory is the
# notebook's own folder (two levels below the project root) — confirm.
try:
    _BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _BASE_DIR = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(_BASE_DIR, '..', '..'))
# Only append once so re-running the cell does not grow sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
import pandas as pd
from types import SimpleNamespace
from tqdm import tqdm
import time
import pickle  # Import pickle to save the index

In [None]:
# Import classes and functions from the other files
from project_progress.part_1.data_preparation import ProcessedDocument
from project_progress.part_1.data_exploration import parse_numeric, normalize_product_details 
from project_progress.part_2.indexing import create_index_tfidf, search_tfidf
from project_progress.part_2.query_preparation import process_query

In [None]:
def load_and_process_data(data_path: str) -> list[ProcessedDocument]:
    """Load the JSON dataset and preprocess every product.

    Each row of the dataset is converted to a dict, its ``product_details``
    field is passed through ``normalize_product_details`` and its price/rating
    fields through ``parse_numeric``. The cleaned dict is wrapped in a
    ``SimpleNamespace``, handed to ``ProcessedDocument.from_document`` and
    finished with ``process_fields()``.

    Rows that raise anywhere in that pipeline are skipped (best effort). The
    number of skipped rows is printed, together with the first error that was
    encountered, so a systematic failure is diagnosable instead of hidden.

    Args:
        data_path: Path of the JSON file, read with ``pandas.read_json``.

    Returns:
        The successfully processed documents, in the order the rows were read.
    """
    # Fields that are run through parse_numeric before building the document.
    numeric_fields = ("selling_price", "actual_price", "discount", "average_rating")

    print(f"Loading data from: {data_path}")
    df = pd.read_json(data_path)
    print(f"Rows loaded: {len(df)}")

    processed_docs = []
    errors = 0
    first_error = None  # Description of the first failing row, if any.

    for row_label, row in tqdm(df.iterrows(), total=len(df), desc="Processing documents"):
        try:
            rdict = row.to_dict()

            # Clean up some numeric and nested fields
            rdict["product_details"] = normalize_product_details(rdict.get("product_details"))
            for field in numeric_fields:
                rdict[field] = parse_numeric(rdict.get(field))

            doc_obj = SimpleNamespace(**rdict)
            pdoc = ProcessedDocument.from_document(doc_obj)
            pdoc.process_fields()
            processed_docs.append(pdoc)
        except Exception as exc:
            # Best effort: skip the bad row, but keep the first failure so the
            # summary can show what went wrong.
            errors += 1
            if first_error is None:
                first_error = f"row {row_label}: {type(exc).__name__}: {exc}"

    print(f"Successfully processed: {len(processed_docs)} (Errors: {errors})")
    if first_error is not None:
        print(f"  First error ({first_error})")
    return processed_docs

In [None]:
def main(
    data_path: str = "data/fashion_products_dataset.json",
    index_filename: str = "project_progress/part_2/irwa_index.pkl",
    output_filename: str = "project_progress/part_2/search_results.txt",
    top_k: int = 5,
) -> None:
    """Build the TF-IDF index, save it to disk and run a set of test queries.

    Steps:
      1. Load and preprocess the dataset with ``load_and_process_data``.
      2. Build the index with ``create_index_tfidf`` and report the elapsed
         time (the reported time covers loading, preprocessing and indexing).
      3. Pickle the index structures to ``index_filename``.
      4. Run each test query through ``search_tfidf`` and write the first
         ``top_k`` results to ``output_filename``, echoing them to stdout.

    Args:
        data_path: JSON dataset to index.
        index_filename: Destination of the pickled index data.
        output_filename: Destination of the text report with the query results.
        top_k: Maximum number of results listed per query.
    """
    # Build the index from the dataset.
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() is wall-clock and can be too coarse for the
    # millisecond-scale search timings below.
    start_time = time.perf_counter()
    documents = load_and_process_data(data_path)

    print("\nStarting indexing...")
    index, tf, df, idf, title_index = create_index_tfidf(documents)
    elapsed_indexing = time.perf_counter() - start_time

    print("--- Indexing complete! ---")
    print(f"  Time taken: {elapsed_indexing:.2f} seconds")
    print(f"  Vocabulary size: {len(index)} terms")

    # Persist every structure returned by create_index_tfidf so the index can
    # be reloaded later without rebuilding it.
    index_data = {
        "index": index,
        "tf": tf,
        "df": df,
        "idf": idf,
        "title_index": title_index
    }
    with open(index_filename, "wb") as f:
        pickle.dump(index_data, f)
    print(f"--- Index saved to {index_filename} ---")

    # We chose queries that are a bit more specific and realistic.
    my_queries = [
        "ARBO cotton track pants for men",
        "Multicolor track pants combo ECKO",
        "Black solid women track pants",
        "Elastic waist cotton blend track pants",
        "Self design multicolor track pants"
    ]

    print("\n--- Running test queries ---")

    with open(output_filename, "w", encoding="utf-8") as f:

        def emit(line: str) -> None:
            """Write one line to the results file and echo it to stdout."""
            f.write(line + "\n")
            print(line)

        for query in my_queries:
            # Time only the search call itself.
            search_start = time.perf_counter()
            ranked_pids = search_tfidf(query, index, tf, idf)
            elapsed_search = time.perf_counter() - search_start

            emit(f"\n[Search] Query: '{query}'")

            # Also save how the query looks after preprocessing
            processed = process_query(query)
            emit(f"  Processed query: {processed}")

            emit(f"  Found {len(ranked_pids)} results in {elapsed_search:.4f} seconds")

            if not ranked_pids:
                emit("    No results found.")
                continue

            # List only the first top_k results, numbered from 1.
            for rank, pid in enumerate(ranked_pids[:top_k], start=1):
                title = title_index.get(pid, "Unknown Title")
                emit(f"    {rank}. (PID: {pid}) {title}")

    print(f"\nSearch results saved in {output_filename}")

In [None]:
# Run the full pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()