In [1]:
# Pin NumPy below 2.0 for this runtime: remove the installed build, then force
# a clean reinstall of a 1.x release without using pip's download cache.
!pip uninstall -y numpy
!pip install "numpy<2.0" --no-cache-dir --force-reinstall

[0mCollecting numpy<2.0
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m274.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.26.4


In [2]:
# Confirm which NumPy version the kernel actually imports after the reinstall.
import numpy as np
print(np.__version__)

1.26.4


In [3]:
# Check whether PyTorch can see a CUDA device in this runtime.
import torch
print("CUDA available:", torch.cuda.is_available())

CUDA available: True


In [8]:
# Install conda into the Colab runtime through condacolab.
# NOTE: condacolab.install() restarts the kernel when it finishes, so this cell
# has to run before any cell whose in-memory state must survive.
!pip install -q condacolab
import condacolab
condacolab.install()

⏬ Downloading https://github.com/jaimergp/miniforge/releases/download/24.11.2-1_colab/Miniforge3-colab-24.11.2-1_colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:09
🔁 Restarting kernel...


In [4]:
# Install the faiss-gpu package from the pytorch conda channel (-y: no prompt).
!conda install -c pytorch faiss-gpu -y

Channels:
 - pytorch
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / done
Solving environment: \ | / - \ done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - faiss-gpu


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _openmp_mutex-4.5          |       3_kmp_llvm           7 KB  conda-forge
    ca-certificates-2025.1.31  |       hbcca054_0         154 KB  conda-forge
    certifi-2025.1.31          |     pyhd8ed1ab_0         159 KB  conda-forge
    conda-24.11.3              |  py311h38be061_0         1.1 MB  conda-forge
    cuda-cudart-12.8.90        |       h5888daf_1          22 KB  conda-forge
    cuda-cudart_linux-64-12.8.90|       h3f2d84a_1         188 KB  conda-forge
    cuda-nvrtc-12.8.93  

In [5]:
import os
import pandas as pd
import numpy as np
import faiss
import re
import torch
import psycopg2
import io
from sentence_transformers import SentenceTransformer
from google.colab import drive
from google.colab import auth

In [6]:
import pickle

In [7]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [8]:
# Print this runtime's public IP address.
# NOTE(review): presumably needed to allow-list the runtime on the PostgreSQL
# host used later in the notebook — confirm.
!curl ifconfig.me

curl: /usr/local/lib/libcurl.so.4: no version information available (required by curl)
34.143.147.49

In [9]:
# 🔹 Directory for generated artifacts; it is created up front if missing.
# NOTE(review): the original comment described this as a Google Drive folder,
# but no drive.mount() call is visible in this notebook — confirm intent.
DATA_DIR = "/content/data"
os.makedirs(DATA_DIR, exist_ok=True)

# Input CSV, plus the embedding matrix and FAISS index written under DATA_DIR.
CSV_FILE = "/content/amazon.csv"
EMBEDDING_FILE = os.path.join(DATA_DIR, "embeddings.npy")
FAISS_INDEX_FILE = os.path.join(DATA_DIR, "faiss_index.bin")

# PostgreSQL connection settings (instance hosted on Google Cloud).
# Every value can be overridden through the standard libpq environment
# variable, so credentials no longer have to be edited into the notebook.
# The literals are only fallbacks that preserve the previous behaviour.
# NOTE(review): a real password should not live in source control — supply
# PGPASSWORD from a secret store and rotate the value committed here.
DB_CONFIG = {
    "dbname": os.environ.get("PGDATABASE", "amazon"),  # database name
    "user": os.environ.get("PGUSER", "postgres"),  # default superuser role
    "password": os.environ.get("PGPASSWORD", "user"),  # fallback only, see note
    "host": os.environ.get("PGHOST", "34.142.193.208"),  # public IP of the host
    "port": os.environ.get("PGPORT", "5432"),
}

def connect_db():
    """Open a PostgreSQL connection using the module-level ``DB_CONFIG``.

    Autocommit is switched on for the new connection, so each statement
    executed through it is committed immediately.

    Returns:
        The open psycopg2 connection, or ``None`` when connecting fails. The
        error is printed rather than raised, so callers must check for ``None``.
    """
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        conn.autocommit = True
        # Fixed the "conntected" typo in the success message.
        print("✅ Database connected successfully.")
        return conn
    except psycopg2.Error as e:
        print(f"❌ Lỗi kết nối PostgreSQL: {e}")
        return None

# 🔹 Create the tables in Cloud SQL
def create_tables(conn):
    """Drop and recreate the star-schema tables through ``conn``.

    Any existing ``fact_reviews``, ``dim_products`` and ``dim_categories``
    tables are dropped (with CASCADE) before being created again, so calling
    this discards whatever those tables held.

    Args:
        conn: database connection exposing ``cursor()`` as a context manager.
    """
    # The whole schema is sent as a single multi-statement script.
    schema_sql = """
        DROP TABLE IF EXISTS fact_reviews, dim_products, dim_categories CASCADE;

        CREATE TABLE dim_categories (
            category_id SERIAL PRIMARY KEY,
            main_category TEXT NOT NULL,
            sub_category TEXT NOT NULL,
            UNIQUE(main_category, sub_category)
        );

        CREATE TABLE dim_products (
            product_id SERIAL PRIMARY KEY,
            name TEXT NOT NULL,
            category_id INT REFERENCES dim_categories(category_id),
            image TEXT,
            link TEXT,
            UNIQUE(name, category_id)
        );

        CREATE TABLE fact_reviews (
            review_id SERIAL PRIMARY KEY,
            product_id INT REFERENCES dim_products(product_id),
            rating NUMERIC(3,2),
            rating_count INT,
            actual_price NUMERIC(12,2),
            discount_price NUMERIC(12,2)
        );
        """
    with conn.cursor() as cursor:
        cursor.execute(schema_sql)
        print("✅ Tables created successfully.")

def clean_data(df):
    """Clean and normalise the raw product rows read from the CSV.

    The input frame is copied first, so the caller's data is not modified.

    Reads the columns ``category``, ``product_name``, ``actual_price``,
    ``discounted_price``, ``rating_count``, ``rating``, ``img_link`` and
    ``product_link``. Adds ``main_category``, ``sub_category``, ``name``,
    ``discount_price``, ``no_of_ratings``, ``ratings``, ``image`` and ``link``,
    and overwrites ``actual_price`` with its numeric value.

    Rows whose prices, rating or rating count cannot be parsed are dropped.

    Args:
        df: pandas DataFrame holding the raw CSV columns listed above.

    Returns:
        A new DataFrame containing only the rows that parsed successfully.
    """
    import math  # local import: only the rating-count parser needs it

    df = df.copy()

    # "A|B|C" category paths: the first segment is the main category and
    # everything after the first separator is the sub category.
    category = df["category"].astype(str)
    df["main_category"] = category.str.split("|").str[0]
    df["sub_category"] = category.str.split("|", n=1).str[-1].str.strip()

    df["name"] = df["product_name"].astype(str).str.strip()

    def safe_float_convert(value):
        """Parse a price such as "₹1,099" into a float; None if unparseable."""
        if value is None or (isinstance(value, str) and value in ("", "nan")):
            return None
        try:
            # Strip the rupee sign and thousands separators before parsing.
            return float(re.sub(r"[₹,]", "", str(value)))
        except ValueError:
            return None

    df["actual_price"] = df["actual_price"].apply(safe_float_convert)
    df["discount_price"] = df["discounted_price"].apply(safe_float_convert)

    def clean_ratings(value):
        """Parse a rating count into a non-negative int; None if invalid.

        Accepts strings with thousands separators ("24,269") as well as values
        pandas already parsed as numbers. Numeric values used to be rejected,
        which silently dropped every row of a numerically typed column.
        """
        if isinstance(value, bool):
            return None
        if isinstance(value, str):
            digits = value.replace(",", "").strip()
            # isdecimal() only accepts characters that int() can parse.
            return int(digits) if digits.isdecimal() else None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        if math.isfinite(number) and number >= 0 and number.is_integer():
            return int(number)
        return None  # None (not 0) so that dropna removes the row

    df["no_of_ratings"] = df["rating_count"].apply(clean_ratings)

    def clean_ratings_value(value):
        """Parse a rating into a float; None if it is not numeric."""
        try:
            return float(value)
        except (ValueError, TypeError):
            return None  # None (not 0) so that dropna removes the row

    df["ratings"] = df["rating"].apply(clean_ratings_value)

    # 🔹 Drop rows with missing values in the essential numeric columns.
    # The explicit copy makes the result an independent frame, so the column
    # assignments below cannot trigger SettingWithCopyWarning.
    df = df.dropna(
        subset=["actual_price", "discount_price", "ratings", "no_of_ratings"]
    ).copy()

    df["image"] = df["img_link"].astype(str).str.strip()
    df["link"] = df["product_link"].astype(str).str.strip()

    return df


def refactor_data(df):
    """Assign local ``category_id`` and ``product_id`` values without PostgreSQL.

    Ids are dense integers starting at 1, handed out in order of first
    appearance:

    * ``category_id`` – one id per distinct (main_category, sub_category) pair.
    * ``product_id``  – one id per distinct product name, compared
      case-insensitively.

    The input frame is not modified; a copy with the two extra columns is
    returned. An empty frame is handled and yields an empty result.

    Args:
        df: DataFrame with string columns ``main_category``, ``sub_category``
            and ``name`` (as produced by ``clean_data``).

    Returns:
        A new DataFrame with ``category_id`` and ``product_id`` added.
    """

    def _first_seen_ids(keys):
        """Map each key to a 1-based id in order of first appearance."""
        ids = {}
        return [ids.setdefault(key, len(ids) + 1) for key in keys]

    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # 🔹 Simulated category mapping. Tuple keys avoid the ambiguity of joining
    # the two parts with "|", and a plain loop also works on an empty frame
    # (DataFrame.apply(axis=1) does not return a Series there).
    df["category_id"] = _first_seen_ids(zip(df["main_category"], df["sub_category"]))

    # 🔹 Simulated product mapping. Keys are lower-cased before ids are handed
    # out, so names differing only by case share one id and leave no gaps.
    df["product_id"] = _first_seen_ids(df["name"].str.lower())

    print("✅ Data refactored successfully.")
    return df

def import_data(conn, df):
    """Load the cleaned rows into the PostgreSQL star schema.

    Categories are inserted first, then products, then the review facts. The
    ids generated by the database are read back after each dimension insert
    and attached to the rows as ``category_id`` and ``product_id``.

    The input frame is not modified. Rows that cannot be matched to a category
    or product id are dropped before the next step.

    Args:
        conn: open database connection exposing ``cursor()`` as a context
            manager.
        df: cleaned DataFrame with the columns ``main_category``,
            ``sub_category``, ``name``, ``image``, ``link``, ``ratings``,
            ``no_of_ratings``, ``actual_price`` and ``discount_price``.

    Returns:
        A copy of ``df`` restricted to the imported rows, with ``category_id``
        and ``product_id`` added. (Previously nothing was returned.)
    """

    def _rows(frame):
        """Turn a frame into a list of plain tuples of Python scalars."""
        return list(frame.itertuples(index=False, name=None))

    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    with conn.cursor() as cur:
        # 🔹 Insert the distinct categories into dim_categories.
        cur.executemany("""
        INSERT INTO dim_categories (main_category, sub_category)
        VALUES (%s, %s) ON CONFLICT DO NOTHING;
        """, _rows(df[["main_category", "sub_category"]].drop_duplicates()))

        # 🔹 Read the generated category ids back from PostgreSQL.
        cur.execute(
            "SELECT category_id, main_category, sub_category FROM dim_categories;")
        category_map = {(main, sub): cid for cid, main, sub in cur.fetchall()}
        df["category_id"] = [
            category_map.get(key)
            for key in zip(df["main_category"], df["sub_category"])
        ]
        df = df.dropna(subset=["category_id"]).copy()
        # A missing lookup turns the column into floats; restore integer ids.
        df["category_id"] = df["category_id"].astype(int)

        # 🔹 Insert the distinct products into dim_products.
        cur.executemany("""
        INSERT INTO dim_products (name, category_id, image, link)
        VALUES (%s, %s, %s, %s) ON CONFLICT DO NOTHING;
        """, _rows(df[["name", "category_id", "image", "link"]].drop_duplicates()))

        # 🔹 Read the generated product ids back from PostgreSQL. Products are
        # unique per (name, category_id), so the lookup uses both parts; keying
        # on the name alone gave every same-named product in a different
        # category the same arbitrary id.
        cur.execute("SELECT product_id, LOWER(name), category_id FROM dim_products;")
        product_map = {(name, cid): pid for pid, name, cid in cur.fetchall()}
        df["product_id"] = [
            product_map.get(key)
            for key in zip(df["name"].str.lower(), df["category_id"])
        ]
        df = df.dropna(subset=["product_id"]).copy()
        df["product_id"] = df["product_id"].astype(int)

        # 🔹 Insert the review facts into fact_reviews.
        cur.executemany("""
        INSERT INTO fact_reviews (product_id, rating, rating_count, actual_price, discount_price)
        VALUES (%s, %s, %s, %s, %s);
        """, _rows(df[["product_id", "ratings", "no_of_ratings", "actual_price", "discount_price"]]))

        print("✅ Data imported successfully.")

    return df

def create_faiss_index(df, query="cables", k=10):
    """Embed the product names, build a FAISS L2 index and save both to disk.

    The lower-cased names are encoded with the ``all-MiniLM-L6-v2`` sentence
    transformer. The embeddings are written to ``EMBEDDING_FILE`` and the index
    to ``FAISS_INDEX_FILE``. The index is then read back and queried once as a
    smoke test, printing the nearest product names.

    Index positions correspond to row positions of ``df``, so the same frame,
    in the same order, must be used when looking results up later.

    Args:
        df: DataFrame with a string ``name`` column; must not be empty.
        query: text used for the smoke-test search.
        k: number of neighbours requested in the smoke-test search.

    Returns:
        The path the index was written to (``FAISS_INDEX_FILE``).

    Raises:
        ValueError: if ``df`` has no rows to index.
    """
    if len(df) == 0:
        raise ValueError("create_faiss_index needs at least one row to index")

    use_gpu = torch.cuda.is_available()
    device = "cuda" if use_gpu else "cpu"
    model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

    print("🔹 Đang tạo embeddings...")
    all_embeddings = model.encode(df["name"].str.lower().tolist(), device=device, show_progress_bar=True).astype('float32')
    np.save(EMBEDDING_FILE, all_embeddings)

    dimension = all_embeddings.shape[1]

    # Exact (brute-force) L2 index, built on the GPU when one is available.
    index = faiss.IndexFlatL2(dimension)
    res = None
    if use_gpu:
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)

    index.add(all_embeddings)
    # Only a GPU index needs converting back before it can be written; calling
    # index_gpu_to_cpu unconditionally broke the CPU-only path.
    if use_gpu:
        index = faiss.index_gpu_to_cpu(index)
    faiss.write_index(index, FAISS_INDEX_FILE)
    print(f"✅ FAISS index đã lưu tại {FAISS_INDEX_FILE}")

    # 🔹 Reload the index from disk to verify that the saved file is usable.
    index = faiss.read_index(FAISS_INDEX_FILE)
    search_index = faiss.index_cpu_to_gpu(res, 0, index) if use_gpu else index

    # 🔹 Smoke-test query. No nprobe is set: it is an IVF parameter and has no
    # effect on a flat index, which always searches exhaustively.
    query_embedding = model.encode([query], device=device).astype('float32')
    distances, indices = search_index.search(query_embedding, k)

    print(f"🔍 Kết quả tìm kiếm cho '{query}':")
    for rank, (idx, distance) in enumerate(zip(indices[0], distances[0]), start=1):
        # FAISS pads missing neighbours with -1, which must not be treated as
        # "last row" by positional indexing.
        if 0 <= idx < len(df):
            print(f"{rank}. {df['name'].iloc[idx]} (Khoảng cách: {distance:.4f})")

    return FAISS_INDEX_FILE

def main():
    """Run the pipeline: recreate tables, clean the CSV, build the FAISS index.

    Stops immediately when the database connection cannot be opened. The
    connection is closed even if a later step raises.
    """
    conn = connect_db()
    if not conn:
        return

    try:
        create_tables(conn)

        df = pd.read_csv(CSV_FILE, encoding="utf-8")
        # 🔹 Clean the data
        df_cleaned = clean_data(df)

        # 🔹 Loading into PostgreSQL is currently disabled: import_data(conn,
        # df_cleaned) is not called, so the freshly created tables stay empty.
        # Ids are assigned locally instead, mirroring what the import would do.
        df_final = refactor_data(df_cleaned)

        # 🔹 Build the FAISS index
        create_faiss_index(df_final)
    finally:
        # Release the connection on failure as well as on success.
        conn.close()

    print("✅ Hoàn tất quá trình xử lý! 🔹 Kết nối PostgreSQL đã đóng.")

if __name__ == "__main__":
    main()

✅ Database conntected successfully.
✅ Tables created successfully.
✅ Data refactored successfully.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔹 Đang tạo embeddings...


Batches:   0%|          | 0/46 [00:00<?, ?it/s]

✅ FAISS index đã lưu tại /content/data/faiss_index.bin
🔍 Kết quả tìm kiếm cho 'cables':
1. Amazon Basics High-Speed HDMI Cable, 6 Feet (2-Pack),Black (Khoảng cách: 0.8475)
2. CableCreation RCA to 3.5mm Male Audio Cable, 3.5mm to 2RCA Cable Male RCA Cable,Y Splitter Stereo Jack Cable for Home Theater,Subwoofer, Receiver, Speakers and More (3Feet/0.9Meter,Black) (Khoảng cách: 0.8761)
3. Amazon Basics High-Speed HDMI Cable, 6 Feet - Supports Ethernet, 3D, 4K video,Black (Khoảng cách: 0.9058)
4. AmazonBasics High-Speed Braided HDMI Cable - 3 Feet - Supports Ethernet, 3D, 4K and Audio Return (Black) (Khoảng cách: 0.9166)
5. Storite USB Extension Cable USB 3.0 Male to Female Extension Cable High Speed 5GBps Extension Cable Data Transfer for Keyboard, Mouse, Flash Drive, Hard Drive, Printer and More- 1.5M - Blue (Khoảng cách: 0.9333)
6. Amazon Basics 10.2 Gbps High-Speed 4K HDMI Cable with Braided Cord, 1.8 Meter, Dark Grey (Khoảng cách: 0.9350)
7. AmazonBasics USB 2.0 Cable - A-Male to B-Mal

In [10]:
# Install the packages used by the search UI cell below.
!pip install gradio matplotlib pillow requests

Collecting gradio
  Using cached gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pillow
  Downloading pillow-11.1.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.1 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Using cached aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting anyio<5.0,>=3.0 (from gradio)
  Downloading anyio-4.9.0-py3-none-any.whl.metadata (4.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Using cached fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Using cached ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Using cached gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Using cached groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloadin

In [14]:
import gradio as gr
import pandas as pd
import numpy as np
import faiss
import torch
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import io
from PIL import Image
import requests

# File paths (same locations the indexing cell wrote to)
DATA_DIR = "/content/data"
EMBEDDING_FILE = f"{DATA_DIR}/embeddings.npy"
FAISS_INDEX_FILE = f"{DATA_DIR}/faiss_index.bin"
CSV_FILE = "/content/amazon.csv"

# Load the data and rebuild the cleaned frame used to look results up.
# NOTE(review): index positions are resolved against df_final by row position,
# so this presumably requires the CSV to be unchanged since the index was
# built — confirm, or check index.ntotal against len(df_final).
df = pd.read_csv(CSV_FILE, encoding="utf-8")
df_cleaned = clean_data(df)  # defined in an earlier cell
df_final = refactor_data(df_cleaned)  # defined in an earlier cell

# Load the saved embeddings, the FAISS index and the embedding model
embeddings = np.load(EMBEDDING_FILE)
index = faiss.read_index(FAISS_INDEX_FILE)
model = SentenceTransformer('all-MiniLM-L6-v2')
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Move the FAISS index to the GPU when one is available
if torch.cuda.is_available():
    res = faiss.StandardGpuResources()
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
else:
    gpu_index = index

# Product search
def search_products(query, top_k=5):
    """Return the products whose names are closest to ``query``.

    The query is embedded with the module-level ``model`` and searched in the
    module-level ``gpu_index``. Hits are resolved by row position against
    ``df_final``.

    Args:
        query: free-text search string.
        top_k: maximum number of results; coerced to ``int`` because UI
            widgets may deliver a float.

    Returns:
        A list of dicts with the keys ``name``, ``actual_price``,
        ``discount_price``, ``rating``, ``rating_count``, ``image`` and
        ``distance``, ordered from nearest to farthest. The list is shorter
        than ``top_k`` when the index holds fewer entries, and empty when
        ``top_k`` is not positive.
    """
    top_k = int(top_k)
    if top_k <= 0:
        return []

    query_embedding = model.encode([query], device=device).astype('float32')
    # No nprobe is set: it is an IVF parameter and has no effect on the flat
    # index used here.
    distances, indices = gpu_index.search(query_embedding, top_k)

    results = []
    for idx, distance in zip(indices[0], distances[0]):
        # FAISS pads missing neighbours with -1; without the lower bound that
        # label would silently select the last row of the frame.
        if not 0 <= idx < len(df_final):
            continue
        product = df_final.iloc[idx]
        results.append({
            "name": product["name"],
            "actual_price": product["actual_price"],
            "discount_price": product["discount_price"],
            "rating": product["ratings"],
            "rating_count": product["no_of_ratings"],
            "image": product["image"],
            "distance": distance
        })
    return results

# Rating-distribution chart
def plot_rating_distribution(results):
    """Draw a histogram of the ratings in ``results`` and return it as an image.

    Args:
        results: list of dicts, each carrying a numeric ``rating`` entry.

    Returns:
        A PIL image holding the PNG rendering of the histogram.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    # Five bins over the 0-5 rating scale.
    ax.hist(
        [item["rating"] for item in results],
        bins=5,
        range=(0, 5),
        color='lightgreen',
        edgecolor='black',
    )
    ax.set_title("Phân bố đánh giá của sản phẩm tìm kiếm")
    ax.set_xlabel("Đánh giá (0-5)")
    ax.set_ylabel("Số lượng")

    # Render into memory and hand the bytes to PIL.
    png_buffer = io.BytesIO()
    fig.savefig(png_buffer, format="png")
    png_buffer.seek(0)
    plt.close(fig)  # close the figure so repeated searches do not leak memory
    return Image.open(png_buffer)

# Main handler for the UI
def gradio_interface(query, top_k):
    """Run a search and format the results for the Gradio components.

    Args:
        query: search text entered by the user.
        top_k: requested number of results; coerced to ``int`` because the
            slider may deliver a float.

    Returns:
        A 3-tuple ``(markdown_text, images, rating_plot)``: a Markdown summary
        of the hits, one PIL image per hit (a grey placeholder where the
        download or decoding failed) and the rating histogram image.
    """
    results = search_products(query, int(top_k))

    sections = []
    images = []
    default_img = Image.new('RGB', (100, 100), color='gray')  # fallback image

    for i, res in enumerate(results):
        sections.append(f"**{i+1}. {res['name']}**\n"
                        f"- Giá gốc: ₹{res['actual_price']:.2f}\n"
                        f"- Giá giảm: ₹{res['discount_price']:.2f}\n"
                        f"- Đánh giá: {res['rating']}/5 ({res['rating_count']} lượt)\n"
                        f"- Khoảng cách: {res['distance']:.4f}\n\n")
        try:
            # Download the whole body (no stream=True) so the connection is
            # released right away, then decode from memory. The previous code
            # handed the open socket to PIL and never closed the response.
            response = requests.get(res["image"], timeout=5)
            response.raise_for_status()  # treat HTTP errors as failures
            img = Image.open(io.BytesIO(response.content))
            img.load()  # decode now so corrupt files fall back to the placeholder
            images.append(img)
        except Exception:
            # Best effort: any download or decoding problem yields the placeholder.
            images.append(default_img)

    # Build the rating histogram for the returned hits
    rating_plot = plot_rating_distribution(results)

    return "".join(sections), images, rating_plot

# Build the Gradio interface
with gr.Blocks(title="Hệ thống tìm kiếm sản phẩm Amazon") as demo:
    gr.Markdown("# Tìm kiếm sản phẩm thông minh")
    gr.Markdown("Nhập từ khóa để tìm kiếm sản phẩm từ dữ liệu Amazon. Xem thông tin chi tiết, hình ảnh và phân bố đánh giá.")

    # Input row: query text, number of results, and the search button
    with gr.Row():
        query_input = gr.Textbox(label="Từ khóa tìm kiếm", placeholder="Ví dụ: cables")
        top_k_input = gr.Slider(1, 10, value=5, step=1, label="Số lượng kết quả")
        submit_btn = gr.Button("Tìm kiếm")

    # Result row: text summary (wider column) next to the image gallery
    with gr.Row():
        with gr.Column(scale=2):
            output_text = gr.Markdown(label="Kết quả tìm kiếm")
        with gr.Column(scale=1):
            output_images = gr.Gallery(label="Hình ảnh sản phẩm", show_label=True)

    # Chart row: rating histogram for the returned products
    with gr.Row():
        output_plot = gr.Image(label="Phân bố đánh giá")

    # Wire the search button to the handler; outputs map one-to-one onto the
    # three values returned by gradio_interface
    submit_btn.click(
        fn=gradio_interface,
        inputs=[query_input, top_k_input],
        outputs=[output_text, output_images, output_plot]
    )

# Launch the interface
demo.launch()

✅ Data refactored successfully.
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4888807416d18f2c47.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


