Embedding-based product search using an Amazon reviews dataset. The app is very limited due to the small dataset; a larger dataset would improve its performance and value. A large product dataset would be hard to find, since this type of data is usually proprietary.

In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [14]:
# Function to load the dataset
def load_dataset(file_path):
    """Read the CSV at ``file_path`` into a DataFrame, skipping malformed rows.

    Rows whose field count does not match the header are dropped and a
    warning is emitted for each one, rather than aborting the whole load.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of
            the CSV file to load.

    Returns:
        A ``pandas.DataFrame`` holding every well-formed row of the file.
    """
    # `error_bad_lines=False` is deprecated since pandas 1.3 and removed in
    # 2.0; `on_bad_lines='warn'` is its replacement and keeps the same
    # "skip the row but report it" behaviour.
    return pd.read_csv(file_path, on_bad_lines='warn')

# Function to preprocess data
def preprocess_data(df):
    """Blank out missing product text and add a combined text column.

    Missing values in 'Product Name' and 'Product Description' are replaced
    with empty strings, then both columns are joined with a single space
    into a new 'Combined Description' column.

    Args:
        df: DataFrame containing 'Product Name' and 'Product Description'.

    Returns:
        The same DataFrame object, modified in place.
    """
    text_columns = ('Product Name', 'Product Description')
    for column in text_columns:
        df[column] = df[column].fillna('')

    name_text, description_text = (df[column] for column in text_columns)
    df['Combined Description'] = name_text + " " + description_text
    return df

# Function to generate TF-IDF embeddings
def generate_embeddings(data):
    """Fit a TF-IDF vectorizer on the combined product text.

    Args:
        data: DataFrame with a 'Combined Description' column.

    Returns:
        A ``(embeddings, vectorizer)`` tuple: the TF-IDF matrix produced by
        ``fit_transform`` and the fitted ``TfidfVectorizer`` (vocabulary
        capped at 5000 features).
    """
    tfidf = TfidfVectorizer(max_features=5000)
    corpus = data['Combined Description']
    tfidf_matrix = tfidf.fit_transform(corpus)
    return tfidf_matrix, tfidf

# Function to train Nearest Neighbors model
def train_nn_model(embeddings):
    """Fit a 5-nearest-neighbours index on the given embeddings.

    Args:
        embeddings: Feature matrix to index, one row per item.

    Returns:
        The fitted ``NearestNeighbors`` estimator.
    """
    # fit() returns the estimator itself, so build and fit in one expression.
    return NearestNeighbors(n_neighbors=5, algorithm='auto').fit(embeddings)

# Search function
def search_products(query, model, vectorizer, data, top_k=5):
    """Return the products nearest to a free-text query.

    Args:
        query: Search text.
        model: Fitted neighbours model exposing
            ``kneighbors(X, n_neighbors=...)``.
        vectorizer: Fitted vectorizer exposing ``transform``; used to embed
            the query.
        data: DataFrame the model was fitted on; must contain
            'Product Name' and 'Product Description'.
        top_k: Maximum number of results to return.

    Returns:
        A DataFrame with the 'Product Name' and 'Product Description'
        columns of the matched rows, in the order the model returned them.
        It holds fewer than ``top_k`` rows when ``data`` is smaller than
        ``top_k``, and is empty when no results can be requested.
    """
    result_columns = ['Product Name', 'Product Description']

    # Asking for more neighbours than there are indexed rows makes
    # kneighbors raise, so cap the request. Assumes `data` has one row per
    # sample the model was fitted on.
    neighbor_count = min(top_k, len(data))
    if neighbor_count < 1:
        return data.iloc[0:0][result_columns]

    query_vector = vectorizer.transform([query])
    _, indices = model.kneighbors(query_vector, n_neighbors=neighbor_count)
    return data.iloc[indices[0]][result_columns]

# Main function to run the application
def main(file_path):
    """Run the demo: build the search index and print sample query results.

    Loads and preprocesses the CSV at ``file_path``, fits the TF-IDF
    vectorizer and nearest-neighbours model on it, then prints the matches
    for a fixed list of example queries.

    Args:
        file_path: Path to the product CSV file.
    """
    # Prepare the catalogue and fit both models on it.
    catalog = preprocess_data(load_dataset(file_path))
    tfidf_matrix, tfidf = generate_embeddings(catalog)
    neighbors = train_nn_model(tfidf_matrix)

    # Fixed example queries used to eyeball search quality.
    sample_queries = (
        "kitchen appliances",
        "gaming consoles",
        "running shoes",
        "gardening tools",
        "acrylic paint set",
    )

    for text in sample_queries:
        print(f"Search Results for '{text}':")
        matches = search_products(text, neighbors, tfidf, catalog)
        print(matches)
        print("\n")

In [15]:
# Location of the product CSV that main() loads.
# NOTE(review): the '/content/' prefix looks like a hosted-notebook (e.g. Colab)
# upload directory; adjust the path when running elsewhere.
file_path = '/content/marketing_sample_for_amazon_com-ecommerce__20200101_20200131__10k_data.csv'

In [16]:
# Build the search index from the CSV at file_path and print the sample query results.
main(file_path)



  return pd.read_csv(file_path, error_bad_lines=False)
Skipping line 1072: expected 28 fields, saw 30
Skipping line 2134: expected 28 fields, saw 32
Skipping line 3182: expected 28 fields, saw 36
Skipping line 3714: expected 28 fields, saw 39
Skipping line 4779: expected 28 fields, saw 38



Search Results for 'kitchen appliances':
                                           Product Name Product Description
2975  Step2 Modern Metro Kitchen | Modern Play Kitch...                    
7268  'Deluxe Modern Kitchen' Battery Operated Toy K...                    
2307  Plantoys Dollhouse Furniture Kitchen Set, 6 Pi...                    
157                                                Nuts                    
83                                           Herbaceous                    


Search Results for 'gaming consoles':
                                   Product Name Product Description
999                   Hasbro Gaming Cootie Game                    
4601      Dragon Shield Card Gaming Box, Silver                    
5467  Dragon Shield Gaming Box Card Game, Green                    
83                                   Herbaceous                    
1157                                Diamonsters                    


Search Results for 'running shoes':
                 