In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
train_data = pd.read_csv('train_data.csv')

# Normalise item names for vectorisation. Missing names become empty strings
# and remaining values are stringified before lowercasing: `.str.lower()`
# yields NaN for missing/non-string entries, and TfidfVectorizer rejects NaN
# documents during fitting.
train_data['processed_name'] = (
    train_data['name'].fillna('').astype(str).str.lower()
)

# Fit the TF-IDF vocabulary (English stop words removed) on the processed
# names. Row i of tfidf_matrix corresponds to the i-th row (by position) of
# train_data.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['processed_name'])

# Function to search for similar items based on query
def search_similar_items(query, top_n=10):
    """Rank items by TF-IDF cosine similarity to a free-text query.

    Relies on the module-level ``tfidf_vectorizer``, ``tfidf_matrix`` and
    ``train_data`` objects.

    Args:
        query: Search text; it is lowercased before being vectorised.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with columns 'id' (cast to str), 'name' and
        'cosine_similarity', ordered from highest to lowest score and
        carrying the index labels of ``train_data``.
    """
    # Project the lowercased query into the fitted TF-IDF space.
    encoded_query = tfidf_vectorizer.transform([query.lower()])

    # One similarity score per row of tfidf_matrix.
    scores = cosine_similarity(encoded_query, tfidf_matrix).flatten()

    # Assemble the result table column by column, then order best-first.
    columns = {}
    columns['id'] = train_data['id'].astype(str)
    columns['name'] = train_data['name']
    columns['cosine_similarity'] = scores
    ranked = pd.DataFrame(columns).sort_values(
        by='cosine_similarity', ascending=False
    )

    return ranked.head(top_n)

# Demo: fetch the ten best matches for a sample search term.
query = "samsung"  # swap in the user's search text
top_results = search_similar_items(query=query, top_n=10)
print(top_results)


               id                                               name  \
137828  tvc218154  D Earphones Headphones for Samsung Galaxy A02s...   
62004   tvc200588  Samsung tv Remote Compatible for Samsung LED/L...   
91163   tvc258170  Hybite® Samsung AC Remote Compatible with Sams...   
11439   tvc032115  D Type C to C Usb Cable for Samsung Galaxy A71...   
153611  tvc055088  ShopMagics In-Ear Headphones Earphones for Sam...   
40700   tvc263702  ShopMagics In-Ear Headphones Earphones for Sam...   
76014   tvc048011  LRIPL Samsung Universal Remote Control Compati...   
10      tvc065211  Samsung AC Remote Control Universal Compatible...   
100455  tvc052477  KITGOHUT 8 Pcs (4 Pair) for S6,S7 Mix Samsung ...   
99266   tvc174372  C&D Samsung Led Remote Compatible for Samsung ...   

        cosine_similarity  
137828           0.652116  
62004            0.616486  
91163            0.604065  
11439            0.580282  
153611           0.505895  
40700            0.505895  
76014      

In [24]:
def find_similar_items(item_id, tfidf_matrix, train_data, top_n=5):
    """Return the items whose TF-IDF vectors are closest to a given item.

    Args:
        item_id: Value to look up in ``train_data['id']``. If several rows
            share the ID, the first one is used.
        tfidf_matrix: Matrix with one row per row of ``train_data``, aligned
            by position (row i of the matrix belongs to ``train_data.iloc[i]``).
        train_data: DataFrame with at least the columns 'id', 'name',
            'main_category' and 'sub_category'.
        top_n: Maximum number of similar items to return; values <= 0 give
            an empty result.

    Returns:
        A new DataFrame with columns 'name', 'main_category', 'sub_category'
        and 'similarity_score', ordered from most to least similar. The
        queried item itself is never included. Ties keep their original row
        order.

    Raises:
        IndexError: If no row of ``train_data`` has the given ``item_id``.
    """
    # Locate the item by row *position*, not index label: tfidf_matrix is
    # addressed positionally, so labels would select the wrong row (or fail)
    # whenever train_data does not have a default 0..n-1 index.
    matching_positions = (train_data['id'] == item_id).to_numpy().nonzero()[0]
    if len(matching_positions) == 0:
        raise IndexError(f"item_id {item_id!r} not found in train_data['id']")
    item_position = int(matching_positions[0])

    # Slice instead of indexing so the query stays 2-D for both sparse and
    # dense matrices.
    item_vector = tfidf_matrix[item_position:item_position + 1]

    # Similarity of the input item against every row, itself included.
    cosine_similarities = cosine_similarity(item_vector, tfidf_matrix).flatten()

    # Rank best-first with a stable sort so equal scores are ordered
    # deterministically.
    ranked_positions = (-cosine_similarities).argsort(kind='stable')

    # Remove the input item explicitly. Assuming it is the single top hit is
    # wrong when other rows tie with it (duplicate names score 1.0) or when
    # its vector is all zeros (self-similarity 0).
    ranked_positions = ranked_positions[ranked_positions != item_position]
    similar_positions = ranked_positions[:max(top_n, 0)]

    # .assign returns a new frame, so no column is written onto a slice of
    # train_data.
    similar_items = train_data.iloc[similar_positions][
        ['name', 'main_category', 'sub_category']
    ].assign(similarity_score=cosine_similarities[similar_positions])

    return similar_items

# Demo: list the nearest neighbours of one catalogue item.
item_id = 'kfa066741'  # swap in the ID of the item to look up
similar_items = find_similar_items(
    item_id=item_id,
    tfidf_matrix=tfidf_matrix,
    train_data=train_data,
)
print(similar_items)

                                                     name     main_category  \
105464  Men's loafer socks pack of 2 loafer socks Men'...            stores   
110748  Boldfit Bamboo Socks For Men Women Ankle Socks...  women's clothing   
61880   Boldfit Bamboo Socks For Men Women Ankle Socks...            stores   
93262   Dazzlia cotton socks/Anti slip grip socks for ...     kids' fashion   
18077   SADMAX Flip Flops Socks || 3D Pattern Socks ||...  sports & fitness   

         sub_category  similarity_score  
105464  Men's Fashion          0.635105  
110748       Clothing          0.602892  
61880   Men's Fashion          0.602892  
93262    Baby Fashion          0.600579  
18077         Cycling          0.590728  
