In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset; the code below reads its 'id' and 'name' columns.
train_data = pd.read_csv('train_data.csv')

# Preprocess item names.
# Missing names are replaced with '' and every value is cast to str before
# lowercasing: TfidfVectorizer raises on NaN documents, and `.str.lower()`
# would otherwise turn non-string values into NaN. Rows are kept (not dropped)
# so that row i of tfidf_matrix still corresponds to row i of train_data.
train_data['processed_name'] = (
    train_data['name'].fillna('').astype(str).str.lower()
)

# Fit a TF-IDF vocabulary on the item names (English stop words removed) and
# vectorise every item; tfidf_matrix has one row per row of train_data.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['processed_name'])

# Example query, projected into the same TF-IDF space as the items
query = "socks"  # Replace with the user's query
query_vector = tfidf_vectorizer.transform([query])

# Cosine similarity between the query and every item, flattened to 1-D
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

# Create a DataFrame with names, IDs, and cosine similarity
results_df = pd.DataFrame({
    'id': train_data['id'].astype(str),  # Convert ID to string
    'name': train_data['name'],
    'cosine_similarity': cosine_similarities
})

# Sort the results by cosine similarity in descending order
results_df = results_df.sort_values(by='cosine_similarity', ascending=False)

# Display the top results (e.g., top 10)
print(results_df.head(10))

               id                                               name  \
71854   kfa066741  Cuteably™ Baby Girl Cotton Socks Fancy Socks G...   
105464  str084074  Men's loafer socks pack of 2 loafer socks Men'...   
110748  wcl003411  Boldfit Bamboo Socks For Men Women Ankle Socks...   
61880   str177397  Boldfit Bamboo Socks For Men Women Ankle Socks...   
18077   spf123418  SADMAX Flip Flops Socks || 3D Pattern Socks ||...   
101184  str005332  Boldfit Socks for Men & Women Unisex Stylish D...   
80379   wcl112428  Boldfit Socks for Men & Women Unisex Stylish D...   
25930   str265521  Supersox Socks For Men Premium Ankle Length Sp...   
124284  str029826  Supersox Men's Cotton Socks, Men's Running Soc...   
78219   spf135711  Azad Leather Winter Socks Zipper Soft Halaal L...   

        cosine_similarity  
71854            0.811135  
105464           0.775481  
110748           0.743270  
61880            0.743270  
18077            0.728273  
101184           0.710183  
80379      

In [24]:
def find_similar_items(item_id, tfidf_matrix, train_data, top_n=5):
    """Return the ``top_n`` items most similar to ``item_id`` by cosine similarity.

    Parameters
    ----------
    item_id
        Value looked up in ``train_data['id']``. If several rows share the id,
        the first matching row is used.
    tfidf_matrix
        2-D matrix with one row per row of ``train_data``, in the same order
        (row ``i`` of the matrix describes ``train_data.iloc[i]``).
    train_data : pandas.DataFrame
        Must contain the columns ``id``, ``name``, ``main_category`` and
        ``sub_category``.
    top_n : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``main_category``, ``sub_category`` and
        ``similarity_score``, ordered from most to least similar. The queried
        row itself is never included. Fewer than ``top_n`` rows are returned
        when the data set does not contain that many other items.

    Raises
    ------
    IndexError
        If no row of ``train_data`` has the given ``item_id``.
    ValueError
        If ``top_n`` is negative, or if ``tfidf_matrix`` and ``train_data``
        have a different number of rows.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if tfidf_matrix.shape[0] != len(train_data):
        raise ValueError(
            "tfidf_matrix and train_data must have the same number of rows "
            f"({tfidf_matrix.shape[0]} != {len(train_data)})"
        )

    # Locate the item by *position*, not by index label: the matrix is indexed
    # positionally, so an index label is only correct for a default RangeIndex.
    matches = np.flatnonzero((train_data['id'] == item_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no item with id {item_id!r} in train_data")
    item_pos = int(matches[0])

    # A one-row slice stays 2-D for both sparse and dense matrices.
    item_vector = tfidf_matrix[item_pos:item_pos + 1]

    # Similarity of the input item against every item (itself included).
    cosine_similarities = cosine_similarity(item_vector, tfidf_matrix).flatten()

    # Rank all rows from most to least similar, then drop the input row
    # explicitly. Assuming it is the single best match is wrong when other
    # items tie with it (e.g. duplicate names) or when its vector is all zeros.
    ranked = cosine_similarities.argsort()[::-1]
    similar_indices = ranked[ranked != item_pos][:top_n]

    # `.assign` builds a new frame, so no column is written onto a slice of
    # train_data (avoids SettingWithCopyWarning).
    similar_items = train_data.iloc[similar_indices][
        ['name', 'main_category', 'sub_category']
    ].assign(similarity_score=cosine_similarities[similar_indices])

    return similar_items

# Demo: fetch and print the nearest neighbours of a single catalogue item.
item_id = 'kfa066741'  # Swap in the ID of the item whose neighbours you want
similar_items = find_similar_items(
    item_id,
    tfidf_matrix=tfidf_matrix,
    train_data=train_data,
)
print(similar_items)

                                                     name     main_category  \
105464  Men's loafer socks pack of 2 loafer socks Men'...            stores   
110748  Boldfit Bamboo Socks For Men Women Ankle Socks...  women's clothing   
61880   Boldfit Bamboo Socks For Men Women Ankle Socks...            stores   
93262   Dazzlia cotton socks/Anti slip grip socks for ...     kids' fashion   
18077   SADMAX Flip Flops Socks || 3D Pattern Socks ||...  sports & fitness   

         sub_category  similarity_score  
105464  Men's Fashion          0.635105  
110748       Clothing          0.602892  
61880   Men's Fashion          0.602892  
93262    Baby Fashion          0.600579  
18077         Cycling          0.590728  
