In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Load the product catalogue. The cells in this notebook read the 'id', 'name',
# 'main_category' and 'sub_category' columns from it.
train_data = pd.read_csv('train_data.csv')

# Normalise product names for vectorisation. Missing or non-string names are
# coerced to text first: TfidfVectorizer rejects NaN documents, so a single
# blank name would otherwise abort fit_transform.
train_data['processed_name'] = train_data['name'].fillna('').astype(str).str.lower()

# Fit TF-IDF on the normalised names, dropping English stop words.
# Row i of tfidf_matrix is the vector for the i-th row (by position) of train_data.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['processed_name'])
def search_similar_items(query, top_n=10):
    """Rank catalogue items by TF-IDF cosine similarity to a free-text query.

    Relies on the module-level ``tfidf_vectorizer``, ``tfidf_matrix`` and
    ``train_data`` objects.

    Parameters
    ----------
    query : str
        Search text; it is lower-cased before being vectorised.
    top_n : int, default 10
        Number of rows to keep from the top of the ranking.

    Returns
    -------
    pandas.DataFrame
        Columns 'id' (cast to str), 'name' and 'cosine_similarity', ordered by
        descending similarity and truncated to the first ``top_n`` rows.
    """
    normalized_query = query.lower()
    scores = cosine_similarity(
        tfidf_vectorizer.transform([normalized_query]),
        tfidf_matrix,
    ).flatten()

    ranking = pd.DataFrame(
        {
            'id': train_data['id'].astype(str),
            'name': train_data['name'],
            'cosine_similarity': scores,
        }
    ).sort_values(by='cosine_similarity', ascending=False)

    return ranking.head(top_n)
# Example: rank the catalogue against a one-word query and print the ten best matches.
query = "men"
top_results = search_similar_items(query=query, top_n=10)
print('Search query: ', query)
print(top_results)

Search query:  men
               id                                               name  \
98339   msh101739                     Men's Black Mesh Shoes for Men   
117280  mcl233363  WE PERFECT Men's Stylish Cotton Blend Printed ...   
131747  msh123676                 Men's Stylish Casual BOOTS for Men   
126283  mcl041207                       Reebok Men's' Men's T Shirts   
54470   mcl239574                          Reebok Men Men's T Shirts   
104072  spf224189                       Reebok Men's' Men's T Shirts   
84047   msh231599             Men's Black Synthetic Loafer for Men's   
11025   msh031497                 Men's Black Synthetic Men's Loafer   
103647  msh096402         Men's Formal Leather Lace Up Shoes for Men   
112061  msh113334               Men's Casual Slip On Loafers for Men   

        cosine_similarity  
98339            0.540748  
117280           0.529954  
131747           0.506225  
126283           0.498951  
54470            0.498951  
104072           0.4

In [24]:
def find_similar_items(item_id, tfidf_matrix, train_data, top_n=5):
    """Return the items whose TF-IDF vectors are closest to a given item's.

    Parameters
    ----------
    item_id
        Value to look up in ``train_data['id']``. If several rows share the
        id, the first matching row is used.
    tfidf_matrix
        Matrix with one row per row of ``train_data``, in the same positional
        order (sparse or dense).
    train_data : pandas.DataFrame
        Must contain the columns 'id', 'name', 'main_category' and
        'sub_category'.
    top_n : int, default 5
        Maximum number of neighbours to return; a non-positive value yields an
        empty frame.

    Returns
    -------
    pandas.DataFrame
        Columns 'name', 'main_category', 'sub_category' and 'similarity_score',
        ordered by descending similarity. The queried row itself is never
        included.

    Raises
    ------
    IndexError
        If no row of ``train_data`` has the requested id.
    """
    # Resolve the id to a row *position*: tfidf_matrix is addressed by position,
    # so using the DataFrame's index label would pick the wrong row (or fail)
    # whenever train_data does not carry a default 0..n-1 index.
    matches = (train_data['id'] == item_id).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"no item with id {item_id!r} in train_data")
    item_pos = int(matches[0])

    # Slice instead of indexing so the query stays 2-D for sparse and dense input alike.
    item_vector = tfidf_matrix[item_pos:item_pos + 1]

    # Calculate cosine similarity between the input item and every item.
    cosine_similarities = cosine_similarity(item_vector, tfidf_matrix).flatten()

    # Rank best-first, then drop the query row explicitly. Assuming it is the
    # single top-ranked entry is wrong when an exact duplicate ties with it or
    # when its vector is all zeros (self-similarity 0).
    ranked = cosine_similarities.argsort()[::-1]
    similar_indices = ranked[ranked != item_pos][:max(top_n, 0)]

    # Build the result without assigning into a chained slice of train_data.
    similar_items = train_data.iloc[similar_indices][['name', 'main_category', 'sub_category']]
    return similar_items.assign(similarity_score=cosine_similarities[similar_indices])
# Example: look up the nearest neighbours of one catalogue item (default top_n).
item_id = 'kfa066741'
similar_items = find_similar_items(
    item_id=item_id,
    tfidf_matrix=tfidf_matrix,
    train_data=train_data,
)
print(similar_items)

                                                     name     main_category  \
105464  Men's loafer socks pack of 2 loafer socks Men'...            stores   
110748  Boldfit Bamboo Socks For Men Women Ankle Socks...  women's clothing   
61880   Boldfit Bamboo Socks For Men Women Ankle Socks...            stores   
93262   Dazzlia cotton socks/Anti slip grip socks for ...     kids' fashion   
18077   SADMAX Flip Flops Socks || 3D Pattern Socks ||...  sports & fitness   

         sub_category  similarity_score  
105464  Men's Fashion          0.635105  
110748       Clothing          0.602892  
61880   Men's Fashion          0.602892  
93262    Baby Fashion          0.600579  
18077         Cycling          0.590728  


In [4]:
# Draw one id at random from the catalogue. The draw is seeded so that a
# fresh-kernel re-run of the notebook reproduces the same id; change
# RANDOM_SEED to get a different example.
RANDOM_SEED = 42
random_id = train_data['id'].sample(n=1, random_state=RANDOM_SEED).iloc[0]

print("Random item ID:", random_id)

Random item ID: app200847
