In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [17]:
# Load the cleaned product table (path is relative to the notebook's working directory).
df = pd.read_csv('../datasets/cleaned_data.csv')

In [18]:
# Per-column count of missing values.
df.isna().sum()

product_name                0
product_category_tree       0
description                 1
brand                    3770
dtype: int64

In [19]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,product_name,product_category_tree,description,brand
0,alisha solid womens cycle short,clothe womens clothe lingerie sleep swimwear s...,key feature alisha solid womens cycle short co...,alisha
1,fabhomedecor fabric double sofa bed,furniture live room furniture sofa bed futons ...,fabhomedecor fabric double sofa bed finish col...,fabhomedecor
2,aw belly,footwear womens footwear ballerinas aw belly,key feature aw belly sandals wedge heel casual...,aw
3,sicons purpose arnica dog shampoo,pet supply groom skin coat care shampoo sicons...,specifications sicons purpose arnica dog shamp...,sicons
4,eternal gandhi super series crystal paper weig...,eternal gandhi super series crystal paper weight,key feature eternal gandhi super series crysta...,eternal gandhi


In [20]:
# (number of rows, number of columns)
df.shape

(12676, 4)

In [21]:
# Missing brands become empty strings so string concatenation on this column never sees NaN.
df['brand'] = df['brand'].fillna("")

In [22]:
# Missing descriptions become empty strings so string concatenation on this column never sees NaN.
df['description'] = df['description'].fillna("")

In [23]:
# Build one combined text per product: name, category tree, description and brand,
# joined by single spaces.
df['meta_data'] = df['product_name'].str.cat(
    [df['product_category_tree'], df['description'], df['brand']],
    sep=" ",
)

In [24]:
# Inspect the combined text of the row labelled 0.
df.loc[0, 'meta_data']

'alisha solid womens cycle short clothe womens clothe lingerie sleep swimwear short alisha short alisha solid womens cycle short key feature alisha solid womens cycle short cotton lycra navy red navyspecifications alisha solid womens cycle short short detail number content sales package pack 3 fabric cotton lycra type cycle short general detail pattern solid ideal womens fabric care gentle machine wash lukewarm water bleach additional detail style code altht3p21 box 3 short alisha'

In [25]:
# Preview again to confirm the new meta_data column is present.
df.head()

Unnamed: 0,product_name,product_category_tree,description,brand,meta_data
0,alisha solid womens cycle short,clothe womens clothe lingerie sleep swimwear s...,key feature alisha solid womens cycle short co...,alisha,alisha solid womens cycle short clothe womens ...
1,fabhomedecor fabric double sofa bed,furniture live room furniture sofa bed futons ...,fabhomedecor fabric double sofa bed finish col...,fabhomedecor,fabhomedecor fabric double sofa bed furniture ...
2,aw belly,footwear womens footwear ballerinas aw belly,key feature aw belly sandals wedge heel casual...,aw,aw belly footwear womens footwear ballerinas a...
3,sicons purpose arnica dog shampoo,pet supply groom skin coat care shampoo sicons...,specifications sicons purpose arnica dog shamp...,sicons,sicons purpose arnica dog shampoo pet supply g...
4,eternal gandhi super series crystal paper weig...,eternal gandhi super series crystal paper weight,key feature eternal gandhi super series crysta...,eternal gandhi,eternal gandhi super series crystal paper weig...


In [26]:
# Load the main product table (path is relative to the notebook's working directory).
# NOTE(review): df1 is only previewed in the cells visible here — confirm it is still needed.
df1 = pd.read_csv('../datasets/main_data.csv')

In [27]:
# Preview the first rows of the main product table.
df1.head()

Unnamed: 0,product_url,product_name,discounted_price,description,brand,product_specifications,product_name_val,product_category,img,meta_data
0,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,379.0,Key Features of Alisha Solid Women's Cycling S...,Alisha,product_specification=>key=>Number of Contents...,alisha solid womens cycle short,Clothing,http://img5a.flixcart.com/image/short/u/4/a/al...,alisha solid womens cycle short clothe womens ...
1,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,22646.0,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,FabHomeDecor,product_specification=>key=>Installation & Dem...,fabhomedecor fabric double sofa bed,Furniture,http://img6a.flixcart.com/image/sofa-bed/j/f/y...,fabhomedecor fabric double sofa bed furniture ...
2,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,499.0,Key Features of AW Bellies Sandals Wedges Heel...,AW,"product_specification=>key=>Ideal For, value=>...",aw belly,Footwear,http://img5a.flixcart.com/image/shoe/7/z/z/red...,aw belly footwear womens footwear ballerinas a...
3,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,210.0,Specifications of Sicons All Purpose Arnica Do...,Sicons,"product_specification=>key=>Pet Type, value=>D...",sicons purpose arnica dog shampoo,Pet Supplies,http://img5a.flixcart.com/image/pet-shampoo/r/...,sicons purpose arnica dog shampoo pet supply g...
4,http://www.flipkart.com/eternal-gandhi-super-s...,Eternal Gandhi Super Series Crystal Paper Weig...,430.0,Key Features of Eternal Gandhi Super Series Cr...,Eternal Gandhi,"product_specification=>key=>Model Name, value=...",eternal gandhi super series crystal paper weig...,Eternal Gandhi Super Series Crystal Paper Weig...,http://img5a.flixcart.com/image/paper-weight/u...,eternal gandhi super series crystal paper weig...


In [28]:
# Word-level TF-IDF over unigrams and bigrams with English stop words removed.
# min_df=1 keeps every term, which is what the previous min_df=0 amounted to;
# recent scikit-learn releases validate parameters and reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')

In [29]:
# Learn the vocabulary and IDF weights from the combined text and vectorise every row.
tfidf_matrix = tf.fit_transform(df['meta_data'])

In [30]:
# Pairwise cosine similarity between all product vectors; called with a single
# argument, scikit-learn compares the matrix against itself.
# NOTE: the result is a dense n x n array, so memory grows quadratically with the row count.
cosine_sim = cosine_similarity(tfidf_matrix)

In [31]:
# Expect a square matrix with one row and one column per product.
cosine_sim.shape

(12676, 12676)

In [32]:
# Map each product name to the position of its first occurrence in df, so a name
# can be turned into a row of the similarity matrix. setdefault keeps the first
# position when a name is repeated.
d = {}
for position, name in enumerate(df['product_name']):
    d.setdefault(name, position)

In [33]:
# Look up the row position of an example product by its cleaned name.
x = 'alisha solid womens cycle short'
idx = d[x]
idx

0

In [34]:
# Pair every product position with its similarity to the query product.
sim_scores = list(enumerate(cosine_sim[idx]))

In [35]:
# Order the (position, score) pairs from most to least similar.
sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

In [36]:
# Keep the 30 most similar products other than the query itself. Filtering on the
# position (instead of dropping whatever sorted first) stays correct when another
# row ties with or outranks the query, and re-running this cell gives the same result.
sim_scores = [pair for pair in sim_scores if pair[0] != idx][:30]

In [37]:
# Keep only the row positions, dropping the scores.
product_indices = [position for position, _score in sim_scores]

In [38]:
# Show the recommended row positions, most similar first.
product_indices

[644,
 7162,
 899,
 1962,
 1786,
 1708,
 7158,
 1789,
 699,
 1796,
 905,
 8055,
 1775,
 904,
 3618,
 1832,
 9896,
 5526,
 1723,
 7159,
 427,
 474,
 1715,
 6653,
 405,
 557,
 1784,
 9683,
 502,
 7252]

In [21]:
# Select the recommended rows by position.
res = df.iloc[product_indices]

In [22]:
# Index labels of the recommended rows.
res.index

Int64Index([ 644, 7162,  899, 1962, 1786, 1708, 7158, 1789,  699, 1796,  905,
            8055, 1775,  904, 3618, 1832, 9896, 5526, 1723, 7159,  427,  474,
            1715, 6653,  405,  557, 1784, 9683,  502, 7252],
           dtype='int64')

In [23]:
def get_recommendation(x, top_n=30):
    """Return the index labels of the products most similar to product ``x``.

    Parameters
    ----------
    x : str
        Product name as stored in the ``product_name`` column; it is looked up
        in the name-to-position mapping ``d``.
    top_n : int, default 30
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Index
        Labels of ``df`` for the recommended rows, most similar first. The
        query product itself is never included.

    Raises
    ------
    KeyError
        If ``x`` is not a key of ``d``.
    """
    idx = d[x]
    scores = np.asarray(cosine_sim[idx])
    # A stable sort on the negated scores yields descending similarity while
    # keeping equal scores in ascending row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query by position rather than assuming it sorted first: a
    # duplicate row can tie with (or, through rounding, outrank) the query.
    ranked = ranked[ranked != idx][:top_n]
    return df.index[ranked]

In [24]:
# Sanity check: the function should reproduce the step-by-step result for the same product.
get_recommendation("alisha solid womens cycle short")

Int64Index([ 644, 7162,  899, 1962, 1786, 1708, 7158, 1789,  699, 1796,  905,
            8055, 1775,  904, 3618, 1832, 9896, 5526, 1723, 7159,  427,  474,
            1715, 6653,  405,  557, 1784, 9683,  502, 7252],
           dtype='int64')

In [26]:
import pickle

In [27]:
# Persist the TF-IDF matrix; the context manager guarantees the file handle is
# flushed and closed even if pickling fails.
# NOTE(review): despite the file name, this stores the vectorised matrix, not a fitted model — confirm intent.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(tfidf_matrix, model_file)