In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Dataset.csv', index_col=False)

In [3]:
df.head()

Unnamed: 0,item_id,category,product
0,1,accesories,umbrella
1,2,clothing,raincoat
2,3,decorative,candles
3,4,accesories,comb
4,5,clothing,sweaters


In [4]:
feature_df = df[['category','product']]
feature_df.head()

Unnamed: 0,category,product
0,accesories,umbrella
1,clothing,raincoat
2,decorative,candles
3,accesories,comb
4,clothing,sweaters


In [6]:
df.shape

(896, 3)

In [5]:
cat_dict = {'stationary' : 1, 'accesories' : 2, 'clothing' : 3, 'decorative' : 4, 'handicrafts' : 5, 'homecare' : 6, 'selfcare' : 7,
           'kitchen' : 8, 'food': 9, 'toys':10, 'Technology' : 11, 'Office Supplies' : 12, 'Furniture' : 13}

In [6]:
df["cat_ordinal"] = df.category.map(cat_dict)
df

Unnamed: 0,item_id,category,product,cat_ordinal
0,1,accesories,umbrella,2
1,2,clothing,raincoat,3
2,3,decorative,candles,4
3,4,accesories,comb,2
4,5,clothing,sweaters,3
...,...,...,...,...
891,892,Office Supplies,Paper,12
892,893,Office Supplies,Appliances,12
893,894,Office Supplies,Fasteners,12
894,895,Office Supplies,Appliances,12


In [7]:
df["cat_ordinal"].isnull().sum()

0

In [8]:
print(df.to_string())

     item_id         category                   product  cat_ordinal
0          1       accesories                  umbrella            2
1          2         clothing                  raincoat            3
2          3       decorative                   candles            4
3          4       accesories                      comb            2
4          5         clothing                  sweaters            3
5          6      handicrafts              wooden horse            5
6          7         homecare                       mop            6
7          8      handicrafts                      bags            5
8          9         homecare                    phenyl            6
9         10         selfcare                   napkins            7
10        11         homecare               dhoop batti            6
11        12          kitchen                      tawa            8
12        13         homecare                night lamp            6
13        14         homecare     

In [9]:
df.shape

(896, 4)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 896 entries, 0 to 895
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   item_id      896 non-null    int64 
 1   category     896 non-null    object
 2   product      896 non-null    object
 3   cat_ordinal  896 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 28.1+ KB


In [29]:
pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py): started
  Building wheel for sklearn (setup.py): finished with status 'done'
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1316 sha256=c8a72a129cccbb9fab6286ae08c162c181112e7d95e0a526dbdff00d821b4e17
  Stored in directory: c:\users\santoshkumar\appdata\local\pip\cache\wheels\22\0b\40\fd3f795caaa1fb4c6cb738bc1f56100be1e57da95849bfc897
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0
Note: you may need to restart the kernel to use updated packages.


In [28]:
sklearn.__version__

<IPython.core.display.Javascript object>

'0.24.1'

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfv = TfidfVectorizer(min_df = 3, max_features=None,
                     strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                     ngram_range=(1,3),
                     stop_words='english')

In [13]:
tvf_matrix = tfv.fit_transform(df['category'])

In [14]:
tvf_matrix

<896x15 sparse matrix of type '<class 'numpy.float64'>'
	with 1630 stored elements in Compressed Sparse Row format>

In [15]:
tvf_matrix.shape

(896, 15)

In [16]:
from sklearn.metrics.pairwise import sigmoid_kernel

In [17]:
sig = sigmoid_kernel(tvf_matrix, tvf_matrix)#computes the sigmoid kernel between two vectors

In [23]:
sig[0]

array([0.78820208, 0.76159416, 0.76159416, 0.78820208, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.78820208, 0.78820208,
       0.76159416, 0.78820208, 0.76159416, 0.76159416, 0.78820208,
       0.76159416, 0.78820208, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.78820208, 0.76159416, 0.76159416,
       0.78820208, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.78820208, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159

In [18]:
indices = pd.Series(df.index, index=df['product']).drop_duplicates()

In [19]:
indices

product
umbrella         0
raincoat         1
candles          2
comb             3
sweaters         4
              ... 
Paper          891
Appliances     892
Fasteners      893
Appliances     894
Furnishings    895
Length: 896, dtype: int64

In [20]:
def give_rec(title, sig=sig):
    idx = indices[title]
    sig_scores = list(enumerate(sig[idx]))
    #print(sig_scores)
    sig_scores = sorted(sig_scores, key=lambda x:x[1], reverse=True)
    sig_scores = sig_scores[1:6]
    prod_indices = [i[0] for i in sig_scores]
    return df['product'].iloc[prod_indices]

In [21]:
give_rec('caps')

3          comb
38        bindi
39      sandals
41    jewellery
44        purse
Name: product, dtype: object

In [22]:
give_rec('cashew')

26    raisins
27    almonds
28    jaggery
29     pickle
30       ghee
Name: product, dtype: object

In [87]:
give_rec('umbrella')

3          comb
38        bindi
39      sandals
41    jewellery
44        purse
Name: product, dtype: object

In [88]:
give_rec('sofa set')

8           phenyl
10     dhoop batti
12      night lamp
13    earthern pot
35      bed covers
Name: product, dtype: object

In [92]:
give_rec('mop')

8           phenyl
10     dhoop batti
12      night lamp
13    earthern pot
35      bed covers
Name: product, dtype: object

In [94]:
give_rec('photo  frames')

8           phenyl
10     dhoop batti
12      night lamp
13    earthern pot
35      bed covers
Name: product, dtype: object