In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('recdata.csv')

In [3]:
df.head()

Unnamed: 0,item_id,category,product
0,,accesories,umbrella
1,,clothing,raincoat
2,,decorative,candles
3,,accesories,comb
4,,clothing,sweaters


In [4]:
feature_df = df[['category','product']]
feature_df.head()

Unnamed: 0,category,product
0,accesories,umbrella
1,clothing,raincoat
2,decorative,candles
3,accesories,comb
4,clothing,sweaters


In [5]:
df.shape

(896, 3)

In [6]:
cat_dict = {'stationary' : 1, 'accesories' : 2, 'clothing' : 3, 'decorative' : 4, 'handicrafts' : 5, 'homecare' : 6, 'selfcare' : 7,
           'kitchen' : 8, 'food': 9, 'toys':10, 'Technology' : 11, 'Office Supplies' : 12, 'Furniture' : 13}

In [7]:
df["cat_ordinal"] = df.category.map(cat_dict)
df

Unnamed: 0,item_id,category,product,cat_ordinal
0,,accesories,umbrella,2
1,,clothing,raincoat,3
2,,decorative,candles,4
3,,accesories,comb,2
4,,clothing,sweaters,3
...,...,...,...,...
891,,Office Supplies,Paper,12
892,,Office Supplies,Appliances,12
893,,Office Supplies,Fasteners,12
894,,Office Supplies,Appliances,12


In [8]:
df["cat_ordinal"].isnull().sum()

0

In [9]:
print(df.to_string())

     item_id         category                   product  cat_ordinal
0        NaN       accesories                  umbrella            2
1        NaN         clothing                  raincoat            3
2        NaN       decorative                   candles            4
3        NaN       accesories                      comb            2
4        NaN         clothing                  sweaters            3
5        NaN      handicrafts              wooden horse            5
6        NaN         homecare                       mop            6
7        NaN      handicrafts                      bags            5
8        NaN         homecare                    phenyl            6
9        NaN         selfcare                   napkins            7
10       NaN         homecare               dhoop batti            6
11       NaN          kitchen                      tawa            8
12       NaN         homecare                night lamp            6
13       NaN         homecare     

In [10]:
df.shape

(896, 4)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 896 entries, 0 to 895
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   item_id      0 non-null      float64
 1   category     896 non-null    object 
 2   product      896 non-null    object 
 3   cat_ordinal  896 non-null    int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 28.1+ KB


In [12]:
pip install sklearn




In [13]:
sklearn.__version__

NameError: name 'sklearn' is not defined

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
tfv = TfidfVectorizer(min_df = 3, max_features=None,
                     strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                     ngram_range=(1,3),
                     stop_words='english')

In [16]:
tvf_matrix = tfv.fit_transform(df['category'])

In [17]:
tvf_matrix

<896x15 sparse matrix of type '<class 'numpy.float64'>'
	with 1630 stored elements in Compressed Sparse Row format>

In [18]:
tvf_matrix.shape

(896, 15)

In [19]:
from sklearn.metrics.pairwise import sigmoid_kernel

In [20]:
sig = sigmoid_kernel(tvf_matrix, tvf_matrix)

In [21]:
sig[0]

array([0.78820208, 0.76159416, 0.76159416, 0.78820208, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.78820208, 0.78820208,
       0.76159416, 0.78820208, 0.76159416, 0.76159416, 0.78820208,
       0.76159416, 0.78820208, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.78820208, 0.76159416, 0.76159416,
       0.78820208, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.78820208, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159

In [22]:
indices = pd.Series(df.index, index=df['product']).drop_duplicates()

In [23]:
indices

product
umbrella         0
raincoat         1
candles          2
comb             3
sweaters         4
              ... 
Paper          891
Appliances     892
Fasteners      893
Appliances     894
Furnishings    895
Length: 896, dtype: int64

In [24]:
def give_rec(title, sig=sig):
    idx = indices[title]
    sig_scores = list(enumerate(sig[idx]))
    #print(sig_scores)
    sig_scores = sorted(sig_scores, key=lambda x:x[1], reverse=True)
    sig_scores = sig_scores[1:6]
    prod_indices = [i[0] for i in sig_scores]
    return df['product'].iloc[prod_indices]

In [25]:
give_rec('caps')

3          comb
38        bindi
39      sandals
41    jewellery
44        purse
Name: product, dtype: object

In [26]:
give_rec('cashew')

26    raisins
27    almonds
28    jaggery
29     pickle
30       ghee
Name: product, dtype: object

In [87]:
give_rec('umbrella')

3          comb
38        bindi
39      sandals
41    jewellery
44        purse
Name: product, dtype: object

In [88]:
give_rec('sofa set')

8           phenyl
10     dhoop batti
12      night lamp
13    earthern pot
35      bed covers
Name: product, dtype: object

In [92]:
give_rec('mop')

8           phenyl
10     dhoop batti
12      night lamp
13    earthern pot
35      bed covers
Name: product, dtype: object

In [94]:
give_rec('photo  frames')

8           phenyl
10     dhoop batti
12      night lamp
13    earthern pot
35      bed covers
Name: product, dtype: object