In [1]:
import numpy as np
from numpy import log, log1p, bincount
import pandas as pd

import implicit
from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import (
    AnnoyAlternatingLeastSquares,
    FaissAlternatingLeastSquares,
    NMSLibAlternatingLeastSquares,
)
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
    bm25_weight,
)

import time
import logging
import scipy
from scipy.sparse import coo_matrix, linalg, csr_matrix
import codecs

from tqdm import tqdm

# log = logging.getLogger("implicit")

In [58]:
all_files = ['orders_export_1.csv', 'orders_export_2.csv', 'orders_export_3.csv']
df_from_each_file = (pd.read_csv(f) for f in all_files)
all_orders_original = pd.concat(df_from_each_file, ignore_index=True)

  all_orders_original = pd.concat(df_from_each_file, ignore_index=True)
  all_orders_original = pd.concat(df_from_each_file, ignore_index=True)


In [59]:
# the default dataset
all_orders_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54772 entries, 0 to 54771
Data columns (total 73 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Name                         54772 non-null  object 
 1   Email                        54728 non-null  object 
 2   Financial Status             37480 non-null  object 
 3   Paid at                      30519 non-null  object 
 4   Fulfillment Status           37480 non-null  object 
 5   Fulfilled at                 35612 non-null  object 
 6   Accepts Marketing            37480 non-null  object 
 7   Currency                     37480 non-null  object 
 8   Subtotal                     37480 non-null  float64
 9   Shipping                     37480 non-null  float64
 10  Taxes                        37480 non-null  float64
 11  Total                        37480 non-null  float64
 12  Discount Code                6202 non-null   object 
 13  Discount Amount 

In [60]:
columns = ['Email', 'Lineitem sku', 'Lineitem quantity']
columns2 = ['Vendor', 'Lineitem quantity', 'Email']

In [61]:
all_orders_2 = all_orders_original[columns]
all_orders_2.rename(columns={'Lineitem sku':'Item', 'Lineitem quantity':'Quantity'}, inplace=True)


all_orders = all_orders_original[columns2]
all_orders.rename(columns={'Lineitem quantity':'Quantity'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [62]:
all_orders.head()

Unnamed: 0,Vendor,Quantity,Email
0,Route,1,natsumi830@hotmail.com
1,FILA,1,natsumi830@hotmail.com
2,FILA,1,natsumi830@hotmail.com
3,BTS,1,micahjumaq@gmail.com
4,BTS,1,micahjumaq@gmail.com


### Model based on vendors
[Quantity, (Vendor, Email)]

In [63]:
# all_orders['Quantity'] = all_orders["Quantity"]
# all_orders["Email"] = all_orders["Email"].astype("category")
# all_orders["Item"] = all_orders["Item"].astype("category")
# all_orders["Name"] = all_orders["Name"].astype("category")

all_orders['Vendor'] = all_orders['Vendor'].astype("category")
all_orders['Email'] = all_orders['Email'].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_orders['Vendor'] = all_orders['Vendor'].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_orders['Email'] = all_orders['Email'].astype("category")


In [64]:
# all_orders.check_format(full_check=True).
all_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54772 entries, 0 to 54771
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Vendor    54772 non-null  category
 1   Quantity  54772 non-null  int64   
 2   Email     54728 non-null  category
dtypes: category(2), int64(1)
memory usage: 1.3 MB


In [65]:
# dropping null values or we will have value error doing matrix.
all_orders.dropna(inplace=True)
all_orders.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 54728 entries, 0 to 54771
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Vendor    54728 non-null  category
 1   Quantity  54728 non-null  int64   
 2   Email     54728 non-null  category
dtypes: category(2), int64(1)
memory usage: 1.7 MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_orders.dropna(inplace=True)



### Test 1


In [66]:
order_matrix = csr_matrix\
    (
        (all_orders['Quantity'].astype(np.float32),
        (all_orders['Vendor'].cat.codes.copy(),
        all_orders['Email'].cat.codes.copy())), shape=(54728, 54728)
    )

order_matrix

<54728x54728 sparse matrix of type '<class 'numpy.float32'>'
	with 24392 stored elements in Compressed Sparse Row format>

In [67]:
start = time.time()
# Implicit ALS training model
model = implicit.als.AlternatingLeastSquares(
    factors=10,
    iterations=20,
    calculate_training_loss=True
)
model.fit(order_matrix)
logging.debug("trained model in %0.2fs", time.time() - start)

  0%|          | 0/20 [00:00<?, ?it/s]

In [68]:
# recommend items for a user
# dataset -> SVD -> CSR
order_matrix = bm25_weight(order_matrix, K1=0, B=0)
model.approximate_similar_items = False
vendors = np.array(all_orders["Vendor"])
emails = np.array(all_orders["Email"])
order_matrix

<54728x54728 sparse matrix of type '<class 'numpy.float64'>'
	with 24392 stored elements in COOrdinate format>

In [69]:
vendors

array(['Route', 'FILA', 'FILA', ..., 'Daebak Box', 'Daebak Box',
       'Daebak Box'], dtype=object)

In [70]:
emails

array(['natsumi830@hotmail.com', 'natsumi830@hotmail.com',
       'natsumi830@hotmail.com', ..., 'howard@feverguys.com',
       'howard@feverguys.com', 'howard@feverguys.com'], dtype=object)

In [71]:
user_counts = order_matrix.T.tocsr()
user_counts

<54728x54728 sparse matrix of type '<class 'numpy.float64'>'
	with 24392 stored elements in Compressed Sparse Row format>

In [72]:
with tqdm(total=len(emails)) as progress:
    with codecs.open("vendor_recommendation", "w", "utf8") as o:
        for userid, username in enumerate(emails):
            for v_id, score in model.recommend(userid, user_counts):
                o.write("%s\t%s\t%s\n" % (username, vendors[v_id], score))
            progress.update(1)

logging.debug("generated recommendations in %0.2fs", time.time() - start)

100%|██████████| 54728/54728 [00:16<00:00, 3268.11it/s]



### Test 2


In [73]:
all_orders_2['Item'] = all_orders_2['Item'].astype("category")
all_orders_2['Email'] = all_orders_2['Email'].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_orders_2['Item'] = all_orders_2['Item'].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_orders_2['Email'] = all_orders_2['Email'].astype("category")


In [74]:
# all_orders.check_format(full_check=True)
all_orders_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54772 entries, 0 to 54771
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Email     54728 non-null  category
 1   Item      52783 non-null  category
 2   Quantity  54772 non-null  int64   
dtypes: category(2), int64(1)
memory usage: 1.4 MB


In [75]:
# dropping null values or we will have value error doing matrix
all_orders_2.dropna(inplace=True)
all_orders_2.info()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_orders_2.dropna(inplace=True)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 52742 entries, 0 to 54771
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Email     52742 non-null  category
 1   Item      52742 non-null  category
 2   Quantity  52742 non-null  int64   
dtypes: category(2), int64(1)
memory usage: 1.8 MB


In [76]:
order_matrix2 = coo_matrix\
    (
        (all_orders_2['Quantity'].astype(np.float32),
        (all_orders_2['Item'].cat.codes.copy(),
        all_orders_2['Email'].cat.codes.copy())), shape=(52742, 52742)
    )

order_matrix2

<52742x52742 sparse matrix of type '<class 'numpy.float32'>'
	with 52742 stored elements in COOrdinate format>

In [77]:
start = time.time()
# Implicit ALS training model
model = implicit.als.AlternatingLeastSquares(
    factors=10,
    iterations=20,
    calculate_training_loss=True
)
model.fit(order_matrix2)
logging.debug("trained model in %0.2fs", time.time() - start)

  0%|          | 0/20 [00:00<?, ?it/s]

In [78]:
# recommend items for a user
# dataset -> SVD -> CSR
order_matrix2 = bm25_weight(order_matrix2, K1=0, B=0)
model.approximate_similar_items = False
items = np.array(all_orders_2["Item"])
emails = np.array(all_orders_2["Email"])
order_matrix2

<52742x52742 sparse matrix of type '<class 'numpy.float64'>'
	with 52742 stored elements in COOrdinate format>

In [79]:
items

array(['ROUTEINS24',
       'Fashion - 31082 - FILA - FILA x BTS GO BEYOND 오버핏 스몰 슈티 후디 - BLK / 100 - DA',
       'Fashion - 31095 - FILA - FILA x BTS GO BEYOND 뉴트로 바시티 맨투맨 - BLK / 095 - DA',
       ..., 'DBB_Seasonal', 'DBB_Seasonal', 'DBB_Seasonal'], dtype=object)

In [80]:
emails

array(['natsumi830@hotmail.com', 'natsumi830@hotmail.com',
       'natsumi830@hotmail.com', ..., 'howard@feverguys.com',
       'howard@feverguys.com', 'howard@feverguys.com'], dtype=object)

In [81]:
user_counts = order_matrix2.T.tocsr()
user_counts

<52742x52742 sparse matrix of type '<class 'numpy.float64'>'
	with 37201 stored elements in Compressed Sparse Row format>

In [82]:
with tqdm(total=len(emails)) as progress:
    with codecs.open("item_recommendation", "w", "utf8") as o:
        for userid, username in enumerate(emails):
            for i_id, score in model.recommend(userid, user_counts):
                o.write("%s\t%s\t%s\n" % (username, items[i_id], score))
            progress.update(1)

logging.debug("generated recommendations in %0.2fs", time.time() - start)

100%|██████████| 52742/52742 [00:16<00:00, 3249.16it/s]


In [83]:
related = model.similar_items(1)
related

[(1, 1.0),
 (422, 0.9917083),
 (159, 0.9915446),
 (915, 0.9912996),
 (2742, 0.9692392),
 (2741, 0.9692391),
 (3515, 0.95830137),
 (2366, 0.95583844),
 (880, 0.9558384),
 (160, 0.95416504)]

In [84]:
with tqdm(total=len(items)) as progress:
    with codecs.open("related_item", "w", "utf8") as o:
        for itemsid, itemname in enumerate(items):
            for i_id, score in model.similar_items(itemsid):
                o.write("%s\t%s\t%s\n" % (itemname, items[i_id], score))
            progress.update(1)

logging.debug("generated recommendations in %0.2fs", time.time() - start)


100%|██████████| 52742/52742 [00:13<00:00, 3783.46it/s]
