# Current BPR + LR (linear combination) for CG recommendations

In [None]:
import os
import pandas as pd
import numpy as np

# Office (OFM) transaction extract; it becomes `data` in the cells below.
officefile = '/Users/Yiteng/venv/rfm-project/data/SM-office.csv'
office = pd.read_csv(officefile, sep=',', engine='c')

For now we take the office (OFM) data as an example; the commented-out code segment below shows how the original data was segmented into specific categories.

In [None]:
# Product master tables; both are later searched by SKUID to describe a
# recommended item (product1 first, product2 as the fallback).
productfile1 = '/Users/Yiteng/venv/rfm-project/data/og_data/ProductMaster.csv'
productfile2 = '/Users/Yiteng/venv/rfm-project/data/og_data/ProductMaster_Tops.csv'
product1, product2 = (
    pd.read_csv(path, sep=',', engine='c')
    for path in (productfile1, productfile2)
)

Data Segmentation according to the meeting with Vijay.

In [None]:
# Reference only -- nothing in this cell is executed.
# NOTE(review): `orders` is not loaded at this point in the notebook, and the
# first two lines use `sub_segment_df` before the line that would define it;
# presumably this is how the per-category CSV files were produced -- confirm.
# office = sub_segment_df[sub_segment_df.BUID.isin([15,56])]
# non_office = sub_segment_df[~sub_segment_df.BUID.isin([15,56])]

# sub_segment_df = orders[orders.SKUCode.isin(product1.SKUID)]
# sub_segment_df.to_csv('ShoppingMall.csv', index=False)
# sub_segment_df2 = orders[orders.SKUCode.isin(product2.SKUID)]
# sub_segment_df2.to_csv('Tops_supermarket.csv', index=False)
# sub_segment_df3 = orders[~orders.SKUCode.isin(product1.SKUID) & ~orders.SKUCode.isin(product2.SKUID)]
# sub_segment_df3.to_csv('Problematic.csv', index=False)

# sub_segment_df.TicketNumber.drop_duplicates()

Taking "office" data for consideration in this notebook, the transaction data looks like:

In [None]:
# `data` is an alias of `office`, not a copy: in-place changes made through
# either name are visible through the other.
data = office
data.head(n=4)

### Bayesian Personalized Ranking (BPR) Algorithm

Retrieve info from the data -- where we focus on a match between user and item

In [None]:
from theano_bpr import BPR

# Dense integer codes for every customer and SKU, one entry per transaction row.
customer_categories = pd.Categorical(list(data.CustomerID))
sku_categories = pd.Categorical(list(data.SKUCode))
idx = customer_categories.codes
itm = sku_categories.codes

# (original ID, integer code) pairs.  These must be lists (Python 2 zip):
# they are turned into dicts more than once and train_set is sliced later.
idx_match = zip(data.CustomerID, idx)
itm_match = zip(data.SKUCode, itm)

# One (user code, item code) pair per transaction row.
train_set = zip(idx, itm)

n_users = idx.max() + 1
n_items = itm.max() + 1
# NOTE(review): the leading 20 is presumably the latent rank -- confirm
# against the theano_bpr documentation.
bpr = BPR(20, n_users, n_items)
bpr.train(train_set, epochs=20)

The result so far is a matrix holding, for every user, a "willingness" score for purchasing each item.

Hence the shape of the matrix is (#users, #items)

In [None]:
# Ask the trained model for the scores of every user code 0 .. idx.max().
all_user_codes = range(idx.max()+1)
res_bpr = bpr.predictions(all_user_codes)
print(res_bpr)
print(res_bpr.shape)

As a simple performance showcase, this indicates a rough AUC value.

For our actual recommendation the value is surely higher (above 0.9), because the model is then trained and scored on the same historical data and therefore overfits it.

In [None]:
import math

# Hold out the trailing 10% of the (unshuffled) pairs for evaluation.
split_at = int(math.floor(len(train_set) * 0.9))
test_set = train_set[split_at:]

# Fresh model trained on the leading 90% only.
bpr_show = BPR(20, idx.max()+1, itm.max()+1)
bpr_show.train(train_set[:split_at], epochs=20)
bpr_show.test(test_set)

### Logistic Regression (LR) Algorithm

Generating Training data according to CustomerID -- making #users as multi-class classification problem

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Labels: the integer customer code of each transaction row.
idx_match_dic = dict(idx_match)
y_train = np.array(data.CustomerID.map(idx_match_dic))

# Features: one row per transaction.
# X_train = data[['BUID', 'SKUCode', 'Spending', 'SubDeptCode', 'QTY']]
X_train = np.array(data[['SKUCode', 'Spending', 'QTY']])

# Standardise the features; fit() returns the scaler itself, so it can be chained.
sc = StandardScaler()
X_train_std = sc.fit(X_train).transform(X_train)

lr = LogisticRegression(C=10.0, random_state=0)
lr.fit(X_train_std, y_train)


The test data is built correspondingly, with one aggregated row per SKU.

In [None]:
# Per-SKU test features: total spending and number of transaction rows.
# NOTE(review): QTY is aggregated with len() (row count), not a sum of the
# quantities -- confirm this is intended.
lr_feature_columns = ['SKUCode', 'Spending', 'QTY']
X_test = data[lr_feature_columns]
X_test = X_test.groupby('SKUCode').agg({'Spending': lambda x: x.sum(), 'QTY': lambda x: len(x)})
# A dict-keyed agg does not guarantee the output column order (plain dicts are
# unordered on Python 2), so re-select the columns explicitly to keep the same
# layout the classifier was trained on.
X_test = np.array(X_test.reset_index()[lr_feature_columns])
# Standardise with a separate scaler: calling sc.fit() here would overwrite
# the statistics learned from the training data.  The aggregated features are
# still standardised on their own statistics, as before.
# NOTE(review): reusing the training scaler (sc.transform) would be the
# textbook choice, but the per-SKU sums are on a different scale -- confirm.
X_test_std = StandardScaler().fit_transform(X_test)

Prediction result, in the same format as for the BPR algorithm.

In [None]:
# Class probabilities for the per-SKU rows; transposed for display so that
# rows follow the classes (customer codes) and columns follow the SKU rows.
res_lrt = lr.predict_proba(X_test_std)
print(res_lrt.T)
print(res_lrt.T.shape)

In [None]:
# Same model evaluated on the standardised training rows (one per transaction).
res_lr = lr.predict_proba(X_train_std)
print(res_lr.T)

In [None]:
# SKUCode -> integer item code.
itm_match_dic = {sku: code for sku, code in itm_match}

### A linear combination of BPR and LR

Normalization is needed before a linear combination

In [None]:
from sklearn.preprocessing import normalize

# L1-normalise along axis 0 (column-wise) so both score matrices are on a
# comparable scale before they are blended.
normed_res_bpr = normalize(res_bpr, norm='l1', axis=0)
normed_res_lrt = normalize(res_lrt.T, norm='l1', axis=0)

A simple linear model at hand, later can even learn an optimized α.

In [None]:
# Blend weight: `alpha` goes to the BPR scores and (1 - alpha) to the LR
# scores.  0.5 reproduces the original equal-weight combination; this is the
# single value to tune (or learn) later.
alpha = 0.5
res = alpha * normed_res_bpr + (1 - alpha) * normed_res_lrt

The available memory cannot handle this matrix multiplication in dense form, so we switch to sparse matrices for the matrix manipulation.

In [None]:
from scipy.sparse import coo_matrix

# user x item purchase indicator built from the transaction pairs.
purchased = coo_matrix((np.ones(len(itm)),(idx,itm)),shape=(idx.max()+1,itm.max()+1)).tocsr()
# Converting COO to CSR sums duplicate (user, item) entries, so a repeat
# purchase would be stored as a count > 1.  Reset the stored values to 1 so
# that `purchased` is a true 0/1 mask; otherwise the products below are scaled
# by the purchase count and the "unpurchased" part goes negative for them.
purchased.data[:] = 1

# Dense equivalent (does not fit in memory):
# purchased = purchased.todense()
# willing_from_purchased = np.multiply(res , purchased)
# willing_from_unpurchased = res - willing_from_purchased

res_csr = coo_matrix(res).tocsr()
# Element-wise product keeps the scores of already purchased pairs only ...
willing_from_purchased = purchased.multiply(res_csr)
# ... and the remainder holds the scores of pairs not purchased yet.
willing_from_unpurchased = res_csr - willing_from_purchased

# def matrix_reflection(train_set, res):
#     res_mat = np.zeros([max(idx)+1,max(itm)+1])
#     for each in train_set:
#         res_mat[each[0]][each[1]] = res[each[0]][each[1]]
#     return res_mat

# willing_from_purchased = matrix_reflection(train_set, res)
# willing_from_unpurchased = res - willing_from_purchased
# from scipy.sparse import coo_matrix
# print coo_matrix(willing_from_purchased)

Simulating a memory dumping -- no need to use "Pickle" -- for coding only

In [None]:
# Persist the three score matrices as header-less CSV files in the current
# working directory.  Each sparse matrix is densified only as a temporary
# inside its own statement.
pd.DataFrame(willing_from_purchased.todense()).to_csv("purchased.csv",header=False)
pd.DataFrame(willing_from_unpurchased.todense()).to_csv("unpurchased.csv",header=False)
pd.DataFrame(res).to_csv("result.csv",header=False)
# To restore the state later, read these CSV files back in.

# Use Cases
## Side Data Preparation

In [None]:
# Customer profile table; its Gender column is used in the use cases below.
cstmfile = '/Users/Yiteng/venv/rfm-project/data/og_data/Customer_Profile.csv'
customer = pd.read_csv(cstmfile, engine='c', sep=',')

def strip_brace(input):
    """Remove an enclosing '{' / '}' from an ID string.

    A leading '{' and a trailing '}' are each dropped only when actually
    present, so IDs that are already clean are returned unchanged and
    applying the function twice is harmless.  Values without string methods
    (e.g. NaN or None for a missing ID) are passed through untouched instead
    of raising.

    input -- the raw ID value
    Returns the ID without its enclosing braces, or `input` itself when it is
    not a string.
    """
    try:
        start = 1 if input.startswith('{') else 0
        stop = -1 if input.endswith('}') else None
    except AttributeError:
        # Not a string (missing value): nothing to strip.
        return input
    return input[start:stop]

# Clean the customer IDs with strip_brace so they can be matched against the
# transaction data.
customer['CustomerID'] = customer['CustomerID'].map(strip_brace)

## use case 1 -- recommend a certain item to the users who are interested in it

Generating dictionaries for content:index matching

In [None]:
# integer code -> original ID, for customers and for items.
idx_match_rev_dic = {code: customer_id for code, customer_id in zip(idx, data.CustomerID)}
itm_match_rev_dic = {code: sku for code, sku in zip(itm, data.SKUCode)}

Randomly select one item.

In [None]:
import random

# Draw one item code uniformly from 0 .. itm.max() (both ends included) and
# look up its SKU.  No seed is set, so the pick differs between runs.
rec_itm = random.randint(0, itm.max())
SKUID_rec_itm = itm_match_rev_dic[rec_itm]
print('%s %s' % (rec_itm, SKUID_rec_itm))

Set a parameter τ here, indicating the percentage (e.g., 10%) of customers selected for the campaign

In [None]:
# Fraction of all customers to target, and the resulting head count (rounded down).
tau = 0.1
qualified_no = int(math.floor(res.shape[0]*tau))
print('Recommend item to %s customers out of %s in total.' % (qualified_no, res.shape[0]))

Retrieve the CustomerIDs to understand the details of this recommendation.

In [None]:
# product1[product1.SKUID.isin(data.SKUCode)]
import heapq  # no longer used in this cell; kept in case other cells rely on it
# Scores of every customer for the chosen item.
rec_col = res[:,rec_itm]
# Rank customers by descending score.  A stable sort on the negated scores
# breaks ties by customer index, so every customer is selected at most once.
# (Looking each top value up again with np.where(...)[0][0] returned the same
# first index for tied scores and rescanned the column once per value.)
rec_customer_indices = list(np.argsort(-rec_col, kind='mergesort')[:qualified_no])
nlargest_values = list(rec_col[rec_customer_indices])
# Translate the integer customer codes back to the original CustomerIDs.
rec_customer_ID = list(map(idx_match_rev_dic.get, rec_customer_indices))

Hence, sub-segment this portion of data out from overall customer data. And following analysis can be based on this data, also can do anything on it based on what we need.

In [None]:
# Profiles of the selected customers, followed by a gender breakdown.
sub_customer = customer[customer.CustomerID.isin(rec_customer_ID)]
print('Among the total %s recommendations, there are %s ladies and %s gentlemen, while %s ppl have not indicated their gender.' % (
    sub_customer.shape[0],
    (sub_customer.Gender == 'F').sum(),
    (sub_customer.Gender == 'M').sum(),
    sub_customer.Gender.isnull().sum(),
))


In [None]:
# Heading, one blank line, then the full table of selected customers.
print('The details of these customers are listed here:\n')
print(sub_customer)

## use case 2 -- recommend a certain group of users a certain item

As discussed with May-E, this is based on the original plan of making recommendations to 64 ($4^3$) different user groups derived from the RFM segmentation.

Below is the current segmentation according to the 4 quartiles.

In [None]:
def RClass(x,p,d):
    """Quartile score where the lowest values get 1 and the highest get 4.

    x -- value to classify
    p -- metric name ('recency', 'monetary_value' or 'frequency'), used as key into d
    d -- quartiles dict of the form {metric: {0.25: q1, 0.50: q2, 0.75: q3}}

    Returns 1 if x <= q1, 2 if x <= q2, 3 if x <= q3, otherwise 4.
    """
    cutoffs = d[p]
    for score, quantile in enumerate((0.25, 0.50, 0.75), 1):
        if x <= cutoffs[quantile]:
            return score
    return 4

def FMClass(x,p,d):
    """Quartile score where the lowest values get 4 and the highest get 1.

    x -- value to classify
    p -- metric name ('recency', 'monetary_value' or 'frequency'), used as key into d
    d -- quartiles dict of the form {metric: {0.25: q1, 0.50: q2, 0.75: q3}}

    Returns 4 if x <= q1, 3 if x <= q2, 2 if x <= q3, otherwise 1.
    """
    cutoffs = d[p]
    for score, quantile in zip((4, 3, 2), (0.25, 0.50, 0.75)):
        if x <= cutoffs[quantile]:
            return score
    return 1

Making the RFM table according to the data read:

In [None]:
import datetime

# Reference date against which recency is measured.
NOW = datetime.date(2017, 1, 1)

# `orders` aliases `data` (no copy), so the tDate2 column is added to `data` too.
orders = data
orders['tDate2'] = pd.to_datetime(orders['TransactionDate']).dt.date

# One aggregation per RFM dimension, computed per customer.
rfm_aggregations = {
    'tDate2': lambda dates: (NOW - dates.max()).days,   # Recency: days since the latest transaction
    'TicketNumber': lambda tickets: len(tickets),       # Frequency: number of transaction rows
    'Spending': lambda amounts: amounts.mean(),         # Monetary value: mean spending per row
}
rfmTable = orders.groupby('CustomerID').agg(rfm_aggregations)

In [None]:
# Recency as a plain integer number of days, then RFM-style column names.
rfmTable['tDate2'] = rfmTable['tDate2'].astype(int)
rfm_column_names = {
    'tDate2': 'recency',
    'TicketNumber': 'frequency',
    'Spending': 'monetary_value',
}
rfmTable = rfmTable.rename(columns=rfm_column_names)

In [None]:
# {metric: {0.25: q1, 0.50: q2, 0.75: q3}} -- the layout RClass/FMClass expect.
quantiles = rfmTable.quantile(q=[0.25, 0.50, 0.75]).to_dict()

# Alias, not a copy: the score columns below are added to rfmTable itself.
rfmSegmentation = rfmTable
rfmSegmentation['R_Quartile'] = rfmSegmentation['recency'].apply(RClass, args=('recency', quantiles))
rfmSegmentation['F_Quartile'] = rfmSegmentation['frequency'].apply(FMClass, args=('frequency', quantiles))
rfmSegmentation['M_Quartile'] = rfmSegmentation['monetary_value'].apply(FMClass, args=('monetary_value', quantiles))

# Three-character class label, e.g. '143' = R score 1, F score 4, M score 3.
rfmSegmentation['RFMClass'] = (
    rfmSegmentation.R_Quartile.map(str)
    + rfmSegmentation.F_Quartile.map(str)
    + rfmSegmentation.M_Quartile.map(str)
)


The segmentations are ready, hence recommend item based on the result matrix to each segment.

One may wonder why there are so many "recommendation failed" messages -- it is because the data contains a vast number of missing values (i.e., NaN) for office items.

In [None]:
import collections
# CustomerID -> integer customer code (row index into `res`).
idx_match_dic = dict(idx_match)

for i in range(1,5):
    for j in range(1,5):
        for k in range(1,5):
            # For each RFM class, select its customers from the segmentation table.
            sub_segment = rfmSegmentation[rfmSegmentation.RFMClass == str(i*100+j*10+k)]
            sub_segment = sub_segment.reset_index()

            print('category-R('+str(i)+')-F('+str(j)+')-M('+str(k)+') has '+ str(sub_segment.shape[0]) + ' customers.')

            sub_segment_df2 = customer[customer.CustomerID.isin(sub_segment.CustomerID)]
            print('Among them, ' + str(len(sub_segment_df2[sub_segment_df2.Gender == 'F'])) + ' ladies and ' + str(len(sub_segment_df2[sub_segment_df2.Gender == 'M'])) + ' gentlemen, while ' + str(len(sub_segment_df2[sub_segment_df2.Gender.isnull()])) + ' ppl have not indicated their gender.')

            if not sub_segment.shape[0]:
                # An empty segment has no scores to average; picking a best
                # item from it would fail, so move on to the next class.
                continue

            # Average the blended scores over the segment's customers and take
            # the item with the highest mean score.
            rec_customer_indices = list(map(idx_match_dic.get, sub_segment.CustomerID))
            sub_res = res[rec_customer_indices]
            rec_from = sub_res.mean(axis=0)
            rec_item_SKUID = itm_match_rev_dic[rec_from.argmax()]

            failure_message = 'SKUID ' + str(rec_item_SKUID) + ' is a unknown product -- recommendation failed'

            # Describe the item from product1, falling back to product2.
            result = product1[product1.SKUID == rec_item_SKUID]
            try:
                if result.shape[0]:
                    print('Recommendation:'+ '    ' + result.iloc[0].DeptName + '    ' + result.iloc[0].SubDeptName)
                else:
                    # The fallback lookup must use the SKU chosen above (it
                    # previously referenced an undefined `recommended_id`).
                    result = product2[product2.SKUID == rec_item_SKUID]
                    if result.shape[0]:
                        print('Recommendation:'+ '    ' + result.iloc[0].PRODUCT_ENG_DESC)
                    else:
                        # SKU found in neither product table.
                        print(failure_message)
            except TypeError:
                # A missing (NaN) description cannot be concatenated to a string.
                print(failure_message)
