In [1]:
import numpy as np
import datetime
import pandas as pd
from collections import Counter
import re
import boto3
import os
import json
import operator
import matplotlib.pyplot as plt
import tarfile
import shutil
import urllib.request
import json

In [2]:
# Input
# Local folder holding the recommendation dumps (read as JSON further down)
folderRec = 'recommendations'

# S3 keys of the hourly event files: <prefix><YYYYmmdd-HH0000><suffix>
fileEventsPrefix = 'datafeed/01-vrtproduction_'
fileEventsSuffix = '.tsv.gz'

# S3 keys of the hourly lookup archives: <prefix><YYYYmmdd-HH0000><suffix>
fileLookupPrefix = 'datafeed/vrtproduction_'
fileLookupSuffix = '-lookup_data.tar.gz'

# AWS shared credentials file (alternative used earlier: '/.aws/credentials-s3').
# expanduser('~') replaces os.getenv("HOME"): getenv returns None when HOME is
# not set, which made the string concatenation raise a TypeError.
fileCred = os.path.expanduser('~') + '/.aws/credentials'

bucketIn = 'vrt-adobe-analytics'
folderIn = 'datafeed'


lfuture = 2 # hours of data after recommendation moment to load
hPost = 3600 # seconds after recommendations for evaluation

# Load

In [3]:
# Read credentials
# Pull the key pair out of the AWS shared-credentials file. When a key occurs
# more than once (several profiles), the last occurrence wins, as before.
ACCESS_KEY = None
SECRET_KEY = None
with open(fileCred, 'r') as f:
    for line in f:
        # Split on the first '=' only, so a value that itself contains '='
        # is kept whole instead of being cut off.
        key, sep, value = line.partition('=')
        if not sep:
            continue
        key = key.strip()
        if key == 'aws_access_key_id':
            ACCESS_KEY = value.strip()
        elif key == 'aws_secret_access_key':
            SECRET_KEY = value.strip()

if not ACCESS_KEY or not SECRET_KEY:
    # Fail here with a clear message instead of a NameError further down
    raise ValueError('aws_access_key_id / aws_secret_access_key not found in ' + fileCred)

# The key pair is also exported through the environment, as before
os.environ['AWS_ACCESS_KEY_ID'] = ACCESS_KEY
os.environ['AWS_SECRET_ACCESS_KEY'] = SECRET_KEY

# One explicit session is enough: the former second call, boto3.resource('s3'),
# rebuilt the resource from the same key pair and overwrote this handle.
session = boto3.Session(aws_access_key_id=ACCESS_KEY,
                        aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
my_bucket = s3.Bucket(bucketIn)

In [4]:
# Load most recent recommendation file
# Names are compared as strings; the last one in sorted order is taken.
latestName = sorted(os.listdir(folderRec))[-1]
fileRec = '/'.join([folderRec, latestName])

with open(fileRec, 'r') as handle:
    recRecords = json.load(handle)

# Kept under a separate name until here so `recommendations` is only ever a DataFrame
recommendations = pd.DataFrame(recRecords)

In [210]:
# Number of recommendation records that were loaded
recommendations.shape[0]

1773

In [211]:
# Distinct user ids appearing in the recommendations
userIds = set(recommendations['user'])
users = list(userIds)
len(users)

588

In [6]:
# Event file names

# Hour (truncated) of the first and of the last recommendation.
# Microseconds are cleared too, so a fractional `utc` value cannot shorten the
# hour difference computed below.
# NOTE(review): fromtimestamp yields local time -- confirm the feed file names
# are stamped in the same zone.
dtmin = datetime.datetime.fromtimestamp(min(recommendations.utc))
dtmin = dtmin.replace(minute=0, second=0, microsecond=0)
dtmax = datetime.datetime.fromtimestamp(max(recommendations.utc))
dtmax = dtmax.replace(minute=0, second=0, microsecond=0)

# timedelta.seconds only holds the remainder below one day; total_seconds()
# also counts whole days, so spans of more than 24 hours are covered.
nHours = int((dtmax - dtmin).total_seconds()/3600) + lfuture

fileEvents = []
fileLookup = []

for i in range(nHours):
    dt = dtmin + datetime.timedelta(hours=i)
    stamp = dt.strftime('%Y%m%d-%H0000')
    fileEvents.append(fileEventsPrefix + stamp + fileEventsSuffix)
    fileLookup.append(fileLookupPrefix + stamp + fileLookupSuffix)



In [7]:
# Load events
# For every hour: download the lookup archive (for the column names) and the
# event file, keep the vrtnws page views of the recommended users, and collect
# the three columns needed for the evaluation.

# pandas 1.3 introduced on_bad_lines and deprecated error_bad_lines /
# warn_bad_lines (removed in 2.0); use whichever this installation understands.
# In both spellings malformed rows are skipped with a warning.
if tuple(int(p) for p in pd.__version__.split('.')[:2]) >= (1, 3):
    badLineArgs = {'on_bad_lines': 'warn'}
else:
    badLineArgs = {'error_bad_lines': False, 'warn_bad_lines': True}

eventColumns = ['post_evar27', 'mcvisid', 'hit_time_gmt']
eventFrames = []
nEvents = 0

for lookupKey, eventsKey in zip(fileLookup, fileEvents):
    try:
        # Load headers
        print(lookupKey)
        my_bucket.download_file(lookupKey, 'tmp.tar.gz')

        # NOTE(review): extractall trusts the member paths stored in the
        # archive; fine only as long as the bucket content is trusted.
        with tarfile.open("tmp.tar.gz") as tar:
            tar.extractall('lookup')

        with open('lookup/column_headers.tsv', 'r') as headerFile:
            headers = headerFile.readline().rstrip('\n').split('\t')

        # Load events (only the non-default read_csv options are spelled out)
        print(eventsKey)
        my_bucket.download_file(eventsKey, 'tmp.tsv.gz')
        events_ = pd.read_csv('tmp.tsv.gz', sep='\t', header=None,
                              compression='gzip', encoding='latin_1',
                              low_memory=False, **badLineArgs)
        events_.columns = headers
    finally:
        # Clean up, also when a download or the parsing failed
        shutil.rmtree('lookup', ignore_errors=True)
        for tmpFile in ('tmp.tar.gz', 'tmp.tsv.gz'):
            if os.path.exists(tmpFile):
                os.remove(tmpFile)

    # Filter
    isVrtNws = events_.post_pagename.str.contains('vrtnws', na=False, regex=True)
    events_ = events_[isVrtNws].reset_index(drop=True)
    events_ = events_[eventColumns]
    events_ = events_[events_.mcvisid.isin(users)]

    # Append (to a list; the frames are concatenated once after the loop
    # instead of re-copying the growing frame on every iteration)
    eventFrames.append(events_)
    nEvents += len(events_)

    print(nEvents)

events = pd.concat(eventFrames) if eventFrames else pd.DataFrame()

datafeed/vrtproduction_20190909-140000-lookup_data.tar.gz
datafeed/01-vrtproduction_20190909-140000.tsv.gz
188
datafeed/vrtproduction_20190909-150000-lookup_data.tar.gz
datafeed/01-vrtproduction_20190909-150000.tsv.gz
340
datafeed/vrtproduction_20190909-160000-lookup_data.tar.gz
datafeed/01-vrtproduction_20190909-160000.tsv.gz
499


In [8]:
# Preview only the first rows instead of rendering the whole frame
events.head(10)

Unnamed: 0,post_evar27,mcvisid,hit_time_gmt
1052,1509016329578,35349204239796775262989742556095831198,1568032966
1053,1568024283320,35349204239796775262989742556095831198,1568032991
1745,1487943309744,89628776676573804661254956676602575696,1568033348
2452,1567848653973,39246399150725490020205620592671886829,1568032562
2453,1487943309744,39246399150725490020205620592671886829,1568032683
2454,1567848653973,39246399150725490020205620592671886829,1568032702
2455,1554478886567,39246399150725490020205620592671886829,1568032789
2456,1567848653973,39246399150725490020205620592671886829,1568032921
2457,1487943309744,39246399150725490020205620592671886829,1568032934
4172,1568015688505,02593891549515594090144353953565711410,1568032420


# Evaluate

## Fail rate

In [9]:
# Fraction of recommendation rows whose result is missing
nMissing = recommendations['result'].isna().sum()
nMissing / len(recommendations)

0.15341229554427524

In [194]:
# Fail rate per API: a row counts as failed when its result is a float
# (presumably the NaN stored where no result payload came back -- verify).
# sorted() fixes the print order; plain set iteration order can differ from
# one kernel session to the next.
for api in sorted(set(recommendations.api)):
    print(api)
    counts = Counter(type(r) for r in recommendations[recommendations.api == api].result)
    print('Fail rate:', counts[float]/sum(counts.values()))
    print()


cpn-hybrid
Fail rate: 0.23011844331641285

cpn-als
Fail rate: 0.23011844331641285

cpn-recent-random
Fail rate: 0.0



## Accuracy

In [24]:
# Renumber the rows 0..n-1 and discard the previous index, so that index labels equal row positions
recommendations = recommendations.reset_index(drop=True)

In [202]:
# Upper bound (seconds) on the delay between a recommendation and the first
# following page view that is still scored against it.
# NOTE(review): 2*36000 is 20 hours; if 2 hours was meant this should read
# 2*3600 -- value left untouched, please confirm.
time_shift_max = 2*36000

hitRecords = []

# Walk the needed columns in parallel instead of selecting each row through
# `recommendations.index == r`, which rescanned the whole frame for every row
# and only worked while the index was exactly 0..n-1.
for api_, result_, user_, rec_time_ in zip(recommendations['api'],
                                           recommendations['result'],
                                           recommendations['user'],
                                           recommendations['utc']):
    # Rows without a dict payload (failed requests) are skipped
    if not isinstance(result_, dict):
        continue

    # First element of every recommended entry is compared with post_evar27
    rec_ = [item[0] for item in result_['items']]

    # Page views of this user strictly after the recommendation, inside the window
    events_ = events[(events.mcvisid == user_) &
                     (events.hit_time_gmt > rec_time_) &
                     (events.hit_time_gmt - rec_time_ < time_shift_max)]
    if len(events_) == 0:
        continue

    # Sort once; the earliest following page view is the one being scored
    ordered_ = events_.sort_values('hit_time_gmt')
    next_item_ = ordered_['post_evar27'].tolist()[0]
    time_shift_ = ordered_['hit_time_gmt'].tolist()[0] - rec_time_

    hitRecords.append({'api': api_,
                       'hit': next_item_ in rec_,
                       'time_shift': time_shift_,
                       'rec_time': rec_time_})

# Explicit columns: the frame keeps its schema even when no hit was recorded
hits = pd.DataFrame(hitRecords, columns=['api', 'hit', 'time_shift', 'rec_time'])


In [209]:
# Hit rate and context per API.
# groupby walks the APIs in sorted order, so the report no longer depends on
# set iteration order, and each per-API subset is built only once.
for api, hitsApi in hits.groupby('api'):
    print(api)
    print('HR:', np.mean(hitsApi['hit']))
    print('{} post events between {} and {} seconds after recommendation'\
          .format(len(hitsApi), min(hitsApi['time_shift']), max(hitsApi['time_shift'])))
    # fromtimestamp renders the epoch seconds in local time
    print('Recommendations retrieved between {} and {}'\
          .format(datetime.datetime.fromtimestamp(min(hitsApi.rec_time)),\
                  datetime.datetime.fromtimestamp(max(hitsApi.rec_time))))
    print()

cpn-hybrid
HR: 0.03296703296703297
91 post events between 4 and 8910 seconds after recommendation
Recommendations retrieved between 2019-09-09 14:18:31 and 2019-09-09 15:25:51

cpn-als
HR: 0.0
91 post events between 8 and 8915 seconds after recommendation
Recommendations retrieved between 2019-09-09 14:18:27 and 2019-09-09 15:25:47

cpn-recent-random
HR: 0.04854368932038835
103 post events between 4 and 8910 seconds after recommendation
Recommendations retrieved between 2019-09-09 14:18:31 and 2019-09-09 15:25:51



## Speed

In [220]:
# Time between the first and the last recommendation, divided by their number
utcSpan = max(recommendations.utc) - min(recommendations.utc)
avgWait = round(utcSpan/len(recommendations), 2)
print('Average waiting time: {} seconds'.format(avgWait))


Average waiting time: 2.31 seconds


# GRAVEYARD

In [190]:
# Calculate scores
# Per recommendation row: precision, recall and F1 of the recommended items
# against the distinct items the user viewed within hPost seconds after the
# recommendation. Rows that cannot be scored get NaN in all three columns.

patk = []
ratk = []
f1atk = []

# Columns are walked in parallel; the former per-row lookup through
# `recommendations.index == r` rescanned the frame for every row.
for result_, user_, utcRec in zip(recommendations['result'],
                                  recommendations['user'],
                                  recommendations['utc']):
    # NaN unless a score can actually be computed below
    patk_ = ratk_ = f1atk_ = np.nan

    # Failed requests carry no dict payload; an empty item list is treated the
    # same way (it used to end in a division by zero).
    rec_ = result_['items'] if isinstance(result_, dict) else []
    if len(rec_) > 0:
        pred_ = [x[0] for x in rec_]

        utcPost = utcRec + hPost
        true_ = set(events[(events.mcvisid == user_) &
                           (events.hit_time_gmt > utcRec) &
                           (events.hit_time_gmt <= utcPost)]['post_evar27'])

        if len(true_) > 0:
            overlap_ = len(true_ & set(pred_))

            patk_ = overlap_/len(rec_)
            ratk_ = overlap_/len(true_)

            # Both ratios share the numerator, so they are zero together; the
            # former "only one of them is zero" branches could never run.
            if overlap_ == 0:
                f1atk_ = 0
            else:
                f1atk_ = 2/(1/patk_ + 1/ratk_)

    patk.append(patk_)
    ratk.append(ratk_)
    f1atk.append(f1atk_)


recommendations['patk'] = patk
recommendations['ratk'] = ratk
recommendations['f1atk'] = f1atk

In [191]:

# Mean +/- standard deviation of every metric per API, NaN rows left out
for api_ in sorted(set(recommendations.api)):
    print(api_)
    apiRows = recommendations[recommendations.api == api_]
    for metric_ in ['patk', 'ratk', 'f1atk']:
        res = apiRows[metric_].dropna().tolist()
        avg_ = np.round(np.mean(res), 3)
        std_ = np.round(np.std(res), 3)
        print('{}: {} +/- {}'.format(metric_, avg_, std_))
    # `res` still holds the values of the last metric (f1atk) at this point
    print('{} users'.format(len(res)))
    print()

cpn-als
patk: 0.0 +/- 0.0
ratk: 0.0 +/- 0.0
f1atk: 0.0 +/- 0.0
45 users

cpn-hybrid
patk: 0.004 +/- 0.011
ratk: 0.08 +/- 0.23
f1atk: 0.008 +/- 0.021
45 users

cpn-recent-random
patk: 0.009 +/- 0.016
ratk: 0.12 +/- 0.245
f1atk: 0.016 +/- 0.029
54 users

