# 2. Build a Recommendation System for Purchase Data

The scope of this notebook is 

- Code the Scoring Function
- Unit Test the Score
- Build the Dash Applimcation



## Settings

In [5]:
#Data
import sqlalchemy as sql

#Data Science
import pandas as pd

#Model Tracking
import mlflow
from mlflow.tracking import MlflowClient

from minio import Minio
from minio.error import ResponseError

from surprise import dump

#Utils
import os
import glob
import logging
from collections import defaultdict
import configparser
import json
import pickle

#Settings
from pprint import pprint
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# Enviroment variables
outmodels = '../models/'
artefact_name = '4fa76aa5e00c413db3e23810a913dc8e'

# Set dbconnection variables
dbconnPath = './dbconn.properties'
config = configparser.RawConfigParser()
config.read(dbconnPath)
params = config
db_host=params.get('CONN', 'host')
db_port=params.get('CONN', 'port')
db_user=params.get('CONN', 'user')
db_pwd=params.get('CONN', 'password')
db_name=params.get('CONN', 'database')

## Model Validation

### Download Model Artefact from the Registry

In [3]:
client = MlflowClient()
for regmodel in client.list_registered_models():
    regmodel_info = dict(regmodel)

# pprint(regmodel_info, indent=3)

champion=client.get_registered_model('Champion')
championid=champion.latest_versions[-1].run_id

art_list = [arts.path for arts in client.list_artifacts(championid, path=None)]

for art_path in art_list: 
    client.download_artifacts(championid, art_path, outmodels)

  from collections import Mapping, MutableMapping


### Quick Check Model Content

In [10]:
modelpkl = [modelpath for modelpath in glob.glob(outmodels + 'model/*.pkl')][0]
modelpkl

predictions, algo = dump.load(modelpkl)

print(predictions[0:10])
print('*'*100)
print(algo)

[Prediction(uid='10929', iid='261', r_ui=0.0, est=0.3747662865053879, details={'was_impossible': False}), Prediction(uid='2715', iid='256', r_ui=0.0, est=1.397448561679778, details={'was_impossible': False}), Prediction(uid='12786', iid='198', r_ui=0.0, est=0.34800521979059373, details={'was_impossible': False}), Prediction(uid='38', iid='119', r_ui=0.0, est=1.4081730858161146, details={'was_impossible': False}), Prediction(uid='8417', iid='2', r_ui=0.0, est=0.10200631019335304, details={'was_impossible': False}), Prediction(uid='5370', iid='293', r_ui=0.0, est=1.4426131167097673, details={'was_impossible': False}), Prediction(uid='225', iid='34', r_ui=0.0, est=0.6310815188817669, details={'was_impossible': False}), Prediction(uid='12930', iid='24', r_ui=2.0, est=0.5743177052276057, details={'was_impossible': False}), Prediction(uid='9481', iid='261', r_ui=0.0, est=0.45208051960508466, details={'was_impossible': False}), Prediction(uid='2410', iid='49', r_ui=0.0, est=0.7374420578118797

### Test One: Score the single value

In [27]:
test_sample = pd.read_csv(''.join([outmodels, 'sample2scoredo9qo832.csv']))
test_sample

test_sample.head()

Unnamed: 0,userID,itemID,rating
0,100,0,1
1,1007,0,1
2,10089,0,0
3,1011,0,0
4,10171,0,9


In [28]:
uid = str(1007)  # raw user id (as in the ratings file). They are **strings**!
iid = str(0)  # raw item id (as in the ratings file). They are **strings**!

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=1, verbose=True)

user: 1007       item: 0          r_ui = 1.00   est = 0.86   {'was_impossible': False}


### Test Two: Score the entire dataset

In [29]:
test_sample['predictions'] = test_sample.apply(lambda row:algo.predict(row['userID'], 
                     row['itemID'], row['rating']), axis = 1)
test_sample.head()
# predictions=list(data_prep_5['predictions'])

Unnamed: 0,userID,itemID,rating,predictions
0,100,0,1,"(100, 0, 1, 0.6463847965824251, {'was_impossib..."
1,1007,0,1,"(1007, 0, 1, 0.6463847965824251, {'was_impossi..."
2,10089,0,0,"(10089, 0, 0, 0.6463847965824251, {'was_imposs..."
3,1011,0,0,"(1011, 0, 0, 0.6463847965824251, {'was_impossi..."
4,10171,0,9,"(10171, 0, 9, 0.6463847965824251, {'was_imposs..."


## Model Pipeline

We have an mobile app allowing its customers to place orders before they even have to walk into the store.
When a customer first taps on the “order” page, we may recommend 

1. Top items to be added to their basket, e.g. disposable utensils, fresh meat, chips, and and so on.
2. Personalized recommendation with ranked list of items (product IDs) that the user is most likely to want to put in his/her (empty) “basket”

Assuming that the scenario is ModelOps 0. Then: 

1. Data scientists hand over a trained model as an artifact to the engineering team for deployement
2. The handoff can include putting the trained model in the models registry
3. The Scoring process is in Batch on a sigle EC2 instance

We have to reproduce the required development enviroment

0. Define Artefacter function to get the last version of Champion Model (optional)

1. Define Scoring Functions: Batch scoring is the main assumption

    - Define the get_top_items function 
    - Define the get_top_n_ui function
    

2. Unit Test 

3. Define a quick front end that simulate Mobile App (Test it in Docker)

### Define Scoring Functions

In [None]:
def get_top(predictions):
    
    '''
    Returns the the top-N recommendation from a set of predictions
    
    '''
    top_n = defaultdict(list)
    

### Top 10 Recommended Items


In [30]:
def get_top_n(predictions, n=10):
    
    '''Return the top-N recommendation for each user from a set of predictions.
    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.
    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    print(top_n)

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [31]:
def get_top_n_ui(top, ui):
    try:
        return {k:v for k,v in top.items() if ui==k}
    except ValueError: # user was not part of the trainset
        return 0

In [32]:
# Set connection string
connection_str = f'mysql+pymysql://{db_user}:{db_pwd}@{db_host}:{db_port}/{db_name}'

# connect to database
engine = sql.create_engine(connection_str)
connection = engine.connect()
test_sample = pd.read_sql("select * from CUSTOMERID", connection)
connection.close()

In [1]:
# for _, row in test_sample[['customerId']].iterrows():
#     print(row)
#     get_top_n_ui(get_top_n(predictions, n=10), row)