# 필요 라이브러리 다운로드

In [1]:
!pip install fastparquet
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fastparquet
  Downloading fastparquet-0.8.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 2.1 MB/s 
[?25hCollecting cramjam>=2.3.0
  Downloading cramjam-2.5.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 51.3 MB/s 
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.5.0 fastparquet-0.8.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.1-cp37-cp37m-manylinux2014_x86_64.whl (18.6 MB)
[K     |████████████████████████████████| 18.6 MB 1.4 MB/s 
Installing collected packages: implicit
Successfully installed implicit-0.6.1


#필요 라이브러리 로드

In [2]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import random
import implicit
import datetime
from tqdm import tqdm

# 데이터 로드

In [3]:
# Load the October 2019 event log. The local workstation directory is tried
# first; if reading from it fails, fall back to the Google Drive mount.
try:
  path = 'C:/Users/User/Desktop/AIB_13/CP2/data/'
  df = pd.read_parquet(path + 'light_2019-Oct.parquet', engine='fastparquet')
except Exception:
  # `except Exception` (not a bare `except`) so that a kernel interrupt
  # (KeyboardInterrupt) or SystemExit is not mistaken for a missing file.
  path = '/content/drive/MyDrive/CP2/data/'
  df = pd.read_parquet(path + 'light_2019-Oct.parquet', engine='fastparquet')

# 데이터 class

In [16]:
import datetime
import numpy as np
import scipy.sparse as sparse

class Data():
  """
  Builds user-item interaction data from a raw event log.

  The wrapped DataFrame is read (never modified) and must provide the columns
  'event_time' (strings whose last four characters are stripped before
  parsing), 'event_type', 'user_id', 'product_id', 'category_code' and 'brand'.

  Two user groups are supported through the ``users_type`` argument:

  * 'no_conversion' : users whose every event is a 'view'
  * 'conversion'    : users with at least one 'cart' or 'purchase' event
  """

  # View count assigned to a (user, product) pair of a conversion user that has
  # events but no recorded 'view'.
  # NOTE(review): the value 2 is kept from the original code; its rationale is
  # not visible in this notebook.
  IMPLIED_VIEW_COUNT = 2

  _USERS_TYPES = ('no_conversion', 'conversion')
  _EVENT_COLUMNS = ['cart', 'purchase', 'view']

  def __init__(self, df):
    self.df = df

  def _check_users_type(self, users_type):
    """Raise ValueError unless ``users_type`` is one of the supported groups."""
    if users_type not in self._USERS_TYPES:
      raise ValueError(
          "users_type must be one of %r, got %r" % (self._USERS_TYPES, users_type))

  def _after_11_user_remove(self):
    """
    Return a copy of the log restricted to events before 2019-11-01.

    The trailing four characters of 'event_time' are dropped, the rest is parsed
    as a datetime and shifted by +4 hours before the cut-off is applied.
    """
    copy_df = self.df.copy()
    # pd.to_datetime instead of astype('datetime64'): the unit-less dtype string
    # is rejected by recent pandas releases.
    event_time = pd.to_datetime(copy_df['event_time'].str[:-4])
    copy_df['event_time'] = event_time + datetime.timedelta(hours=4)
    copy_df = copy_df.loc[copy_df['event_time'] < '2019-11-01']

    return copy_df

  def _get_users(self, users_type = 'no_conversion'):
    """
    Return the (user_id, product_id, event_type) rows of one user group.

    Raises
    ------
    ValueError : if ``users_type`` is not 'no_conversion' or 'conversion'
    """
    self._check_users_type(users_type)
    df = self._after_11_user_remove()
    data = df[['user_id','product_id','event_type']]

    if users_type == 'no_conversion':
      # Drop every user that has at least one non-view event.
      excluded_ids = data.loc[data['event_type'] != 'view', 'user_id']
      keep = ~data['user_id'].isin(excluded_ids)
    else:
      # Keep every user that has at least one cart or purchase event.
      converted_ids = data.loc[data['event_type'].isin(['cart', 'purchase']), 'user_id']
      keep = data['user_id'].isin(converted_ids)

    data = data.loc[keep].reset_index(drop=True)
    data['event_type'] = data['event_type'].astype('object')

    return data

  def _conversion_view_counts(self, data):
    """
    Per (user_id, product_id) view counts for conversion users.

    Pairs without any 'view' event receive ``IMPLIED_VIEW_COUNT``. The event
    columns are reindexed explicitly, so the result does not depend on all of
    'cart', 'purchase' and 'view' being present in ``data``.
    """
    counts = (data.groupby(['user_id','product_id','event_type'])
                  .size()
                  .unstack('event_type', fill_value=0)
                  .reindex(columns=self._EVENT_COLUMNS, fill_value=0))
    table = counts.rename_axis(columns=None).reset_index()
    table = table.rename(columns={'view' : 'view_count'})
    table['view_count'] = table['view_count'].astype('int')
    table.loc[table['view_count'] == 0, 'view_count'] = self.IMPLIED_VIEW_COUNT

    return table[['user_id','product_id','view_count']].copy()

  def _get_view_count_grouped(self, users_type = 'no_conversion'):
    """
    Return one row per (user_id, product_id) with its 'view_count'.
    """
    data = self._get_users(users_type)
    if users_type == "no_conversion":
      grouped = data.groupby(['user_id','product_id'])['event_type'].count().reset_index()
      return grouped.rename(columns = {'event_type' : 'view_count'})

    return self._conversion_view_counts(data)

  def _to_user_item_matrix(self, data, value_column):
    """
    Add integer codes for users/products and build the sparse user-item matrix.

    The codes come from pandas categorical encoding of 'user_id' and
    'product_id' and are stored in 'user_id_code' / 'product_id_code'; they are
    the row / column positions in the returned CSR matrix. The coded frame is
    also kept on ``self.data``.
    """
    data = data.copy()
    user_codes = data['user_id'].astype('category').cat.codes
    product_codes = data['product_id'].astype('category').cat.codes
    data['user_id_code'] = user_codes
    data['product_id_code'] = product_codes

    shape = (data['user_id'].nunique(), data['product_id'].nunique())
    user_item_matrix = sparse.csr_matrix(
        (data[value_column].to_numpy(), (user_codes.to_numpy(), product_codes.to_numpy())),
        shape=shape)

    self.data = data
    return user_item_matrix, data

  def get_view_count_df(self, users_type = 'no_conversion'):
    """
    Build the view-count user-item matrix for one user group.

    Returns
    -------
    (scipy.sparse.csr_matrix, DataFrame) : the matrix and the frame it was built
    from (including 'user_id_code' and 'product_id_code').
    """
    grouped = self._get_view_count_grouped(users_type)
    return self._to_user_item_matrix(grouped, 'view_count')

  def _get_view_ratio_grouped(self, users_type  = "no_conversion"):
    """
    Return one row per (user_id, product_id) with its 'view_ratio'.

    'view_ratio' is the pair's view count as a percentage of the user's total
    view count. For 'no_conversion' the columns 'view_count' and 'total_view'
    are kept as well; for 'conversion' only the ratio is returned.
    """
    data = self._get_view_count_grouped(users_type)
    data['total_view'] = data.groupby('user_id')['view_count'].transform('sum')
    data['view_ratio'] = (data['view_count'] / data['total_view']) * 100

    if users_type == 'conversion':
      data = data[['user_id','product_id','view_ratio']]
    return data

  def get_view_ratio_df(self, users_type = 'no_conversion'):
    """
    Build the view-ratio user-item matrix for one user group.

    Returns
    -------
    (scipy.sparse.csr_matrix, DataFrame) : the matrix and the frame it was built
    from (including 'user_id_code' and 'product_id_code').
    """
    grouped = self._get_view_ratio_grouped(users_type)
    return self._to_user_item_matrix(grouped, 'view_ratio')

  def get_item_lookup(self):
    """
    Return one row per product with its 'category_code' and 'brand'.

    The first occurrence of each product_id is kept; the result is sorted by
    product_id, while the index reflects the order of first appearance.
    """
    copy_df = self._after_11_user_remove()
    product_lookup = copy_df[['product_id','category_code','brand']].drop_duplicates('product_id').reset_index(drop=True).sort_values('product_id')
    return product_lookup


# 추천시스템 class

In [30]:
from implicit.als import AlternatingLeastSquares
import random
import numpy as np

class MyALS():
  """
  Convenience wrapper around an ``implicit`` ALS model: training,
  recommendation, explanation and offline scoring.

  Parameters
  ----------
  model : instance created with implicit's AlternatingLeastSquares()
  user_item_matrix : user-defined User-Item matrix (sparse matrix)
  item_lookup : table with item (product) information; read through its
                'product_id', 'category_code' and 'brand' columns
  data : source data the User-Item matrix was built from; read through its
         'user_id', 'product_id', 'user_id_code' and 'product_id_code' columns

  Notes
  -----
  ``metrics_model`` is a separate copy of ``model`` used only by get_score(),
  so scoring (which trains on a partially masked matrix) does not overwrite
  the factors of ``model``. If the model cannot be copied, the same object is
  used for both roles and fit() must be called again after get_score().
  """
  def __init__(self, model, user_item_matrix, item_lookup, data):
    import copy

    self.model = model
    try:
      self.metrics_model = copy.deepcopy(model)
    except Exception:
      # Best effort: some model objects may not support deep copies.
      self.metrics_model = model
    self.user_item_matrix = user_item_matrix
    self.item_lookup = item_lookup
    self.data = data

  def fit(self):
    """
    Train ``self.model`` on a copy of the full User-Item matrix.
    """
    user_item_matrix = self.user_item_matrix.copy()
    self.model.fit(user_item_matrix)

  def _lookup(self, key_column, key, value_column):
    """
    Return the first ``value_column`` value of the rows where ``key_column == key``.

    Raises
    ------
    IndexError : if no row of ``self.data`` matches
    """
    matches = self.data.loc[self.data[key_column] == key, value_column].unique()
    if len(matches) == 0:
      raise IndexError(f"{key_column}={key!r} is not present in the data")
    return matches[0]

  def _user_id_2_code(self, user_id):
    """
    Convert a user_id into the user_id_code used in the User-Item matrix.

    Parameters
    ----------
    user_id : user ID
    """
    return self._lookup('user_id', user_id, 'user_id_code')

  def _product_id_2_code(self, product_id):
    """
    Convert a product_id into the product_id_code used in the User-Item matrix.

    Parameters
    ----------
    product_id : product ID
    """
    return self._lookup('product_id', product_id, 'product_id_code')

  def _code_2_product_id(self, product_id_code):
    """
    Convert a product_id_code of the User-Item matrix back into a product_id.

    Parameters
    ----------
    product_id_code : product ID code
    """
    return self._lookup('product_id_code', product_id_code, 'product_id')

  def get_recom_product(self, user_id, n = 10):
    """
    Recommend n products for user_id and return them as a DataFrame.

    Parameters
    ----------
    user_id : user ID
    n : number of items to recommend
    """
    user_id_code = self._user_id_2_code(user_id)

    # recommend() returns product_id_code values, not product_id values.
    recommended = self.model.recommend(user_id_code, self.user_item_matrix[user_id_code], N=n)[0]

    # Collect the item_lookup rows in recommendation order.
    results = []
    for product_id_code in recommended:
      recommended_product_id = self._code_2_product_id(product_id_code)
      result = self.item_lookup.loc[self.item_lookup['product_id'] == recommended_product_id]
      results.append(result)

    return pd.concat(results)

  def get_user_topN_product(self,user_id,column, n = 20):
    """
    Return the N products of a user with the highest value in ``column``.

    Parameters
    ----------
    user_id : user ID
    column : column used for ranking
    n : number of items to return
    """
    # Rank the user's rows once and read both the ids and the values from it.
    top_rows = self.data.loc[self.data['user_id'] == user_id].sort_values(column, ascending=False)[:n]
    product_ids = top_rows['product_id'].values
    product_values = top_rows[column].values

    # Fetch the item_lookup row of each product, in ranking order.
    results = []
    for product_id in product_ids:
      result = self.item_lookup.loc[self.item_lookup['product_id'] == product_id]
      results.append(result)

    frame = pd.concat(results)
    frame[column] = product_values

    return frame

  def get_explain(self, user_id, item_id, column):
    """
    Explain why a product is recommended to a user.

    Parameters
    ----------
    user_id : user ID
    item_id : item (product) ID
    column : column of ``self.data`` to show next to each contribution

    Returns
    -------
    (DataFrame, total_score) : one row per contributing product with its score,
    and the total score returned by the model's explain().
    """
    user_id_code = self._user_id_2_code(user_id)
    product_id_code = self._product_id_2_code(item_id)

    # Only convert when the model offers to_cpu(); otherwise explain() is
    # called on the model as it is.
    model = self.model.to_cpu() if hasattr(self.model, 'to_cpu') else self.model
    total_score, top_contributions, user_weights = model.explain(user_id_code, self.user_item_matrix, product_id_code)

    results = []
    categorys = []
    brands = []
    scores = []
    # For each contributing item, look up the user's row plus category and brand.
    for id_, score_ in top_contributions:
      product_id = self._code_2_product_id(id_)
      result = self.data.loc[(self.data['product_id'] == product_id) & (self.data['user_id'] == user_id)][['user_id','product_id',column]]
      category = self.item_lookup.loc[self.item_lookup['product_id'] == product_id, 'category_code'].unique()[0]
      brand = self.item_lookup.loc[self.item_lookup['product_id'] == product_id, 'brand'].unique()[0]

      results.append(result)
      categorys.append(category)
      brands.append(brand)
      scores.append(score_)

    frame = pd.concat(results)
    frame['score'] = scores
    frame['category'] = categorys
    frame['brand'] = brands

    frame = frame[['user_id', 'product_id','category','brand',column, 'score']]
    return frame, total_score

  def _get_train_test(self,percentage=.2, seed=42):
    """
    Split the User-Item matrix into a train and a test matrix for scoring.

    ``percentage`` of the non-zero entries are set to 0 in the train matrix.
    The test matrix is the full matrix with every non-zero value replaced by 1.
    Results are stored on ``self.train_set``, ``self.test_set``,
    ``self.zero_user_idxs`` and ``self.zero_item_idxs``.

    Parameters
    ----------
    percentage : share of the interactions to hide
    seed : random seed
    """
    test_set = self.user_item_matrix.copy()
    train_set = self.user_item_matrix.copy()

    # Binarise the stored values: 1 marks a relevant (interacted) item.
    test_set.data[test_set.data != 0] = 1

    nonzero_idxs = train_set.nonzero()
    nonzero_pairs = list(zip(nonzero_idxs[0], nonzero_idxs[1]))

    # A local generator yields the same sample as random.seed(seed) followed by
    # random.sample(), without reseeding the module-level generator.
    n_samples = int(np.ceil(percentage * len(nonzero_pairs)))
    samples = random.Random(seed).sample(nonzero_pairs, n_samples)

    user_idxs = [index[0] for index in samples]
    item_idxs = [index[1] for index in samples]

    # Hide the sampled interactions so they look as if they never happened.
    train_set[user_idxs, item_idxs] = 0
    train_set.eliminate_zeros()

    self.zero_user_idxs = user_idxs
    self.zero_item_idxs = item_idxs
    self.train_set = train_set
    self.test_set = test_set

  def get_score(self, percentage =.2, seed=42, k = 10, method='hit_at_k', n_samples = 10000, verbose=True):
    """
    Train ``metrics_model`` on the train split and score its recommendations
    against the test split.

    Parameters
    ----------
    percentage : share of the interactions to hide, default = .2
    seed : random seed, default = 42
    k : number of items to recommend, default = 10
    method : evaluation metric - 'hit_at_k' or 'precision_at_k'
    n_samples : number of users to evaluate, default = 10000 (capped at the
                number of distinct users)
    verbose : show a tqdm progress bar while scoring

    Notes
    -----
    Users are drawn with numpy's ``choice`` from the 'user_id_code' column of
    ``self.data``, i.e. with replacement and once per row of that column.

    Raises
    ------
    ValueError : if ``method`` is not a supported metric
    """
    scorers = {
        'hit_at_k' : self._hit_at_k,
        'precision_at_k' : self._precision_at_k,
    }
    # Fail before the expensive fit rather than returning None afterwards.
    if method not in scorers:
      raise ValueError(f"method must be one of {sorted(scorers)}, got {method!r}")

    n_samples = min(n_samples, self.data['user_id_code'].nunique())

    self._get_train_test(percentage, seed)
    self.metrics_model.fit(self.train_set)

    random_state = np.random.RandomState(seed)
    user_id_code_samples = random_state.choice(self.data['user_id_code'], n_samples)

    return scorers[method](user_id_code_samples, k, verbose=verbose)

  def _recommended_relevance(self, user_id_code, k):
    """Test-set values (1 = relevant, 0 = not) of the k items recommended to one user."""
    recommended_ids = self.metrics_model.recommend(user_id_code, self.train_set[user_id_code], N=k)[0]
    return [self.test_set[user_id_code, item_id] for item_id in recommended_ids]

  def _hit_at_k(self, user_id_code_samples, k, verbose =True):
    """
    Share of sampled users for whom at least one of the k recommendations is
    relevant (present in the test set).

    Parameters
    ----------
    user_id_code_samples : sampled user ID codes
    k : number of items to recommend
    verbose : wrap the loop in a tqdm progress bar
    """
    samples = tqdm(user_id_code_samples) if verbose else user_id_code_samples
    scores = []
    for user_id_code in samples:
      relevance = self._recommended_relevance(user_id_code, k)
      scores.append(1 if 1 in relevance else 0)
    return np.mean(scores)

  def _precision_at_k(self, user_id_code_samples, k, verbose=True):
    """
    Mean, over the sampled users, of the share of the k recommendations that
    are relevant (present in the test set).

    Parameters
    ----------
    user_id_code_samples : sampled user ID codes
    k : number of items to recommend
    verbose : wrap the loop in a tqdm progress bar
    """
    samples = tqdm(user_id_code_samples) if verbose else user_id_code_samples
    scores = []
    for user_id_code in samples:
      relevance = self._recommended_relevance(user_id_code, k)
      scores.append(np.mean(relevance))
    return np.mean(scores)

# 튜닝 함수 작성

In [18]:
import numpy as np
from tqdm import tqdm
def tunning(user_item_matrix, product_lookup, data, params, n_iters = 10, metrics = 'hit_at_k', verbose=False, tuner = 'random', seed=None, top_n=5):
  """
  Search ALS hyper-parameters and return the best-scoring combinations.

  Every candidate (factors, alpha, iterations) is used to build an
  AlternatingLeastSquares model, wrapped in MyALS and scored with
  MyALS.get_score(method=metrics).

  Parameters
  ----------
  user_item_matrix : sparse User-Item matrix passed to MyALS
  product_lookup : item lookup table passed to MyALS
  data : source data of the matrix, passed to MyALS
  params : dict with the candidate values under 'factors', 'alpha' and
           'iterations'
  n_iters : number of random draws (only used when tuner == 'random')
  metrics : metric name forwarded to get_score(); also the score column name
  verbose : forwarded to get_score()
  tuner : 'random' (n_iters independent draws) or 'grid' (every combination)
  seed : optional seed for the random draws; None keeps using numpy's global
         random state
  top_n : number of best rows to return

  Returns
  -------
  DataFrame with the columns 'factor', 'alpha', 'iteration' and ``metrics``,
  sorted by score in descending order and limited to ``top_n`` rows.

  Raises
  ------
  ValueError : if ``tuner`` is neither 'random' nor 'grid'
  """
  import itertools

  if tuner == 'random':
    rng = np.random if seed is None else np.random.RandomState(seed)
    candidates = [
        (rng.choice(params['factors']),
         rng.choice(params['alpha']),
         rng.choice(params['iterations']))
        for _ in range(n_iters)
    ]
  elif tuner == 'grid':
    # Same order as nested loops: factors outermost, iterations innermost.
    candidates = list(itertools.product(params['factors'], params['alpha'], params['iterations']))
  else:
    raise ValueError("tuner must be 'random' or 'grid', got %r" % (tuner,))

  results = []
  for factor, alpha, iteration in tqdm(candidates):
    als = AlternatingLeastSquares(factors = factor, alpha=alpha, iterations = iteration)
    model = MyALS(als, user_item_matrix, product_lookup, data)

    score = model.get_score(method = metrics, verbose=verbose)
    results.append([factor, alpha, iteration, score])

  frame = pd.DataFrame(results, columns = ['factor','alpha','iteration',metrics])
  return frame.sort_values(metrics, ascending=False)[:top_n]

# 성능 개선

View Ratio Base만 수행

In [7]:
base = Data(df)

## No Conversion Users

In [8]:
# Product lookup table (product_id, category_code, brand), shared by both user groups.
product_lookup = base.get_item_lookup()
# View-ratio user-item matrix and its source frame for users without conversions.
no_cvs_view_ratio_matrix, no_cvs_view_ratio_data = base.get_view_ratio_df('no_conversion')
# The conversion-user matrix is built in the "Conversion Users" section below.
#cvs_view_ratio_matrix, cvs_view_ratio_data  = base.get_view_ratio_df('conversion')

In [None]:
base_model = MyALS(AlternatingLeastSquares(), no_cvs_view_ratio_matrix, product_lookup, no_cvs_view_ratio_data)
base_model.get_score()

  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [00:11<00:00, 908.85it/s]


0.3537

In [None]:
# Random search (tunning defaults: 10 draws, scored with hit_at_k) over a wide
# range of ALS hyper-parameters for the no-conversion users.
params = {
    'factors' : np.arange(100, 210, 10),
    'alpha' : np.arange(1, 41, 5),
    'iterations' : np.arange(20, 110, 10)
}

ncvs_user_random = tunning(no_cvs_view_ratio_matrix, product_lookup, no_cvs_view_ratio_data, params)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

 10%|█         | 1/10 [01:11<10:43, 71.55s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 20%|██        | 2/10 [02:12<08:43, 65.42s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

 30%|███       | 3/10 [02:53<06:18, 54.09s/it]

  0%|          | 0/50 [00:00<?, ?it/s]

 40%|████      | 4/10 [03:59<05:53, 58.99s/it]

  0%|          | 0/40 [00:00<?, ?it/s]

 50%|█████     | 5/10 [04:49<04:38, 55.67s/it]

  0%|          | 0/80 [00:00<?, ?it/s]

 60%|██████    | 6/10 [06:07<04:13, 63.28s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 70%|███████   | 7/10 [07:08<03:07, 62.41s/it]

  0%|          | 0/90 [00:00<?, ?it/s]

 80%|████████  | 8/10 [08:26<02:14, 67.30s/it]

  0%|          | 0/40 [00:00<?, ?it/s]

 90%|█████████ | 9/10 [09:15<01:01, 61.70s/it]

  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 10/10 [10:15<00:00, 61.54s/it]


In [None]:
ncvs_user_random

Unnamed: 0,factor,alpha,iteration,hit_at_k
2,180,16,20,0.3737
3,200,31,50,0.3722
9,180,31,50,0.3688
5,170,26,80,0.3649
1,140,36,60,0.3605


In [None]:
# Grid search over a narrower set of values (3 x 3 x 3 combinations) for the
# no-conversion users.
params = {
    'factors' : [180,190,200],
    'alpha' : [16, 18, 20],
    'iterations' : [20, 30, 40]
}

ncvs_user_grid = tunning(no_cvs_view_ratio_matrix, product_lookup, no_cvs_view_ratio_data, params, tuner='grid')

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

 33%|███▎      | 1/3 [07:09<14:18, 429.27s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

 67%|██████▋   | 2/3 [14:25<07:13, 433.33s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 3/3 [22:07<00:00, 442.53s/it]


In [None]:
ncvs_user_grid

Unnamed: 0,factor,alpha,iteration,hit_at_k
21,200,18,20,0.3762
24,200,20,20,0.3756
9,190,16,20,0.3749
18,200,16,20,0.3745
25,200,20,30,0.3738


In [48]:
# Model for the no-conversion users, using the top row of the grid-search table above.
als = AlternatingLeastSquares(factors = 200, alpha=18, iterations = 20)
ncvs_user_best_model = MyALS(als, no_cvs_view_ratio_matrix, product_lookup, no_cvs_view_ratio_data)

In [49]:
# precision@k of the tuned no-conversion model.
# (The assignment previously used the misspelled name `presicion_at_k`, so the
# displayed `precision_at_k` was not the value computed in this cell.)
precision_at_k = ncvs_user_best_model.get_score(method = 'precision_at_k')
precision_at_k

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [00:20<00:00, 499.97it/s]


0.09485

In [50]:
hit_at_k = ncvs_user_best_model.get_score(method = 'hit_at_k')
hit_at_k

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [00:19<00:00, 519.90it/s]


0.3719

In [34]:
ncvs_user_best_model.fit()

  0%|          | 0/20 [00:00<?, ?it/s]

In [35]:
user = 512475445
column = 'view_ratio'

ncvs_user_best_model.get_user_topN_product(user, column)

Unnamed: 0,product_id,category_code,brand,view_ratio
868,4700419,auto.accessories.videoregister,sho-me,3.066165
763,4700478,auto.accessories.videoregister,sho-me,3.02582
4726,4700557,auto.accessories.videoregister,sho-me,2.837547
4864,4700590,auto.accessories.videoregister,sho-me,2.770307
14595,5800802,electronics.audio.subwoofer,kenwood,2.649274
2019,5700788,auto.accessories.player,kenwood,2.595481
2814,5701062,auto.accessories.player,pioneer,2.474449
4495,5701086,auto.accessories.player,pioneer,2.474449
487,5700384,auto.accessories.player,pioneer,1.707907
23871,6100194,auto.accessories.radar,,1.654115


In [36]:
ncvs_user_best_model.get_recom_product(user)

Unnamed: 0,product_id,category_code,brand
1786,5701128,auto.accessories.player,
2267,5801218,electronics.audio.subwoofer,
753,5701166,auto.accessories.player,
5682,5700282,auto.accessories.player,alpine
4729,5701246,auto.accessories.player,
8364,5700850,auto.accessories.player,alpine
367,5700518,auto.accessories.player,
4405,5701087,auto.accessories.player,jvc
1618,4700589,auto.accessories.videoregister,xiaomi
2345,5701247,auto.accessories.player,


## Conversion Users

In [39]:
cvs_view_ratio_matrix, cvs_view_ratio_data = base.get_view_ratio_df('conversion')

In [None]:
base_model = MyALS(AlternatingLeastSquares(), cvs_view_ratio_matrix, product_lookup, cvs_view_ratio_data)
base_model.get_score()

  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [00:11<00:00, 877.66it/s]


0.5173

In [None]:
# Random search (tunning defaults: 10 draws, scored with hit_at_k) over a wide
# range of ALS hyper-parameters for the conversion users.
params = {
    'factors' : np.arange(100, 210, 10),
    'alpha' : np.arange(1, 41, 5),
    'iterations' : np.arange(20, 110, 10)
}

cvs_user_random = tunning(cvs_view_ratio_matrix, product_lookup, cvs_view_ratio_data, params)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

 10%|█         | 1/10 [00:26<03:57, 26.43s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

 20%|██        | 2/10 [00:51<03:25, 25.71s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

 30%|███       | 3/10 [01:16<02:56, 25.28s/it]

  0%|          | 0/30 [00:00<?, ?it/s]

 40%|████      | 4/10 [01:43<02:34, 25.83s/it]

  0%|          | 0/80 [00:00<?, ?it/s]

 50%|█████     | 5/10 [02:26<02:40, 32.00s/it]

  0%|          | 0/70 [00:00<?, ?it/s]

 60%|██████    | 6/10 [03:03<02:15, 33.84s/it]

  0%|          | 0/30 [00:00<?, ?it/s]

 70%|███████   | 7/10 [03:31<01:35, 31.86s/it]

  0%|          | 0/100 [00:00<?, ?it/s]

 80%|████████  | 8/10 [04:16<01:12, 36.11s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 90%|█████████ | 9/10 [04:55<00:36, 36.98s/it]

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 10/10 [05:44<00:00, 34.42s/it]


In [None]:
cvs_user_random

Unnamed: 0,factor,alpha,iteration,hit_at_k
8,200,1,60,0.536
1,180,6,20,0.5333
2,170,26,20,0.5312
4,180,16,80,0.5296
9,180,11,100,0.5296


In [None]:
# Grid search over a narrower set of values (3 x 3 x 3 combinations) for the
# conversion users.
params = {
    'factors' : [200, 210, 220],
    'alpha' : [1, 3, 5],
    'iterations' : [60, 70, 80]
}
cvs_user_grid = tunning(cvs_view_ratio_matrix, product_lookup, cvs_view_ratio_data, params,tuner='grid')

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

 33%|███▎      | 1/3 [06:23<12:46, 383.20s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

 67%|██████▋   | 2/3 [12:48<06:24, 384.32s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

100%|██████████| 3/3 [19:16<00:00, 385.54s/it]


In [None]:
cvs_user_grid

Unnamed: 0,factor,alpha,iteration,hit_at_k
22,220,3,70,0.5368
21,220,3,60,0.5367
25,220,5,70,0.5365
19,220,1,70,0.5364
18,220,1,60,0.5364


In [51]:
# Model for the conversion users, using the top row of the grid-search table above.
als = AlternatingLeastSquares(factors = 220, alpha=3, iterations = 70)
cvs_user_best_model = MyALS(als, cvs_view_ratio_matrix, product_lookup, cvs_view_ratio_data)

In [52]:
precision_at_k = cvs_user_best_model.get_score(method = 'precision_at_k')
precision_at_k

  0%|          | 0/70 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [00:16<00:00, 607.24it/s]


0.09479000000000001

In [53]:
hit_at_k =cvs_user_best_model.get_score(method = 'hit_at_k')
hit_at_k

  0%|          | 0/70 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [00:17<00:00, 561.77it/s]


0.5348

In [54]:
cvs_user_best_model.fit()

  0%|          | 0/70 [00:00<?, ?it/s]

In [55]:
user = 523974502
column = 'view_ratio'

cvs_user_best_model.get_user_topN_product(user, column)

Unnamed: 0,product_id,category_code,brand,view_ratio
1724,5100563,electronics.clocks,samsung,68.80531
635,5100718,electronics.clocks,samsung,5.309735
388,5100737,electronics.clocks,samsung,2.876106
113,1004249,electronics.smartphone,apple,2.654867
51555,5100862,electronics.clocks,,1.99115
88,1005115,electronics.smartphone,apple,1.548673
1878,5100738,electronics.clocks,samsung,1.327434
556,1004250,electronics.smartphone,apple,0.884956
1104,1004777,electronics.smartphone,xiaomi,0.884956
122,5100816,,xiaomi,0.663717


In [56]:
cvs_user_best_model.get_recom_product(user)

Unnamed: 0,product_id,category_code,brand
51950,5100861,electronics.clocks,
53090,5100863,electronics.clocks,
124455,5100874,electronics.clocks,
378,4803976,electronics.audio.headphone,samsung
124393,5100875,electronics.clocks,
65234,5100865,electronics.clocks,
1731,5100572,electronics.clocks,apple
137538,5100878,electronics.clocks,
124477,5100876,electronics.clocks,
144,5100239,electronics.clocks,samsung
