In [1]:
import pickle
import pandas as pd
import numpy as np
import os, sys, gc 
from plotnine import *
import plotnine

from tqdm import tqdm_notebook
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib as mpl
from matplotlib import rc
import re
from matplotlib.ticker import PercentFormatter
import datetime
from math import log # IDF 계산을 위해

## 베이스라인 모델 생성
- Popular Based Recommendation
- Popular Based Recommendation with following arthor

In [2]:
path = "../input/t-academy-recommendation/"

In [3]:
# pd.read_json : json 형태의 파일을 dataframe 형태로 불러오는 코드 
magazine = pd.read_json(path + 'magazine.json', lines=True) # lines = True : Read the file as a json object per line.
metadata = pd.read_json(path + 'metadata.json', lines=True)
users = pd.read_json(path + 'users.json', lines=True)

In [4]:
%%time 
import itertools
from itertools import chain
import glob
import os 

def chainer(s):
    return list(itertools.chain.from_iterable(s))

read_rowwise = pd.read_csv(path + "read_rowwise.csv")

CPU times: user 28.1 s, sys: 3.53 s, total: 31.7 s
Wall time: 31.7 s


In [5]:
from datetime import datetime 

metadata['reg_datetime'] = metadata['reg_ts'].apply(lambda x : datetime.fromtimestamp(x/1000.0))
metadata.loc[metadata['reg_datetime'] == metadata['reg_datetime'].min(), 'reg_datetime'] = datetime(2090, 12, 31)
metadata['reg_dt'] = metadata['reg_datetime'].dt.date
metadata['type'] = metadata['magazine_id'].apply(lambda x : '개인' if x == 0.0 else '매거진')
metadata['reg_dt'] = pd.to_datetime(metadata['reg_dt'])

## Popular Based Recommendation
- 2019년도 이후로 작성된 글중에서 상위 100건의 글을 추천 
- 사용자가 읽은 글은 추천이 되지 않도록 후처리

In [6]:
# 2019년도 이후로 작성된 글중에서 상위 100건의 글을 추천 
# 단, 이미 읽은 글의 경우는 추천에서 제외 
read_rowwise = read_rowwise.merge(metadata[['id', 'reg_dt']], how='left', left_on='article_id', right_on='id')

In [7]:
read_rowwise.head()

Unnamed: 0,from,to,user_id,article_id,id,reg_dt
0,2018100100,2018100101,#e208be4ffea19b1ceb5cea2e3c4dc32c,@kty0613_91,@kty0613_91,2018-09-30
1,2018100100,2018100101,#e208be4ffea19b1ceb5cea2e3c4dc32c,,,NaT
2,2018100100,2018100101,#0a3d493f3b2318be80f391eaa00bfd1c,@miamiyoung_31,@miamiyoung_31,2018-09-02
3,2018100100,2018100101,#0a3d493f3b2318be80f391eaa00bfd1c,@banksalad_49,@banksalad_49,2016-11-30
4,2018100100,2018100101,#0a3d493f3b2318be80f391eaa00bfd1c,@rlfrjsdn_95,@rlfrjsdn_95,2018-06-04


In [8]:
# 사용자가 읽은 글의 목록들을 저장 
read_total = pd.DataFrame(read_rowwise.groupby(['user_id'])['article_id'].unique()).reset_index()
read_total.columns = ['user_id', 'article_list']

In [9]:
# 1. article_id가 결측치인 경우는 삭제 (작가가 사라진 경우)
# 2. reg_dt가 결측치인 경우는 삭제 (메타데이터에 자료가 없는 경우)
read_rowwise = read_rowwise[read_rowwise['article_id'] != '']
read_rowwise = read_rowwise[(read_rowwise['id'].notnull()) & (read_rowwise['reg_dt'].notnull())]
read_rowwise = read_rowwise[(read_rowwise['reg_dt'] >= '2019-01-01') & (read_rowwise['reg_dt'] < '2090-12-31')].reset_index(drop=True)

del read_rowwise['id']

In [10]:
valid = pd.read_csv(path + '/predict/dev.users', header=None)

In [11]:
%%time 

popular_rec_model = read_rowwise['article_id'].value_counts().index[0:1000]
with open('recommend.txt', 'w') as f:
    for user in tqdm_notebook(valid[0].values):
        # 추천 후보 
        seen = chainer(read_total[read_total['user_id'] == user]['article_list'].values)
        recs = []
        for r in popular_rec_model:
            if len(recs) == 100: 
                break 
            else: 
                if r not in seen: recs.append(r)
        f.write('%s %s\n' % (user, ' '.join(recs)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


CPU times: user 5min 40s, sys: 2.94 s, total: 5min 43s
Wall time: 5min 40s


![](https://github.com/choco9966/T-academy-Recommendation/blob/master/figure/Popular_Based_score.PNG?raw=true)

## Popular Based Recommendation with following arthor
- 2019년도 이후로 작성된 글중에서 구독작가의 글을 우선적으로 추천 
- 사용자가 읽은 글은 추천이 되지 않도록 후처리

![](https://github.com/choco9966/T-academy-Recommendation/blob/master/figure/Following_popular.PNG?raw=true)

In [12]:
following_cnt_by_user = users['following_list'].map(len)
following_rowwise = pd.DataFrame({'user_id': np.repeat(users['id'], following_cnt_by_user),
                             'author_id': chainer(users['following_list'])})

following_rowwise.reset_index(drop=True, inplace=True)

In [13]:
following_rowwise = following_rowwise[following_rowwise['user_id'].isin(valid[0].values)]
following_rowwise.head()

Unnamed: 0,user_id,author_id
310,#a6f7a5ff90a19ec4d583f0db1836844d,@potatohands
311,#a6f7a5ff90a19ec4d583f0db1836844d,@ggpodori
312,#a6f7a5ff90a19ec4d583f0db1836844d,@chae-pulib
313,#a6f7a5ff90a19ec4d583f0db1836844d,@roysday
314,#a6f7a5ff90a19ec4d583f0db1836844d,@lovebrander


In [14]:
%%time 
metadata_ = metadata[['user_id', 'id', 'reg_dt']]
metadata_.columns = ['author_id', 'article_id', 'reg_dt']
following_popular_model = pd.merge(following_rowwise, metadata_, how='left', on='author_id')

CPU times: user 3.43 s, sys: 436 ms, total: 3.86 s
Wall time: 3.9 s


위의 모델을 통해서 추천하려고 했더니 발생하는 문제점 
1. 구독하는 작가가 없는 경우에는 어떤 식으로 추천해야할 지? 
2. 구독하는 작가가 여러명이고 작가의 글도 여러개인데 어떤 글을 위주로 추천해야할 지? 

단순한 해결책 
1. Popular Based Model에서 상위 100건을 추천 
2. 작가중에서 선호하는 작가를 선정하고 해당 작가의 인기글을 추천 
    - 선호 : 해당 작가의 글을 가장 많이 읽음 
    - 위의 정의 말고도 "얼마나 많은 날에 찾아가서 읽었는 지", "작가의 글중에서 몇편의 글을 읽었는 지" 등으로 다르게 정의도 가능

In [15]:
%%time 
read_rowwise['author_id'] = read_rowwise['article_id'].apply(lambda x: x.split('_')[0])
author_favor = read_rowwise.groupby(['user_id', 'author_id'])['author_id'].agg({'count'}).reset_index()

CPU times: user 7.93 s, sys: 545 ms, total: 8.47 s
Wall time: 8.6 s


In [16]:
popular_model = pd.DataFrame(read_rowwise['article_id'].value_counts()).reset_index()
popular_model.columns = ['article_id', 'count']

In [17]:
following_popular_model = pd.merge(following_popular_model, author_favor, how='left', on=['user_id', 'author_id'])
following_popular_model = following_popular_model[following_popular_model['count'].notnull()].reset_index(drop=True)
following_popular_model = pd.merge(following_popular_model, popular_model, how='left', on='article_id')
following_popular_model.head()

Unnamed: 0,user_id,author_id,article_id,reg_dt,count_x,count_y
0,#a6f7a5ff90a19ec4d583f0db1836844d,@potatohands,@potatohands_8,2019-03-01,2.0,
1,#a6f7a5ff90a19ec4d583f0db1836844d,@potatohands,@potatohands_5,2019-02-04,2.0,410.0
2,#a6f7a5ff90a19ec4d583f0db1836844d,@potatohands,@potatohands_6,2019-02-21,2.0,105.0
3,#a6f7a5ff90a19ec4d583f0db1836844d,@ggpodori,@ggpodori_14,2019-01-28,2.0,2603.0
4,#a6f7a5ff90a19ec4d583f0db1836844d,@ggpodori,@ggpodori_12,2019-03-13,2.0,


- count_x : 작가에 대한 개별 사용자의 선호도 
- count_y : 글에 대한 전체 사용자의 선호도 

In [18]:
following_popular_model = following_popular_model.sort_values(by=['count_x', 'count_y', 'reg_dt'], ascending=[False, False, False])

In [19]:
following_popular_model[following_popular_model['user_id'] == '#a6f7a5ff90a19ec4d583f0db1836844d'].head()

Unnamed: 0,user_id,author_id,article_id,reg_dt,count_x,count_y
180,#a6f7a5ff90a19ec4d583f0db1836844d,@soo-log,@soo-log_169,2019-01-09,5.0,1882.0
181,#a6f7a5ff90a19ec4d583f0db1836844d,@soo-log,@soo-log_171,2019-01-16,5.0,1466.0
182,#a6f7a5ff90a19ec4d583f0db1836844d,@soo-log,@soo-log_174,2019-01-23,5.0,896.0
184,#a6f7a5ff90a19ec4d583f0db1836844d,@soo-log,@soo-log_178,2019-01-30,5.0,873.0
192,#a6f7a5ff90a19ec4d583f0db1836844d,@soo-log,@soo-log_184,2019-02-06,5.0,787.0


In [20]:
%%time 

with open('./recommend.txt', 'w') as f:
    for user in tqdm_notebook(valid[0].values):
        # 추천 후보 
        seen = chainer(read_total[read_total['user_id'] == user]['article_list'].values)
        following_rec_model = following_popular_model[following_popular_model['user_id'] == user]['article_id'].values
        recs = []
        for r in following_rec_model:
            if len(recs) == 100:
                break 
            else: 
                if r not in seen + recs: recs.append(r)
        
        if len(recs) < 100: 
            for r in popular_rec_model:
                if len(recs) == 100: 
                    break 
                else: 
                    if r not in seen + recs: recs.append(r)            
        f.write('%s %s\n' % (user, ' '.join(recs)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


CPU times: user 43min 21s, sys: 9.71 s, total: 43min 31s
Wall time: 43min 20s


![](https://github.com/choco9966/T-academy-Recommendation/blob/master/figure/Following_Popular_Based_score.PNG?raw=true)