* Model based recsys
    * ALS / Matrix factorization 으로 추천 결과 및 성능 비교하기

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# 데이터셋 로드
data_path = './anime/'

anime = pd.read_csv(data_path+'anime.csv')
rating = pd.read_csv(data_path+'rating.csv')

In [3]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


#### preprocessing

In [5]:
print(rating.shape)
rating['rating'].describe() # rating - rating out of 10 this user has assigned (-1 if the user watched it but didn't assign a rating).

(7813737, 3)


count    7.813737e+06
mean     6.144030e+00
std      3.727800e+00
min     -1.000000e+00
25%      6.000000e+00
50%      7.000000e+00
75%      9.000000e+00
max      1.000000e+01
Name: rating, dtype: float64

In [6]:
rating.loc[rating['user_id']==1,'rating'].describe() # 어떤 user는 시청한 대부분의 item의 rating 을 매기지 않음

count    153.000000
mean      -0.712418
std        1.760955
min       -1.000000
25%       -1.000000
50%       -1.000000
75%       -1.000000
max       10.000000
Name: rating, dtype: float64

In [7]:
pd.merge(rating.loc[(rating['user_id']==1)&(rating['rating']>0)], anime, on ='anime_id', how='left')

Unnamed: 0,user_id,anime_id,rating_x,name,genre,type,episodes,rating_y,members
0,1,8074,10,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
1,1,11617,10,High School DxD,"Comedy, Demons, Ecchi, Harem, Romance, School",TV,12,7.7,398660
2,1,11757,10,Sword Art Online,"Action, Adventure, Fantasy, Game, Romance",TV,25,7.83,893100
3,1,15451,10,High School DxD New,"Action, Comedy, Demons, Ecchi, Harem, Romance,...",TV,12,7.87,266657


In [8]:
# 모든 item 의 rating 을 assign 하지 않은 경우? >> 3915명
# 이 경우에는 item feature 기반 contents based filtering 으로 추천
tmp =rating.groupby('user_id')['rating'].agg(['count','sum'])
tmp['tmp_val'] = tmp['count']+tmp['sum']  
tmp[tmp['tmp_val']==0]

Unnamed: 0_level_0,count,sum,tmp_val
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,52,-52,0
6,37,-37,0
13,174,-174,0
49,6,-6,0
54,834,-834,0
...,...,...,...
73418,28,-28,0
73447,1,-1,0
73451,74,-74,0
73467,83,-83,0


In [9]:
exclude_users = list(tmp[tmp['tmp_val']==0].index)
train_users_interaction = rating.loc[~rating['user_id'].isin(exclude_users)]
print(train_users_interaction.shape)

(7422056, 3)


In [10]:
# 위 interaction dataframe 내 -1 rating 은 item dataframe 의 rating 으로 대체

In [11]:
train_users_interaction.loc[train_users_interaction['rating']==-1,'rating'] = None

Unnamed: 0,anime_id,episodes,rating,members,Adventure,Cars,Comedy,Dementia,Demons,Drama,...,Supernatural,Thriller,Vampire,Yaoi,Movie,Music,ONA,OVA,Special,TV
0,32281,1,9.37,200630,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,5114,64,9.26,793665,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,28977,51,9.25,114262,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,9253,24,9.17,673572,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,9969,51,9.16,151266,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,9316,1,4.15,211,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12290,5543,1,4.28,183,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12291,5621,4,4.88,219,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12292,6133,1,4.98,175,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [20]:
anime_features = anime[['anime_id','episodes','rating','members']].join(anime['genre'].str.get_dummies(sep=','))
anime_features = pd.concat([anime_features, anime['type'].str.get_dummies()],axis=1)
genre_type_cols = [c for c in anime_features.columns if c not in ['anime_id','episodes','rating','members']]
print(genre_type_cols)
anime_features.head()

[' Adventure', ' Cars', ' Comedy', ' Dementia', ' Demons', ' Drama', ' Ecchi', ' Fantasy', ' Game', ' Harem', ' Hentai', ' Historical', ' Horror', ' Josei', ' Kids', ' Magic', ' Martial Arts', ' Mecha', ' Military', ' Music', ' Mystery', ' Parody', ' Police', ' Psychological', ' Romance', ' Samurai', ' School', ' Sci-Fi', ' Seinen', ' Shoujo', ' Shoujo Ai', ' Shounen', ' Shounen Ai', ' Slice of Life', ' Space', ' Sports', ' Super Power', ' Supernatural', ' Thriller', ' Vampire', ' Yaoi', ' Yuri', 'Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shounen', 'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural', 'Thriller', 'Vampire', 'Yaoi', 'Movie', 'Music', 'ONA', 'OVA', 'Special', 'TV']


Unnamed: 0,anime_id,episodes,rating,members,Adventure,Cars,Comedy,Dementia,Demons,Drama,...,Supernatural,Thriller,Vampire,Yaoi,Movie,Music,ONA,OVA,Special,TV
0,32281,1,9.37,200630,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,5114,64,9.26,793665,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,28977,51,9.25,114262,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,9253,24,9.17,673572,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,9969,51,9.16,151266,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
total_df = pd.merge(train_users_interaction, anime_features, on='anime_id', how='left')
total_df.head()

Unnamed: 0,user_id,anime_id,rating_x,episodes,rating_y,members,Adventure,Cars,Comedy,Dementia,...,Supernatural,Thriller,Vampire,Yaoi,Movie,Music,ONA,OVA,Special,TV
0,1,20,,220,7.81,683297.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,24,,26,8.06,178553.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,79,,24,7.31,158772.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,226,,13,7.85,623511.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,241,,11,6.69,84395.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [22]:
total_df['rating'] = np.where(total_df['rating_x'].isnull(), total_df['rating_y'],total_df['rating_x'])
del total_df['rating_x']
total_df['avg_rating'] = total_df['rating_y']
del total_df['rating_y']
total_df.head()

Unnamed: 0,user_id,anime_id,episodes,members,Adventure,Cars,Comedy,Dementia,Demons,Drama,...,Vampire,Yaoi,Movie,Music,ONA,OVA,Special,TV,rating,avg_rating
0,1,20,220,683297.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.81,7.81
1,1,24,26,178553.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,8.06,8.06
2,1,79,24,158772.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.31,7.31
3,1,226,13,623511.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.85,7.85
4,1,241,11,84395.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.69,6.69


In [24]:
# 2️⃣ 유저별 장르별 평균 rating 계산
user_genre_rating = total_df.melt(id_vars=['user_id', 'rating'], value_vars=genre_type_cols) \
    .query("value == 1") \
    .groupby(['user_id', 'variable'])['rating'].mean().unstack()


In [25]:
user_genre_rating.head()

variable,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Shounen,Slice of Life,Special,Sports,Super Power,Supernatural,TV,Thriller,Vampire,Yaoi
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7.872222,,7.498529,,7.887857,7.665652,7.323077,7.451333,8.206667,7.393971,...,,,7.260667,,,,7.566121,,,
2,,,,,,,,,,,...,,,,,,,8.506667,,,
3,7.44,,7.394,,6.5,7.48,7.25,7.66,7.0,,...,7.666667,,7.666667,,,,7.728986,,,
5,5.220741,6.0,4.766909,1.0,4.322,5.186441,2.505376,4.450435,4.1,2.25,...,6.965,,4.055556,,,,4.320154,,,
7,7.617647,,7.770492,10.0,7.181818,7.580645,6.917808,7.385542,7.692308,7.076923,...,7.2,,7.277778,,,,7.438596,,,


In [None]:
total_df.groupby('user_id').agg({})

In [23]:
# split train-test set (setting hold-outset)


In [None]:
# X_train 을 기준으로 user based feature , item based feature , interaction 으로 전체 feature set 구성
def get_feature_set(df):
    
    X = df.drop(columns=['rating'])
    y = df[['rating']]
    print('# of original features :::: {}'.format(X.shape[1]))
    # feature engineering

    scaler = MinMaxScaler()
    X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=.25, random_state=42)

    # user based feature engineering
    user_feature_genre_rating = pd.concat([X_train,y_train]).melt(id_vars=['user_id', 'rating'], value_vars=genre_type_cols) \
    .query("value == 1") \
    .groupby(['user_id', 'variable'])['rating'].mean().unstack().fillna(0)
    user_feature_avg_rating = X_train.groupby('user_id')['rating'].mean().rename('user_avg_rating')
    user_feature_cnt_rating = X_train.groupby('user_id')['rating'].count().rename('user_cnt_rating')
    user_feature_std_rating = X_train.groupby('user_id')['rating'].std().rename('user_std_rating')

    # item based feature engineering
    item_feature_avg_rating = X_train.groupby('anime_id')['rating'].mean().rename('item_avg_rating')
    item_feature_cnt_rating = X_train.groupby('anime_id')['rating'].count().rename('item_cnt_rating')
    item_feature_std_rating = X_train.groupby('anime_id')['rating'].std().rename('item_std_rating')



    X_train_interaction = X[['user_id','anime_id']]



    result_df = None
    
    return result_df

In [20]:
total_df = pd.merge(anime, rating, on='anime_id', suffixes= ['_anime', '_user'])
print(total_df.shape)

(7813727, 89)
