<a href="https://colab.research.google.com/github/NagaoTadashi/Ledge-recommend/blob/main/random.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [59]:
# Download the MovieLens 10M archive into ../data and extract it there.
# `wget -nc` skips the download when the zip already exists, and `unzip -n`
# never overwrites files that were already extracted, so the cell can be re-run.
# NOTE(review): --no-check-certificate disables TLS certificate verification
# for this download — confirm it is still required for files.grouplens.org.
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
!unzip -n ../data/ml-10m.zip -d ../data/

File ‘../data/ml-10m.zip’ already there; not retrieving.

Archive:  ../data/ml-10m.zip


映画情報

In [60]:
# movies.dat is '::'-delimited; the column names are supplied explicitly here.
# The multi-character separator requires the python parsing engine.
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    '../data/ml-10M100K/movies.dat',
    sep='::',
    names=m_cols,
    engine='python',
    encoding='latin-1',
)

In [61]:
# Keep genre as a list: split the '|'-delimited genre string of each movie.
# Values that are not strings (e.g. lists produced by an earlier run of this
# cell) are left untouched, so re-running the cell does not raise.
movies['genre'] = movies['genre'].apply(
    lambda genre: genre.split('|') if isinstance(genre, str) else genre
)

In [62]:
# Preview the first five rows of the movies table.
movies.head(n=5)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [63]:
# Report the number of rows in the movies table.
print("映画の作品数 : {}".format(len(movies)))

映画の作品数 : 10681


タグ情報

In [64]:

# tags.dat is '::'-delimited; the column names are supplied explicitly here.
# The multi-character separator requires the python parsing engine.
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv(
    '../data/ml-10M100K/tags.dat',
    sep='::',
    names=t_cols,
    engine='python',
)

In [65]:
# Preview the first five rows of the raw tag assignments.
user_tagged_movies.head(n=5)

Unnamed: 0,user_id,movie_id,tag,timestamp
0,15,4973,excellent!,1215184630
1,20,1747,politics,1188263867
2,20,1747,satire,1188263867
3,20,2424,chick flick 212,1188263835
4,20,2424,hanks,1188263835


In [66]:
# Report the number of distinct tags and the number of distinct tagged movies.
# dropna=False counts a missing value (if any) as one distinct value, which is
# the same count that len(Series.unique()) gives.
print("タグの種類 : {}".format(user_tagged_movies["tag"].nunique(dropna=False)))
print("タグが付いている映画の作品数 : {}".format(user_tagged_movies["movie_id"].nunique(dropna=False)))

タグの種類 : 16529
タグが付いている映画の作品数 : 7601


In [67]:
# Normalise tags to lower case so that tags differing only in letter case
# become the same string.
user_tagged_movies['tag'] = user_tagged_movies.tag.str.lower()

In [68]:
# Collect the tags of each movie into a single list; the result is indexed by
# movie_id and has one 'tag' column holding that list (duplicates are kept).
movie_tags = user_tagged_movies.groupby('movie_id')[['tag']].agg(list)

In [71]:
# Attach the per-movie tag lists to the movie table. A left join keeps movies
# that have no tags (their 'tag' value is NaN).
# Any 'tag' column left over from a previous run of this cell is dropped first;
# otherwise re-running the cell would produce 'tag_x'/'tag_y' columns and break
# later cells that expect a single 'tag' column.
movies = (
    movies
    .drop(columns=['tag'], errors='ignore')
    .merge(movie_tags, on='movie_id', how='left')
)

In [72]:
# Preview the first five rows of the movies table after the tag merge.
movies.head(n=5)

Unnamed: 0,movie_id,title,genre,tag
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[pixar, pixar, pixar, animation, pixar, animat..."
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[for children, game, animals, joe johnston, ro..."
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[funniest movies, comedinha de velhinhos engra..."
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",[girl movie]
4,5,Father of the Bride Part II (1995),[Comedy],"[steve martin, pregnancy, remake, steve martin..."


評価データ

In [73]:
# ratings.dat is '::'-delimited; the column names are supplied explicitly here.
# The multi-character separator requires the python parsing engine.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '../data/ml-10M100K/ratings.dat',
    sep='::',
    names=r_cols,
    engine='python',
)

In [74]:
# The full dataset is large, so experiment on a subset: keep only the ratings
# made by the 1000 users with the smallest user IDs.
valid_user_ids = sorted(ratings['user_id'].unique())[:1000]
ratings = ratings.loc[ratings['user_id'].isin(valid_user_ids)]

In [75]:
# Join every rating with the metadata (title, genre, tag) of the rated movie.
# pd.merge defaults to an inner join, so only ratings whose movie_id exists in
# the movies table are kept.
movielens = pd.merge(ratings, movies, on='movie_id')

In [76]:
# Preview the first five rows of the joined ratings/movies table.
movielens.head(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag
0,1,122,5.0,838985046,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
1,139,122,3.0,974302621,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
2,149,122,2.5,1112342322,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
3,182,122,3.0,943458784,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
4,215,122,4.5,1102493547,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."


学習用・評価用データの分割

In [78]:
# Hold out each user's five most recent ratings for evaluation and train on
# the rest.
# Rank every rating within its user by timestamp, newest first, starting at 1;
# method='first' breaks timestamp ties by row order so ranks never repeat
# within a user.
movielens['timestamp_rank'] = (
    movielens
    .groupby('user_id')['timestamp']
    .rank(method='first', ascending=False)
)
movielens_train = movielens.loc[movielens['timestamp_rank'].gt(5)]
movielens_test = movielens.loc[movielens['timestamp_rank'].le(5)]

In [79]:
# Map every user ID and movie ID seen in the training data to a 0-based
# position (IDs are numbered in ascending order) so they can address the rows
# and columns of a dense matrix.
unique_user_ids = sorted(movielens_train['user_id'].unique())
unique_movie_ids = sorted(movielens_train['movie_id'].unique())
user_id2index = {user_id: index for index, user_id in enumerate(unique_user_ids)}
movie_id2index = {movie_id: index for index, movie_id in enumerate(unique_movie_ids)}

ランダム推薦

In [80]:
# Random-recommendation baseline: a (users x movies) matrix in which every
# predicted rating is drawn uniformly at random between 0.5 and 5.0.
# The global NumPy RNG is seeded first so that the matrix (and every later
# np.random draw in the notebook) is reproducible on Restart & Run All.
np.random.seed(0)
pred = np.random.uniform(0.5, 5.0, (len(unique_user_ids), len(unique_movie_ids)))

In [81]:
# Show the dimensions of the prediction matrix: (number of users, number of movies).
np.shape(pred)

(1000, 6673)

In [92]:
# Prepare the containers for the predicted ratings of the (user, movie) pairs
# that appear in the test data: an empty list of scores and a copy of the test
# rows, so that the test frame itself is never modified.
pred_results = list()
movie_rating_predict = movielens_test.copy()

In [93]:
# Look up the predicted rating for every (user, movie) pair in the test data.
# The list is rebuilt from scratch here so that re-running this cell does not
# keep appending to scores from a previous run (which would make the list
# longer than the frame and break the column assignment below).
pred_results = []
for user_id, movie_id in zip(movielens_test["user_id"], movielens_test["movie_id"]):
    # A user or movie that never appears in the training data has no row/column
    # in the prediction matrix; fall back to a fresh random score for it.
    if user_id not in user_id2index or movie_id not in movie_id2index:
        pred_results.append(np.random.uniform(0.5, 5.0))
        continue
    # Translate the IDs to matrix positions and read the stored prediction.
    user_index = user_id2index[user_id]
    movie_index = movie_id2index[movie_id]
    pred_results.append(pred[user_index, movie_index])
movie_rating_predict["rating_pred"] = pred_results

In [94]:
# Preview the first five test rows together with their predicted rating.
movie_rating_predict.head(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank,rating_pred
0,1,122,5.0,838985046,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem...",1.0,3.56756
5,217,122,3.0,844429650,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem...",5.0,0.871919
33,892,122,4.0,850079961,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem...",5.0,2.554775
46,59,185,3.0,838984807,"Net, The (1995)","[Action, Crime, Thriller]","[computers, computers, internet, irwin winkler...",1.0,4.112181
47,62,185,5.0,834874598,"Net, The (1995)","[Action, Crime, Thriller]","[computers, computers, internet, irwin winkler...",5.0,2.524669
