In [1]:
import pandas as pd
import numpy as np
import torch
import torch_geometric
import os
import torch_geometric.transforms as T
from torch_geometric.data import HeteroData

In [2]:
from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
# Python version
!python --version

Python 3.7.13


In [4]:
# Lets check version of the lib, and cuda
print("Torch version:" ,torch.__version__)
print("Cuda available:", torch.cuda.is_available())
print("Cuda version:",torch.version.cuda)
print("Torch geometric version: ",torch_geometric.__version__)

Torch version: 1.12.0+cu113
Cuda available: True
Cuda version: 11.3
Torch geometric version:  2.3.1


In [5]:
# Load csv
anime_dataset = pd.read_csv("data/raw/anime.csv")
user_dataset = pd.read_csv("data/raw/rating.csv")

In [6]:
anime_dataset.dropna(inplace=True)

In [7]:
anime_dataset.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [8]:
user_dataset.isnull().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [9]:
print(anime_dataset.head())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [10]:
print(user_dataset.head())

   user_id  anime_id  rating
0        1        20      -1
1        1        24      -1
2        1        79      -1
3        1       226      -1
4        1       241      -1


In [11]:
user_dataset

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [12]:
anime_dataset

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [13]:
# We gonna use the user_dataset that only have anime_id that same in the anime dataset
user_dataset = user_dataset[user_dataset['anime_id'].isin(anime_dataset['anime_id'].tolist())]

# Pre-processing

# Anime dataset pre-process

In [14]:
anime_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12017 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12017 non-null  int64  
 1   name      12017 non-null  object 
 2   genre     12017 non-null  object 
 3   type      12017 non-null  object 
 4   episodes  12017 non-null  object 
 5   rating    12017 non-null  float64
 6   members   12017 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 751.1+ KB


In [15]:
anime_dataset.describe()

Unnamed: 0,anime_id,rating,members
count,12017.0,12017.0,12017.0
mean,13638.001165,6.478264,18348.88
std,11231.076675,1.023857,55372.5
min,1.0,1.67,12.0
25%,3391.0,5.89,225.0
50%,9959.0,6.57,1552.0
75%,23729.0,7.18,9588.0
max,34519.0,10.0,1013917.0


In [16]:
# From the table above, We can see that the anime_id itself is not sequential
# This is how you make the anime_id sequential by using new indexing
sorted_anime = anime_dataset.sort_values('anime_id',ascending=True).set_index('anime_id')
sorted_anime = sorted_anime.reset_index()

# We store the id mapping here
anime_id_mapping = sorted_anime["anime_id"]
# After that we turn it to dict, so we can map it to the anime_dataset later on
anime_mapping = anime_id_mapping.reset_index().set_index("anime_id").to_dict()
# Dict turn the column to be the key

In [17]:
# Next fot genres column
# Because the genres itself contains multiple values (list)
# So, we turn it to be categorical values 
genres = sorted_anime["genre"].str.split(",")
genres

0        [Action,  Adventure,  Comedy,  Drama,  Sci-Fi,...
1              [Action,  Drama,  Mystery,  Sci-Fi,  Space]
2                               [Action,  Comedy,  Sci-Fi]
3        [Action,  Drama,  Magic,  Mystery,  Police,  S...
4           [Adventure,  Fantasy,  Shounen,  Supernatural]
                               ...                        
12012                             [Sci-Fi,  Slice of Life]
12013                                             [Comedy]
12014                                             [Comedy]
12015         [Action,  Adventure,  Fantasy,  Game,  Kids]
12016                                             [Comedy]
Name: genre, Length: 12017, dtype: object

In [18]:
exploded_genres = genres.explode()  # Turn list on row of data TO single row of data
exploded_genres

0            Action
0         Adventure
0            Comedy
0             Drama
0            Sci-Fi
            ...    
12015     Adventure
12015       Fantasy
12015          Game
12015          Kids
12016        Comedy
Name: genre, Length: 35595, dtype: object

In [19]:
genres_onehot = pd.get_dummies(exploded_genres)
genres_onehot

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12015,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12015,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
12015,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
12015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
genres_onehot = genres_onehot.groupby(genres_onehot.index).sum()
genres_onehot

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12015,1,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# get dummies for type
type_preprocess = pd.get_dummies(sorted_anime['type'])
type_preprocess

Unnamed: 0,Movie,Music,ONA,OVA,Special,TV
0,0,0,0,0,0,1
1,1,0,0,0,0,0
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,0,0,0,0,0,1
...,...,...,...,...,...,...
12012,0,0,0,0,1,0
12013,0,0,0,0,0,1
12014,0,0,0,0,0,1
12015,0,0,1,0,0,0


In [22]:
# Concat it, and we use the episodes,type and genres as features node
anime_df = pd.concat([genres_onehot, sorted_anime[
                'episodes'], type_preprocess], axis=1)
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12017 entries, 0 to 12016
Data columns (total 89 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0    Adventure      12017 non-null  uint8 
 1    Cars           12017 non-null  uint8 
 2    Comedy         12017 non-null  uint8 
 3    Dementia       12017 non-null  uint8 
 4    Demons         12017 non-null  uint8 
 5    Drama          12017 non-null  uint8 
 6    Ecchi          12017 non-null  uint8 
 7    Fantasy        12017 non-null  uint8 
 8    Game           12017 non-null  uint8 
 9    Harem          12017 non-null  uint8 
 10   Hentai         12017 non-null  uint8 
 11   Historical     12017 non-null  uint8 
 12   Horror         12017 non-null  uint8 
 13   Josei          12017 non-null  uint8 
 14   Kids           12017 non-null  uint8 
 15   Magic          12017 non-null  uint8 
 16   Martial Arts   12017 non-null  uint8 
 17   Mecha          12017 non-null  uint8 
 18   Milit

In [23]:
# We can see that, there is an object data type. Because the model not accepting object values.
anime_df.select_dtypes(object)

Unnamed: 0,episodes
0,26
1,1
2,26
3,26
4,52
...,...
12012,1
12013,30
12014,24
12015,18


In [24]:
# Change the value of unknown to -1
anime_df['episodes'] = anime_df['episodes'].replace('Unknown', -1)
anime_df['episodes'] = anime_df['episodes'].astype('float64')

In [25]:
anime_df.head()

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Thriller,Vampire,Yaoi,episodes,Movie,Music,ONA,OVA,Special,TV
0,1,0,1,0,0,1,0,0,0,0,...,0,0,0,26.0,0,0,0,0,0,1
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1.0,1,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,26.0,0,0,0,0,0,1
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,26.0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,52.0,0,0,0,0,0,1


In [26]:
anime_df.columns

Index([' Adventure', ' Cars', ' Comedy', ' Dementia', ' Demons', ' Drama',
       ' Ecchi', ' Fantasy', ' Game', ' Harem', ' Hentai', ' Historical',
       ' Horror', ' Josei', ' Kids', ' Magic', ' Martial Arts', ' Mecha',
       ' Military', ' Music', ' Mystery', ' Parody', ' Police',
       ' Psychological', ' Romance', ' Samurai', ' School', ' Sci-Fi',
       ' Seinen', ' Shoujo', ' Shoujo Ai', ' Shounen', ' Shounen Ai',
       ' Slice of Life', ' Space', ' Sports', ' Super Power', ' Supernatural',
       ' Thriller', ' Vampire', ' Yaoi', ' Yuri', 'Action', 'Adventure',
       'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy',
       'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei', 'Kids',
       'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery',
       'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School',
       'Sci-Fi', 'Seinen', 'Shoujo', 'Shounen', 'Slice of Life', 'Space',
       'Sports', 'Super Power', 'Supernatural'

# User dataset pre-process

In [27]:
user_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7813611 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 238.5 MB


In [28]:
user_dataset.describe()

Unnamed: 0,user_id,anime_id,rating
count,7813611.0,7813611.0,7813611.0
mean,36727.99,8908.729,6.144075
std,20997.92,8883.598,3.727792
min,1.0,1.0,-1.0
25%,18974.0,1240.0,6.0
50%,36791.0,6213.0,7.0
75%,54758.0,14093.0,9.0
max,73516.0,34519.0,10.0


In [29]:
# For user that gave rating -1, it means that they not rating that movies
dropped_user = user_dataset.drop(user_dataset[user_dataset['rating'] == -1].index,axis=0)
dropped_user.describe()

Unnamed: 0,user_id,anime_id,rating
count,6337146.0,6337146.0,6337146.0
mean,36747.95,8902.548,7.808543
std,21013.37,8881.674,1.57244
min,1.0,1.0,1.0
25%,18985.0,1239.0,7.0
50%,36815.0,6213.0,8.0
75%,54873.0,14075.0,9.0
max,73516.0,34475.0,10.0


In [30]:
# Do the same thing for  the user_id 
sorted_user = dropped_user.sort_values('user_id',ascending=True).reset_index(drop=True)


In [31]:
# How about the user features for each node??
# Taking mean of rating, and how many time they rate  movies
mean_rating = sorted_user.groupby('user_id')['rating'].mean().rename("mean")
num_rating = sorted_user.groupby("user_id")["rating"].count().rename("count")

In [32]:
merged_user = pd.merge(dropped_user, mean_rating, on='user_id', how='left')
merged_user = pd.merge(merged_user, num_rating, on='user_id', how='left')
merged_user

Unnamed: 0,user_id,anime_id,rating,mean,count
0,1,8074,10,10.000000,4
1,1,11617,10,10.000000,4
2,1,11757,10,10.000000,4
3,1,15451,10,10.000000,4
4,2,11771,10,10.000000,1
...,...,...,...,...,...
6337141,73515,16512,7,8.547486,179
6337142,73515,17187,9,8.547486,179
6337143,73515,22145,10,8.547486,179
6337144,73516,790,9,9.000000,2


In [33]:

user_features = pd.concat([mean_rating, num_rating], axis=1)
user_connection = merged_user[['user_id','anime_id']]

# Build the hetero graph

In [34]:
# Graph have edge_index,nodes,node feature

# Movie nodes
anime_node_features = torch.tensor(anime_df.to_numpy(), dtype=torch.float)
# User nodes 
user_node_features = torch.tensor(user_features.to_numpy(), dtype=torch.float)

In [35]:
# Edge index
# Mapping movie id

user_connection['anime_id'] = user_connection["anime_id"].map(anime_mapping['index']).astype(int)
user_connection['user_id'] = pd.factorize(user_connection['user_id'])[0] 
user_connection = user_connection.values.transpose()
edge_index = torch.tensor(user_connection)
edge_index = edge_index.long()
edge_index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


tensor([[    0,     0,     0,  ..., 69598, 69599, 69599],
        [ 5225,  6585,  6624,  ...,  8616,   718,  5225]])

# Create hetero data

In [36]:
data = HeteroData()
data["user"].node_id = torch.arange(len(user_features))
data["movie"].node_id = torch.arange(len(anime_df))
data['user'].x = (user_node_features)

data['movie'].x = (anime_node_features)
data['user', 'rating', 'movie'].edge_index = (edge_index)
# data['user', 'movie'].y = y

data = T.ToUndirected()(data)

In [37]:
edge_index.long().dtype

torch.int64

In [38]:
# torch.save(data, os.path.join('data/processed/', 'updated_data.pt'))