In [9]:
import numpy as np
import pandas as pd
import seaborn as sns

In [10]:
ANIME_DIR = "anime_data/"

### Предобработка данных

In [11]:
anime_ratings = pd.read_csv(ANIME_DIR + "animelist.csv", nrows=10000000)
anime_data = pd.read_csv(ANIME_DIR + "anime.csv")

Рассиморим, какая информация находится в файле $anime.csv$

In [12]:
anime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MAL_ID         17562 non-null  int64 
 1   Name           17562 non-null  object
 2   Score          17562 non-null  object
 3   Genres         17562 non-null  object
 4   English name   17562 non-null  object
 5   Japanese name  17562 non-null  object
 6   Type           17562 non-null  object
 7   Episodes       17562 non-null  object
 8   Aired          17562 non-null  object
 9   Premiered      17562 non-null  object
 10  Producers      17562 non-null  object
 11  Licensors      17562 non-null  object
 12  Studios        17562 non-null  object
 13  Source         17562 non-null  object
 14  Duration       17562 non-null  object
 15  Rating         17562 non-null  object
 16  Ranked         17562 non-null  object
 17  Popularity     17562 non-null  int64 
 18  Members        17562 non-n

Сразу переименуем колонку $MAL\_ID$ в $anime\_id$

In [13]:
anime_data.rename(columns={'MAL_ID':"anime_id"},inplace=True)

In [14]:
anime_data.columns

Index(['anime_id', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1'],
      dtype='object')

Избавимся от лишних полей

In [15]:
to_keep = ['anime_id', 'Name', 'Score', 'Genres', 'Members']
anime_data = anime_data[to_keep]
anime_data

Unnamed: 0,anime_id,Name,Score,Genres,Members
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",1251960
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",273145
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",558913
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",94683
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",13224
...,...,...,...,...,...
17557,48481,Daomu Biji Zhi Qinling Shen Shu,Unknown,"Adventure, Mystery, Supernatural",354
17558,48483,Mieruko-chan,Unknown,"Comedy, Horror, Supernatural",7010
17559,48488,Higurashi no Naku Koro ni Sotsu,Unknown,"Mystery, Dementia, Horror, Psychological, Supe...",11309
17560,48491,Yama no Susume: Next Summit,Unknown,"Adventure, Slice of Life, Comedy",1386


Сейчас у нас все жанры описаны в одном поле через запятую, что не очень удобно, поэтому мы определим все жанры и добавим их как поля для каждой записи

In [16]:
genres_column = anime_data["Genres"].map(lambda x: x.split(", "))
genres = list(set(sum(genres_column, [])))
anime_data[genres] = 0
for i in range(0, len(genres_column)):
    anime_data.loc[i, genres_column[i]] = 1

Теперь поле $Genres$ становится ненужным

In [17]:
anime_data = anime_data.drop(columns="Genres")

Заменим Unknown в поле Score 0

In [18]:
dict = {'Unknown' : 0}
anime_data['Score'] = anime_data['Score'].astype(str).apply(lambda x : dict[x] if x == 'Unknown' else x).astype(float)

In [19]:
anime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Data columns (total 48 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   anime_id       17562 non-null  int64  
 1   Name           17562 non-null  object 
 2   Score          17562 non-null  float64
 3   Members        17562 non-null  int64  
 4   Kids           17562 non-null  int64  
 5   Super Power    17562 non-null  int64  
 6   Action         17562 non-null  int64  
 7   Sci-Fi         17562 non-null  int64  
 8   Shounen Ai     17562 non-null  int64  
 9   Ecchi          17562 non-null  int64  
 10  Cars           17562 non-null  int64  
 11  Yuri           17562 non-null  int64  
 12  Mecha          17562 non-null  int64  
 13  Space          17562 non-null  int64  
 14  Parody         17562 non-null  int64  
 15  Josei          17562 non-null  int64  
 16  Vampire        17562 non-null  int64  
 17  Adventure      17562 non-null  int64  
 18  Myster

Рассмотрим теперь информацию файла $animelist.csv$

In [20]:
anime_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 5 columns):
 #   Column            Dtype
---  ------            -----
 0   user_id           int64
 1   anime_id          int64
 2   rating            int64
 3   watching_status   int64
 4   watched_episodes  int64
dtypes: int64(5)
memory usage: 381.5 MB


Избавимся от информации о кол-ве просмотренных эпизодов и статусе просмотра

In [22]:
anime_ratings = anime_ratings[['user_id', 'anime_id', 'rating']]
anime_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 228.9 MB


Проверим, есть ли оценки для всех аниме, представленных в датасете

In [23]:
anime_ratings.anime_id.nunique()

17554

Это действительно так

In [None]:
# anime_ratings = anime_ratings.sample(frac=0.3)
# anime_ratings.info()

In [None]:
# anime_ratings.anime_id.nunique()

Объеденим информацию из двух файлов

In [29]:
anime_complete = pd.merge(anime_data, anime_ratings, on='anime_id')
anime_complete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 50 columns):
 #   Column         Dtype  
---  ------         -----  
 0   anime_id       int64  
 1   Name           object 
 2   Score          float64
 3   Members        int64  
 4   Kids           int64  
 5   Super Power    int64  
 6   Action         int64  
 7   Sci-Fi         int64  
 8   Shounen Ai     int64  
 9   Ecchi          int64  
 10  Cars           int64  
 11  Yuri           int64  
 12  Mecha          int64  
 13  Space          int64  
 14  Parody         int64  
 15  Josei          int64  
 16  Vampire        int64  
 17  Adventure      int64  
 18  Mystery        int64  
 19  Fantasy        int64  
 20  School         int64  
 21  Harem          int64  
 22  Samurai        int64  
 23  Romance        int64  
 24  Game           int64  
 25  Magic          int64  
 26  Yaoi           int64  
 27  Shoujo         int64  
 28  Music          int64  
 29  Thriller      

Переименуеем Score d total_score, а rating в user_score

In [30]:
anime_complete = anime_complete.rename(columns={'Score' : 'total_score', 'rating': 'user_score'})

In [31]:
anime_complete.isna().sum()

anime_id         0
Name             0
total_score      0
Members          0
Kids             0
Super Power      0
Action           0
Sci-Fi           0
Shounen Ai       0
Ecchi            0
Cars             0
Yuri             0
Mecha            0
Space            0
Parody           0
Josei            0
Vampire          0
Adventure        0
Mystery          0
Fantasy          0
School           0
Harem            0
Samurai          0
Romance          0
Game             0
Magic            0
Yaoi             0
Shoujo           0
Music            0
Thriller         0
Shoujo Ai        0
Supernatural     0
Sports           0
Military         0
Martial Arts     0
Police           0
Unknown          0
Psychological    0
Comedy           0
Slice of Life    0
Seinen           0
Historical       0
Hentai           0
Dementia         0
Drama            0
Demons           0
Shounen          0
Horror           0
user_id          0
user_score       0
dtype: int64

In [42]:
anime_complete

Unnamed: 0,anime_id,Name,total_score,Members,Kids,Super Power,Action,Sci-Fi,Shounen Ai,Ecchi,...,Seinen,Historical,Hentai,Dementia,Drama,Demons,Shounen,Horror,user_id,user_score
0,1,Cowboy Bebop,8.78,1251960,0,0,1,1,0,0,...,0,0,0,0,1,0,0,0,21812,8
1,1,Cowboy Bebop,8.78,1251960,0,0,1,1,0,0,...,0,0,0,0,1,0,0,0,26660,0
2,1,Cowboy Bebop,8.78,1251960,0,0,1,1,0,0,...,0,0,0,0,1,0,0,0,11682,9
3,1,Cowboy Bebop,8.78,1251960,0,0,1,1,0,0,...,0,0,0,0,1,0,0,0,31629,10
4,1,Cowboy Bebop,8.78,1251960,0,0,1,1,0,0,...,0,0,0,0,1,0,0,0,27887,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995,48488,Higurashi no Naku Koro ni Sotsu,0.00,11309,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,14587,0
2999996,48488,Higurashi no Naku Koro ni Sotsu,0.00,11309,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,19052,0
2999997,48488,Higurashi no Naku Koro ni Sotsu,0.00,11309,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,28723,0
2999998,48488,Higurashi no Naku Koro ni Sotsu,0.00,11309,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,5551,0


сохраним полученый df в csv

In [43]:
anime_complete.to_csv(ANIME_DIR + 'complete.csv')

### Подготовка данных для рекомендаций

In [None]:
anime_feature = pd.read_csv(ANIME_DIR + 'complete.csv')

Посмотрим на 10 самых популярных аниме по кол-ву оценок и по кол-ву фанатов

In [34]:
top10_by_score = anime_complete['Name'].value_counts().nlargest(10)
top10_by_members = anime_complete.sort_values(by='Members', ascending=False).drop_duplicates(subset='Name').head(10)

In [36]:
top10_by_score

Name
Death Note                          6602
Shingeki no Kyojin                  6030
Sword Art Online                    5881
Fullmetal Alchemist: Brotherhood    5586
Toradora!                           5564
Code Geass: Hangyaku no Lelouch     5375
Angel Beats!                        5268
Steins;Gate                         5253
Naruto                              4985
Mirai Nikki                         4913
Name: count, dtype: int64

In [40]:
top10_by_members['Name']

519679                           Death Note
1686010                  Shingeki no Kyojin
925620     Fullmetal Alchemist: Brotherhood
1450242                    Sword Art Online
2210599                       One Punch Man
2348837               Boku no Hero Academia
1938286                         Tokyo Ghoul
16746                                Naruto
1223640                         Steins;Gate
1824573                     No Game No Life
Name: Name, dtype: object