In [1]:
import pandas as pd
import numpy as np

# модуль sparse библиотеки scipy понадобится для работы с разреженными матрицами 
from scipy.sparse import csr_matrix

from sklearn.neighbors import NearestNeighbors

animes = pd.read_csv('animes.csv')
ratings = pd.read_csv('ratings.csv')

In [5]:
animes.head(5) 

Unnamed: 0,anime_id,title,genres,media,episodes,rating,members,start_date,season,source
0,1,Cowboy Bebop,"Action,Adventure,Comedy,Drama,Sci-Fi,Space",tv,26,8.77,1324054.0,1998/4/3,spring_1998,original
1,5,Cowboy Bebop: Tengoku no Tobira,"Action,Drama,Mystery,Sci-Fi,Space",movie,1,8.39,284827.0,2001/9/1,summer_2001,original
2,6,Trigun,"Action,Sci-Fi,Adventure,Comedy,Drama,Shounen",tv,26,8.23,578887.0,1998/4/1,spring_1998,manga
3,7,Witch Hunter Robin,"Action,Mystery,Police,Supernatural,Drama,Magic",tv,26,7.27,96487.0,2002/7/2,summer_2002,original
4,8,Bouken Ou Beet,"Adventure,Fantasy,Shounen,Supernatural",tv,52,6.97,13450.0,2004/9/30,fall_2004,manga


In [6]:
ratings.head(5)

Unnamed: 0,user_id,anime_id,rating
0,1,454,3
1,1,28761,8
2,1,6682,5
3,1,9624,6
4,1,38101,7


In [7]:
#удаление столбца
animes.drop(['genres','media','episodes','rating','members','start_date','season','source'], axis=1, inplace=True)

### Создание матрицы предпочтений

In [8]:
# по горизонтали будут фильмы, по вертикали - пользователи, значения - оценки
user_item_matrix = ratings.pivot(index = 'anime_id', columns = 'user_id', values = 'rating')
user_item_matrix.head()

user_id,1,2,3,4,5,6,7,8,10,11,...,108013,108014,108015,108017,108018,108019,108020,108021,108023,108024
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,9.0,,10.0,,9.0,,0.0,10.0,...,,,8.0,9.0,,,10.0,,10.0,
5,,8.0,,,10.0,,9.0,,,,...,,,,8.0,,,0.0,,10.0,
6,,,,,,,8.0,,,9.0,...,,,9.0,8.0,,,8.0,,10.0,
7,,8.0,,,,,,,,,...,,,7.0,,,,0.0,,,
8,,,,,,,,,,,...,,,,,,,,,,


In [9]:
#замена пропущенных значений 
user_item_matrix.fillna(0, inplace = True)
print(user_item_matrix.shape)
user_item_matrix.head()


(12043, 86954)


user_id,1,2,3,4,5,6,7,8,10,11,...,108013,108014,108015,108017,108018,108019,108020,108021,108023,108024
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,9.0,0.0,10.0,0.0,9.0,0.0,0.0,10.0,...,0.0,0.0,8.0,9.0,0.0,0.0,10.0,0.0,10.0,0.0
5,0.0,8.0,0.0,0.0,10.0,0.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,10.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,9.0,...,0.0,0.0,9.0,8.0,0.0,0.0,8.0,0.0,10.0,0.0
7,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Чистка матрицы**

In [10]:
# вначале сгруппируем (объединим) пользователей, возьмем только столбец rating 
# и посчитаем, сколько было оценок у каждого пользователя
users_votes = ratings.groupby('user_id')['rating'].agg('count')
 
# сделаем то же самое, только для фильма
animes_votes = ratings.groupby('anime_id')['rating'].agg('count')
 
# теперь создадим фильтр (mask)
user_mask = users_votes[users_votes > 50].index
anime_mask = animes_votes[animes_votes > 10].index
 
# применим фильтры и отберем фильмы с достаточным количеством оценок
user_item_matrix = user_item_matrix.loc[anime_mask,:]
 
# а также активных пользователей
user_item_matrix = user_item_matrix.loc[:,user_mask]

MemoryError: Unable to allocate 5.45 GiB for an array with shape (8413, 86954) and data type float64

### Преобразование разреженной матрицы
Преобразование данных матрицы в формат сжатого хранения строкой 

user_item_matrix.shape
user_item_matrix.head()

In [11]:
# атрибут values передаст функции csr_matrix только значения датафрейма
csr_data = csr_matrix(user_item_matrix.values)
user_item_matrix = user_item_matrix.rename_axis(None, axis = 1).reset_index()

MemoryError: Unable to allocate 7.80 GiB for an array with shape (86954, 12043) and data type float64

In [4]:
user_item_matrix = user_item_matrix.rename_axis(None, axis = 1).reset_index()
user_item_matrix.head()

NameError: name 'user_item_matrix' is not defined

### Создание модели машинного обучения 

На основе метода k-ближайших соседей

In [12]:
# создадим объект класса NearestNeighbors
# metric = ‘cosine’: косинусное сходство; algorithm = ‘brute’: полный перебор; 
# n_neighbors = 20: количество соседей; n_jobs = -1: вычисления на всех свободных ядрах процессора
knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_neighbors = 20, n_jobs = -1)
knn.fit(csr_data)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

### Получение рекомендаций

In [2]:
recommendations = 10
search_word = 'Attack'

In [3]:
animes_search = animes[animes['title'].str.contains(search_word)]
animes_search

Unnamed: 0,anime_id,title,genres,media,episodes,rating,members,start_date,season,source
1399,1550,Attack No.1,"Drama,Shoujo,Sports",tv,104,6.74,7108.0,1969/12/7,winter_1970,manga
2785,3081,Attacker You!,"Action,Romance,Shoujo,Sports",tv,58,6.57,5532.0,1984/4/13,spring_1984,manga
3555,4339,Ashita e Attack!,"Sports,School,Drama",tv,23,6.38,1574.0,1977/4/4,spring_1977,original
5471,9163,Attack No.1 (1970),"Sports,Drama,Shoujo",movie,1,6.14,891.0,1970/3/21,winter_1970,
5472,9164,Attack No.1: Namida no Fushichou,"Sports,Drama,Shoujo",movie,1,6.02,640.0,1971/3/17,winter_1971,
5473,9165,Attack No.1: Namida no Kaiten Receive,"Sports,Drama,Shoujo",movie,1,6.12,661.0,1970/8/1,summer_1970,
5474,9166,Attack No.1: Namida no Sekai Senshuken,"Sports,Drama,Shoujo",movie,1,6.13,649.0,1970/12/19,fall_1970,
8679,24259,Mechano: Scientific Attack Force,"Comedy,Dementia,Fantasy,Horror,Music,Parody",ova,3,,334.0,1995/9/1,summer_1995,
10435,32707,Jewelpet: Attack Chance!?,"Comedy,Magic,Fantasy,Shoujo",ona,3,5.73,773.0,2016/2/19,winter_2016,original
12542,36894,Inazma Delivery: Dougyousha Attack-hen,"Action,Sci-Fi,Comedy",tv,10,6.16,513.0,2017/12/8,winter_2018,original


In [12]:
anime_id = animes_search.iloc[0]['anime_id']
 
# далее по индексу фильма в датасете movies найдем соответствующий индекс
# в матрице предпочтений
anime_id = user_item_matrix[user_item_matrix['anime_id'] == anime_id].index[0]
anime_id

KeyError: 'anime_id'

In [16]:
print(csr_data[anime_id])

  (0, 0)	5.0
  (0, 1)	1.0
  (0, 4)	0.5
  (0, 6)	4.0
  (0, 7)	3.5
  (0, 8)	5.0
  (0, 9)	4.5
  (0, 10)	4.0
  (0, 12)	4.0
  (0, 15)	4.0
  (0, 17)	4.0
  (0, 20)	5.0
  (0, 21)	3.0
  (0, 24)	5.0
  (0, 26)	2.0
  (0, 27)	5.0
  (0, 29)	5.0
  (0, 31)	2.5
  (0, 33)	5.0
  (0, 34)	5.0
  (0, 36)	1.0
  (0, 37)	5.0
  (0, 38)	3.5
  (0, 39)	4.0
  (0, 40)	5.0
  :	:
  (0, 335)	5.0
  (0, 336)	5.0
  (0, 339)	4.5
  (0, 340)	4.0
  (0, 341)	5.0
  (0, 346)	4.5
  (0, 349)	5.0
  (0, 351)	5.0
  (0, 352)	5.0
  (0, 353)	5.0
  (0, 357)	5.0
  (0, 358)	4.0
  (0, 360)	4.0
  (0, 361)	5.0
  (0, 363)	2.5
  (0, 364)	5.0
  (0, 365)	4.0
  (0, 367)	5.0
  (0, 368)	3.0
  (0, 369)	5.0
  (0, 371)	5.0
  (0, 374)	5.0
  (0, 375)	5.0
  (0, 376)	5.0
  (0, 377)	5.0


In [17]:
#поиск индексов ближайших соседей 
#массив индексов фильмов (indices) и массив расстояний (distances) до них
distances, indices = knn.kneighbors(csr_data[anime_id], n_neighbors = recommendations + 1)

In [13]:
# уберем лишние измерения через squeeze() и преобразуем массивы в списки с помощью tolist()
indices_list = indices.squeeze().tolist()
distances_list = distances.squeeze().tolist()
 
indices_distances = list(zip(indices_list, distances_list))
 
# в набор кортежей (tuple)
print(type(indices_distances[0]))
 
# и посмотрим на первые три пары/кортежа
print(indices_distances[:3])

NameError: name 'indices' is not defined

In [19]:
indices_distances_sorted = sorted(indices_distances, key = lambda x: x[1], reverse = False)

# и убрать первый элемент с индексом 901 (потому что это и есть "Матрица")
indices_distances_sorted = indices_distances_sorted[1:]
indices_distances_sorted

[(1002, 0.22982440568634488),
 (442, 0.25401128310081567),
 (454, 0.27565616686043737),
 (124, 0.2776088577731709),
 (735, 0.2869100842838125),
 (954, 0.2911101181714415),
 (1362, 0.31393358217709477),
 (1157, 0.31405925934381695),
 (1536, 0.3154800434449465),
 (978, 0.31748544046311844)]

In [20]:
# сопоставление индексам названия фильмов 
recom_list = []

# теперь в цикле будем поочередно проходить по кортежам
for ind_dist in indices_distances_sorted:

    # искать movieId в матрице предпочтений
    matrix_movie_id = user_item_matrix.iloc[ind_dist[0]]['anime_id']

    # выяснять индекс этого фильма в датафрейме movies
    id = animes[animes['anime_id'] == matrix_movie_id].index

    # брать название фильма и расстояние до него
    title = animes.iloc[id]['title'].values[0]
    dist = ind_dist[1]

    # помещать каждую пару в питоновский словарь
    # который, в свою очередь, станет элементом списка recom_list
    recom_list.append({'Title' : title, 'Distance' : dist})

In [21]:
recom_list[:5]

[{'Title': 'Fight Club (1999)', 'Distance': 0.22982440568634488},
 {'Title': 'Star Wars: Episode V - The Empire Strikes Back (1980)',
  'Distance': 0.25401128310081567},
 {'Title': 'Star Wars: Episode VI - Return of the Jedi (1983)',
  'Distance': 0.27565616686043737},
 {'Title': 'Star Wars: Episode IV - A New Hope (1977)',
  'Distance': 0.2776088577731709},
 {'Title': 'Saving Private Ryan (1998)', 'Distance': 0.2869100842838125}]

In [14]:
#преобразование в датафрейм
recom_df = pd.DataFrame(recom_list, index = range(1, recommendations + 1))
recom_df

NameError: name 'recom_list' is not defined