In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

%matplotlib inline

Загружаем датасет

In [2]:
links = pd.read_csv('/Users/irinanikulina/Documents/ML/RecommendationSystems/links.csv')
movies = pd.read_csv('/Users/irinanikulina/Documents/ML/RecommendationSystems/movies.csv')
ratings = pd.read_csv('/Users/irinanikulina/Documents/ML/RecommendationSystems/ratings.csv')
tags = pd.read_csv('/Users/irinanikulina/Documents/ML/RecommendationSystems/tags.csv')

In [3]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
# Создание нового датафрейма для работы
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [8]:
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [9]:
def change_string(s):
    return' '.join(s.replace(' ','').replace('-','').split('|'))

In [10]:
movie_genres = [change_string(g) for g in movies_with_ratings.genres.values]

Построим рекомендации (регрессия, предсказываем оценку) на TF-IDF по жанрам

In [11]:
movie_genres

['Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fa

In [12]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [13]:
count_vect.get_feature_names()

['action',
 'adventure',
 'animation',
 'children',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'filmnoir',
 'horror',
 'imax',
 'musical',
 'mystery',
 'nogenreslisted',
 'romance',
 'scifi',
 'thriller',
 'war',
 'western']

In [14]:
print(X_train_counts.shape)

(100836, 20)


In [15]:
X = X_train_counts.toarray()

In [16]:
y = movies_with_ratings.rating

In [17]:
X_tr, X_ts, y_tr, y_ts = train_test_split(X_train_counts.toarray(),y,test_size = 0.15)

In [31]:
neigh = KNeighborsRegressor(n_neighbors=15,n_jobs = -1, metric = 'euclidean')

In [32]:
neigh.fit(X_tr, y_tr)

KNeighborsRegressor(metric='euclidean', n_jobs=-1, n_neighbors=15)

In [33]:
neigh.predict(X_ts)

array([3.13333333, 3.23333333, 3.76666667, ..., 2.96666667, 3.63333333,
       3.23333333])

In [34]:
neigh.score(X_tr,y_tr)

0.055451375387044366

In [35]:
scores = cross_val_score(neigh, X, y, cv=5, verbose=True)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   21.9s finished


In [36]:
print(scores)

[-0.21066185 -0.15851144 -0.07452149 -0.14748643 -0.13581015]


In [24]:
tfidf_transformer = TfidfTransformer()

In [25]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [26]:
print(X_train_tfidf.toarray())

[[0.         0.36388502 0.54973521 ... 0.         0.         0.        ]
 [0.         0.36388502 0.54973521 ... 0.         0.         0.        ]
 [0.         0.36388502 0.54973521 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.51274474 0.         0.85854111 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [27]:
neigh = NearestNeighbors(n_neighbors = 7, n_jobs = -1, metric = 'euclidean')
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [28]:
test = change_string("Comedy|Drama|Romance")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [29]:
res

(array([[0., 0., 0., 0., 0., 0., 0.]]),
 array([[61012, 61011, 61014, 61013, 61016, 61010, 61017]]))

In [30]:
movies.iloc[res[1][0]]

IndexError: positional indexers are out-of-bounds

Построим рекомендации (регрессия, предсказываем оценку) на TF-IDF по тегам

In [None]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [None]:
movies_with_tags.head()

In [None]:
movies_with_tags[movies_with_tags.title == 'Jumanji (1995)']

In [None]:
movies_with_tags.tag.unique()

In [None]:
movies_with_tags.tag.unique().shape

In [None]:
movies_with_tags.dropna(inplace=True)

In [None]:
movies_with_tags.title.unique().shape

In [None]:
tag_strings = []
movies = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies.append(movie)

In [None]:
tag_strings[:15]

In [None]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [None]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh.fit(X_train_tfidf)

In [None]:
for i in range(len(movies)):
    if 'Jumanji (1995)' == movies[i]:
        print(i)

In [None]:
tag_strings[712]

In [None]:
test = change_string("lawyers")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [None]:
res

In [None]:
for i in res[1][0]:
    print(movies[i])

Рассчитаем средние оценки (+ median, variance, etc.) фильма

In [None]:
# Гистограмма количества оценок рейтинга. Более 8000 фильмов имеют от 0 до 10 оценок. 
# Только 200 фильмов имеют 50 оценок.
ratings.groupby('movieId').rating.count().hist(bins=25)

In [None]:
# Медиана оценок на фильм
ratings.groupby('movieId').rating.count().median()

In [None]:
# Среднее количество (математическое ожидание) оценок на фильм
ratings.groupby('movieId').rating.count().mean()

In [None]:
# Средняя оценка фильма по его id
movieID_mean = ratings.groupby('movieId').rating.mean()
movieID_mean

In [None]:
# Средняя оценка по всем фильмам
np.mean(movieID_mean)

In [None]:
# Дисперсия оценок по всем фильмам
np.var(movieID_mean)

Рассчитаем средние оценки (+ median, variance, etc.) пользователя

In [None]:
# Гистограмма количества оценок на пользователя. Более 360 пользователей поставили от 0 до 10 оценок. 
# 10 пользователей поставили от 490 до 510 оценок.
ratings.groupby('userId').rating.count().hist(bins=30)

In [None]:
# Медиана оценок на пользователя
ratings.groupby('userId').rating.count().median()

In [None]:
# Среднее количество (математическое ожидание) оценок на пользователя
ratings.groupby('userId').rating.count().mean()

In [None]:
# Средняя оценка фильма пользователем
userID_mean = ratings.groupby('userId').rating.mean()
userID_mean

In [None]:
# Средняя оценка по всем пользователям
np.mean(userID_mean)

In [None]:
# Дисперсия оценок по всем пользователям
np.var(userID_mean)