## Датасет

Использовать датасет MovieLens.
Построить рекомендации (регрессия, предсказываем оценку) на фичах:    - 
TF-IDF на тегах и жанрах    - ;
средние оценки (+ median, variance и т. д.) пользователя и фильма.
Оценить RMSE на тестовой выборке.

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import wget
import zipfile

In [2]:
data = 'ml-latest-small'

In [3]:
url = f'https://files.grouplens.org/datasets/movielens/{data}.zip'
wget.download(url, 'MovieLens.zip')

100% [............................................................................] 978202 / 978202

'MovieLens (5).zip'

In [4]:
with zipfile.ZipFile("MovieLens.zip","r") as zip_ref:
    zip_ref.extractall()

links = pd.read_csv(f'./{data}/links.csv')
links.name = 'links'
movies = pd.read_csv(f'./{data}/movies.csv')
movies.name = 'movies'
ratings = pd.read_csv(f'./{data}/ratings.csv')
ratings.name = 'ratings'
tags = pd.read_csv(f'./{data}/tags.csv')
tags.name = 'tags'

In [5]:
links = pd.read_csv(f'./{data}/links.csv')
movies = pd.read_csv(f'./{data}/movies.csv')
ratings = pd.read_csv(f'./{data}/ratings.csv')
tags = pd.read_csv(f'./{data}/tags.csv')

### Проверяем датасет

In [6]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [9]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [10]:
grouped_users = tags['userId'].value_counts()
grouped_users.head()

userId
474    1507
567     432
62      370
599     323
477     280
Name: count, dtype: int64

In [11]:
# сколько всего пользователей ставили оценок
len(grouped_users)

58

In [12]:
grouped_movies = tags['movieId'].value_counts()
len(grouped_movies)

1572

In [13]:
grouped_tags = tags['tag'].value_counts()
grouped_tags.head()

tag
In Netflix queue     131
atmospheric           36
thought-provoking     24
superhero             24
funny                 23
Name: count, dtype: int64

In [14]:
len(grouped_tags)

1589

In [15]:
u_tags = tags['tag'].unique()

In [16]:
len (u_tags)

1589

### Создаем фичи на основе жанров и тегов 

In [17]:
#грузим библиотеки для прелюразования объектов в матрицу объектов TF-IDF.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [18]:
# уберем лишние пробелы и дефисы для дальнейшего формирования матрицы
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [19]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [20]:
# создаем список жанров
movie_genres = [change_string(g) for g in movies.genres.values]
movie_genres[:5]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy']

In [21]:
# Преобразуем необработанные фичи в матрицу объектов TF-IDF.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(movie_genres)
X_train_tfidf

<9742x20 sparse matrix of type '<class 'numpy.float64'>'
	with 22084 stored elements in Compressed Sparse Row format>

In [22]:
# преобразуем в df. 
movie_t = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf.get_feature_names_out())
movie_t.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
movies = movies.drop(columns=['genres', 'title'])
movies = pd.concat([movies, movie_t], axis=1)

In [24]:
movies.head()

Unnamed: 0,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Преобразуем теги в ветора
movies_with_tags = movies.merge(tags, on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,mystery,nogenreslisted,romance,scifi,thriller,war,western,userId,tag,timestamp
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,336,pixar,1139045764
1,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,474,pixar,1137206825
2,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,567,fun,1525286013
3,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62,fantasy,1528843929
4,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62,magic board game,1528843932


In [26]:
movies_with_tags.dropna(inplace=True)

In [27]:
#movies_with_tags.movieId.unique().shape

In [28]:
tag_strings = []
tag_movies = []

for movie, group in tqdm(movies_with_tags.groupby('movieId')):
    tag_strings.append(' '.join([change_string(s) for s in group.tag.values]))
    tag_movies.append(movie)

  0%|          | 0/1572 [00:00<?, ?it/s]

In [29]:

tfidf_tag = TfidfVectorizer()
tag_train_tfidf = tfidf_tag.fit_transform(tag_strings)
tag_train_tfidf = pd.DataFrame(tag_train_tfidf.toarray(), columns=tfidf_tag.get_feature_names_out())

In [30]:
# добавляем теги в df
new_tags = pd.concat([pd.DataFrame(tag_movies, columns=['movieId']), tag_train_tfidf], axis=1)
new_tags.head()

Unnamed: 0,movieId,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
movies_with_tags = movies.merge(new_tags, on='movieId')
movies_with_tags.dropna(inplace=True)
movies_with_tags.head()

Unnamed: 0,movieId,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Добавляем новые фичи 
- среднюю оценку пользователя
- среднюю оценку фильма

In [32]:
# добавляем оценку фильма из таблицы ratings и сразу удаляем timestamp
movies_with_ratings = movies_with_tags.merge(ratings, on='movieId').drop(columns=['timestamp'])
len(movies_with_ratings)

48287

In [33]:
movies_with_ratings.head()

Unnamed: 0,movieId,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,...,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,userId,rating
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,4.0
1,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,4.0
2,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,4.5
3,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15,2.5
4,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17,4.5


In [34]:
mean_mr = movies_with_ratings.groupby('movieId')['rating'].mean()
mean_ur = movies_with_ratings.groupby('userId')['rating'].mean()

In [35]:
movies_with_ratings['mean_movie_rating'] = movies_with_ratings['movieId'].apply(lambda x: mean_mr[x])
movies_with_ratings['mean_user_rating'] = movies_with_ratings['userId'].apply(lambda x: mean_ur[x])

In [36]:
movies_with_ratings.head()

Unnamed: 0,movieId,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,...,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,userId,rating,mean_movie_rating,mean_user_rating
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,1,4.0,3.92093,4.403509
1,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,5,4.0,3.92093,3.74359
2,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,7,4.5,3.92093,3.543956
3,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,15,2.5,3.92093,3.5625
4,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,17,4.5,3.92093,4.27907


In [37]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [38]:
X = movies_with_ratings.drop(columns=['rating'])
y = pd.DataFrame(movies_with_ratings['rating'])

In [39]:
sc_x = StandardScaler()
sc_y = StandardScaler()
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size=0.2, random_state=42)

In [41]:
model = Ridge(alpha=0.1)
model.fit(X_train, y_train)

In [42]:
pred = model.predict(X_test)

In [43]:
np.sqrt(mean_squared_error(y_test, pred))

0.8322089739377408

In [44]:
model = Lasso(alpha=0.1)
model.fit(X_train, y_train)
pred = model.predict(X_test)
np.sqrt(mean_squared_error(y_test, pred))

0.8306050944524038

### Вывод
Были произведены преобразования данных в вектора (TF-IDF) для признаков жанры и теги. Добавлены средние значения по рейтингу, пользователям. 
Построены модели регрессии, предсказывающие rating
Проведена RMSE на тестовой выборке.