# Загрузка библиотек

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# Загрузка датасета

In [4]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [5]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [8]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


# Преобразование данных

## TF-IDF жанров

In [9]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [10]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(movie_genres)
X_train_tfidf

<9742x20 sparse matrix of type '<class 'numpy.float64'>'
	with 22084 stored elements in Compressed Sparse Row format>

In [12]:
tf_idf_genres = pd.DataFrame(
    X_train_tfidf.toarray(),
    columns=tfidf.get_feature_names_out())

tf_idf_genres.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
movies_genres = pd.concat([movies, tf_idf_genres], axis=1)

movies_genres = movies_genres.drop(columns=["genres"])

movies_genres.head()

Unnamed: 0,movieId,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## TF-IDF тегов

In [14]:
movies_genres_tags = movies_genres.merge(tags, on='movieId')

movies_genres_tags.dropna(inplace=True)

movies_genres_tags.head()

Unnamed: 0,movieId,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,mystery,nogenreslisted,romance,scifi,thriller,war,western,userId,tag,timestamp
0,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,336,pixar,1139045764
1,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,474,pixar,1137206825
2,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,567,fun,1525286013
3,2,Jumanji (1995),0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62,fantasy,1528843929
4,2,Jumanji (1995),0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62,magic board game,1528843932


In [15]:
tag_strings = []

for movie_id, group in tqdm(movies_genres_tags.groupby('title')):
    tag_strings.append(' '.join([change_string(s) for s in group.tag.values]))

  0%|          | 0/1572 [00:00<?, ?it/s]

In [16]:
tag_strings[:5]

['artistic Funny humorous inspiring intelligent quirky romance ZooeyDeschanel',
 'lawyers',
 'creepy suspense',
 'Shakespearesortof',
 'dogs remake']

In [17]:
tfidf = TfidfVectorizer()
X_train_tfidf_tag = tfidf.fit_transform(tag_strings)
X_train_tfidf_tag

<1572x1472 sparse matrix of type '<class 'numpy.float64'>'
	with 3598 stored elements in Compressed Sparse Row format>

In [18]:
tf_idf_tags = pd.DataFrame(
    X_train_tfidf_tag.toarray(),
    columns=tfidf.get_feature_names_out())

tf_idf_tags.head()

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
movies_names_with_tags = movies_genres_tags["title"].unique()

movies_names_with_tags

array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       ..., 'Deadpool 2 (2018)', 'Solo: A Star Wars Story (2018)',
       'Gintama: The Movie (2010)'], dtype=object)

In [20]:
tf_idf_tags["title"] = movies_names_with_tags

tf_idf_tags.head()

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,title
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342,Toy Story (1995)
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Jumanji (1995)
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Grumpier Old Men (1995)
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Father of the Bride Part II (1995)
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Sabrina (1995)


In [21]:
movies_genres_tags = movies_genres.merge(tf_idf_tags, on='title')

movies_genres_tags.dropna(inplace=True)

movies_genres_tags.head()

Unnamed: 0,movieId,title,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
1,2,Jumanji (1995),0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,Sabrina (1995),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
movies_genres_tags.shape

(1574, 1494)

## Добавление таблицы с рейтингом

In [23]:
final_dataset = ratings.merge(movies_genres_tags, on="movieId")

final_dataset

Unnamed: 0,userId,movieId,rating,timestamp,title,action_x,adventure_x,animation_x,children_x,comedy_x,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,1,1,4.0,964982703,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
1,5,1,4.0,847434962,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
2,7,1,4.5,1106635946,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
3,15,1,2.5,1510577970,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
4,17,1,4.5,1305696483,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48284,567,176419,3.0,1525287581,Mother! (2017),0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
48285,599,176419,3.5,1516604655,Mother! (2017),0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
48286,594,7023,4.5,1108972356,"Wedding Banquet, The (Xi yan) (1993)",0.0,0.000000,0.000000,0.000000,0.505015,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
48287,606,6107,4.0,1171324428,Night of the Shooting Stars (Notte di San Lore...,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


# Преобразование признаков датасета

In [24]:
final_dataset = final_dataset.drop(columns=["title", "timestamp"])

## Добавление признаков

In [27]:
final_dataset["user_avg_rating"] = final_dataset.groupby("userId")["rating"].transform("mean")
final_dataset["movie_avg_rating"] = final_dataset.groupby("movieId")["rating"].transform("mean")

final_dataset['user_rating_variance'] = final_dataset.groupby('userId')['rating'].transform('var')
final_dataset['movie_rating_variance'] = final_dataset.groupby('movieId')['rating'].transform('var')

final_dataset['user_min_rating'] = final_dataset.groupby('userId')['rating'].transform('min')
final_dataset['movie_min_rating'] = final_dataset.groupby('movieId')['rating'].transform('min')

final_dataset['user_max_rating'] = final_dataset.groupby('userId')['rating'].transform('max')
final_dataset['movie_max_rating'] = final_dataset.groupby('movieId')['rating'].transform('max')

final_dataset['user_rating_count'] = final_dataset.groupby('userId')['rating'].transform('count')
final_dataset['movie_rating_count'] = final_dataset.groupby('movieId')['rating'].transform('count')


final_dataset

Unnamed: 0,userId,movieId,rating,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,...,user_avg_rating,movie_avg_rating,user_rating_variance,movie_rating_variance,user_min_rating,movie_min_rating,user_max_rating,movie_max_rating,user_rating_count,movie_rating_count
0,1,1,4.0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,4.403509,3.92093,0.667598,0.69699,1.0,0.5,5.0,5.0,114,215
1,5,1,4.0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,3.743590,3.92093,0.879892,0.69699,2.0,0.5,5.0,5.0,39,215
2,7,1,4.5,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,3.543956,3.92093,1.370269,0.69699,0.5,0.5,5.0,5.0,91,215
3,15,1,2.5,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,3.562500,3.92093,1.154272,0.69699,1.0,0.5,5.0,5.0,80,215
4,17,1,4.5,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,4.279070,3.92093,0.256498,0.69699,3.0,0.5,5.0,5.0,86,215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48284,567,176419,3.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,3.002907,3.25000,1.343559,0.12500,0.5,3.0,5.0,3.5,172,2
48285,599,176419,3.5,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,3.080292,3.25000,0.566643,0.12500,1.0,3.0,5.0,3.5,685,2
48286,594,7023,4.5,0.0,0.000000,0.000000,0.000000,0.505015,0.0,0.0,...,4.288235,4.50000,0.758193,,0.5,4.5,5.0,4.5,85,1
48287,606,6107,4.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,3.808725,4.00000,0.460529,,1.0,4.0,5.0,4.0,447,1


In [28]:
final_dataset = final_dataset.fillna(0)

final_dataset

Unnamed: 0,userId,movieId,rating,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,...,user_avg_rating,movie_avg_rating,user_rating_variance,movie_rating_variance,user_min_rating,movie_min_rating,user_max_rating,movie_max_rating,user_rating_count,movie_rating_count
0,1,1,4.0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,4.403509,3.92093,0.667598,0.69699,1.0,0.5,5.0,5.0,114,215
1,5,1,4.0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,3.743590,3.92093,0.879892,0.69699,2.0,0.5,5.0,5.0,39,215
2,7,1,4.5,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,3.543956,3.92093,1.370269,0.69699,0.5,0.5,5.0,5.0,91,215
3,15,1,2.5,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,3.562500,3.92093,1.154272,0.69699,1.0,0.5,5.0,5.0,80,215
4,17,1,4.5,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,4.279070,3.92093,0.256498,0.69699,3.0,0.5,5.0,5.0,86,215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48284,567,176419,3.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,3.002907,3.25000,1.343559,0.12500,0.5,3.0,5.0,3.5,172,2
48285,599,176419,3.5,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,3.080292,3.25000,0.566643,0.12500,1.0,3.0,5.0,3.5,685,2
48286,594,7023,4.5,0.0,0.000000,0.000000,0.000000,0.505015,0.0,0.0,...,4.288235,4.50000,0.758193,0.00000,0.5,4.5,5.0,4.5,85,1
48287,606,6107,4.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,3.808725,4.00000,0.460529,0.00000,1.0,4.0,5.0,4.0,447,1


# Разделение на X и Y

In [29]:
from sklearn.model_selection import train_test_split


X = final_dataset.drop(columns=["userId", "movieId", "rating"])
y = final_dataset["rating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape

((38631, 1502), (9658, 1502))

# Обучение RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


rfr = RandomForestRegressor(random_state=42)

rfr.fit(X_train, y_train)

y_pred = rfr.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"RMSE: {rmse}")

RMSE: 0.8001858185263194


# Обучение GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor


gbr = GradientBoostingRegressor(random_state=42)

gbr.fit(X_train, y_train)

y_pred = gbr.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"RMSE: {rmse}")

RMSE: 0.7847283228994816


# Обучение StackingRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingRegressor


estimators = [
    ('rfr', RandomForestRegressor()),
    ('gbr', GradientBoostingRegressor())
]


stacker = StackingRegressor(estimators=estimators, final_estimator=GradientBoostingRegressor())


stacker.fit(X_train, y_train)

y_pred = stacker.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE: {rmse}")


RMSE: 0.782903537409172


# GradientBoostingRegressor с подбором параметров

In [30]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error


gbr = GradientBoostingRegressor()


param_grid = {
    'learning_rate': [0.1, 0.3, 0.5, 1],
    'n_estimators': [100, 200, 300, 500, 1000],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}


random_search = RandomizedSearchCV(gbr, param_grid, cv=5, n_iter=10, scoring='neg_mean_squared_error', n_jobs=-1)
random_search.fit(X_train, y_train)


best_params = random_search.best_params_
best_model = random_search.best_estimator_


y_pred = best_model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE: {rmse}")

RMSE: 0.7851452302675789


# Вывод

Подводя итоги, модели справились примерно одинаково со своей задачей, RMSE = примерно 0.79, тем самым указывая на то, что улучшение предсказаний может быть реализуемо несколькими способами:
1. Увеличение датасета (можно взять полный датасет MovieLens)
2. Добавление новых признаков

В рамках учебной практики не могу сказать, что результат оказался плохим. В целом, есть ключевые точки, ведущие к улучшению метрик.