In [90]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

Сгруппируем все теги для каждого фильма

In [3]:
new_tag = tags.groupby('movieId').tag.agg(lambda x: x.tolist())

Добавим новый столбец тегов для каждого фильма

In [4]:
new_movies = pd.merge(movies, new_tag, how='inner', on='movieId')

In [5]:
new_movies.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[pixar, pixar, fun]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[fantasy, magic board game, Robin Williams, game]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[moldy, old]"
3,5,Father of the Bride Part II (1995),Comedy,"[pregnancy, remake]"
4,7,Sabrina (1995),Comedy|Romance,[remake]


Объединим жанры и теги в одной колонке

In [6]:
def change_genres(s):
    return s.replace(' ', '').replace('-', '').split('|')

def change_tag(s):
    new_tags = []
    for tag in s:     
        new_tags.append(tag.replace(' ', ''))
    return new_tags

In [7]:
new_movies['list_genres'] = new_movies['genres'].apply(change_genres)
new_movies['list_tags'] = new_movies['tag'].apply(change_tag)
new_movies['genres_tags'] = new_movies['list_genres'] + new_movies['list_tags']
new_movies['Genres_Tags'] = new_movies['genres_tags'].agg(lambda x: ' '.join(map(str, x)))
new_movies.drop(['title', 'genres', 'tag', 'list_genres', 'list_tags', 'genres_tags'], axis=1, inplace=True)

In [8]:
new_movies.head()

Unnamed: 0,movieId,Genres_Tags
0,1,Adventure Animation Children Comedy Fantasy pi...
1,2,Adventure Children Fantasy fantasy magicboardg...
2,3,Comedy Romance moldy old
3,5,Comedy pregnancy remake
4,7,Comedy Romance remake


Добавим для кадждого фильма среднюю оценку

In [9]:
group_raitings = ratings.groupby('movieId').rating.agg(['mean'])

In [10]:
new_movies = new_movies.merge(group_raitings, how='inner', on='movieId')

Строим TFIDF Дата Фрейм жанров и тегов для каждого фильма

In [32]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(new_movies.Genres_Tags)
names = tfidf.get_feature_names()
tfidf_matrix = pd.DataFrame(tfidf_matrix.toarray(), columns=names)

Добавим в датафрейм данные по средней оценке фильма

In [35]:
TFIDF_with_mean = pd.concat([tfidf_matrix, new_movies], axis=1)
TFIDF_with_mean.drop(['movieId', 'Genres_Tags'], axis=1, inplace=True)

In [36]:
TFIDF_with_mean.head()

Unnamed: 0,06oscarnominatedbestmovie,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001,2danimation,...,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,mean
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.431818
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.259615
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.071429
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.185185


In [37]:
TFIDF_with_mean.shape

(1554, 1506)

Создадим данные для обучения модели

In [44]:
X = TFIDF_with_mean.copy()
del X['mean']
y = TFIDF_with_mean['mean']

In [93]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [94]:
RFR = RandomForestRegressor(n_jobs=-1)

In [95]:
RFR.fit(x_train, y_train)
RFR.score(x_train, y_train)

0.8189327187092643

In [96]:
DTree = DecisionTreeRegressor()

In [97]:
DTree.fit(x_train, y_train)
DTree.score(x_train, y_train)

0.9242850177940937

In [98]:
y_predict_RFR = RFR.predict(x_test)
y_predict_DTree = DTree.predict(x_test)

In [99]:
print(f'RMSE RFR: {mean_squared_error(y_test, y_predict_RFR, squared=False)}')
print(f'MSE RFR: {mean_squared_error(y_test, y_predict_RFR)}')

print(f'RMSE DTree: {mean_squared_error(y_test, y_predict_DTree, squared=False)}')
print(f'MSE DTree: {mean_squared_error(y_test, y_predict_DTree)}')

RMSE RFR: 0.5195820455843376
MSE RFR: 0.26996550209360465
RMSE DTree: 0.5803036760617126
MSE DTree: 0.33675235645073714
