In [69]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Загрузка данных

In [70]:
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [71]:
tags = pd.read_csv('tags.csv')
#tags.drop('timestamp')
tags = tags[['movieId', 'tag']]
tags.head()

Unnamed: 0,movieId,tag
0,60756,funny
1,60756,Highly quotable
2,60756,will ferrell
3,89774,Boxing story
4,89774,MMA


# TFIDF на жанрах и тегах

In [72]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [73]:
#tags.drop('timestamp')
tags['tag'] = tags.groupby('movieId').transform(lambda x: '|'.join(x))
#tags = tags.groupby('movieId')['tag'].apply(lambda x: '|'.join(x), axis=1)
#tags = tags.groupby('movieId')['tag'].apply(parse_tags)
#tags = tags.groupby('movieId').sum()
#df[['name','text','month']].drop_duplicates()
tags = tags.drop_duplicates()
tags.head()

Unnamed: 0,movieId,tag
0,60756,funny|Highly quotable|will ferrell|comedy|funn...
3,89774,Boxing story|MMA|Tom Hardy
6,106782,drugs|Leonardo DiCaprio|Martin Scorsese|Stock ...
9,48516,way too long|Leonardo DiCaprio|suspense|twist ...
10,431,Al Pacino|gangster|mafia


In [74]:
movies_with_tags = pd.merge(movies, tags, on = 'movieId')

In [75]:
movies_with_tags['genres_split'] = movies_with_tags.genres.str.split('|')
movies_with_tags['genres_space'] = movies_with_tags.apply(lambda r: ' '.join(r['genres_split']), axis=1)
movies_with_tags['tags_split'] = movies_with_tags.genres.str.split('|')
movies_with_tags['tags_space'] = movies_with_tags.apply(lambda r: ' '.join(r['tags_split']), axis=1)
movies_with_tags['genres_and_tags_space'] = movies_with_tags.apply(lambda r: r['genres_space']+ ' ' +r['tags_space'], axis=1)
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,tag,genres_split,genres_space,tags_split,tags_space,genres_and_tags_space
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar|pixar|fun,"[Adventure, Animation, Children, Comedy, Fantasy]",Adventure Animation Children Comedy Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",Adventure Animation Children Comedy Fantasy,Adventure Animation Children Comedy Fantasy Ad...
1,2,Jumanji (1995),Adventure|Children|Fantasy,fantasy|magic board game|Robin Williams|game,"[Adventure, Children, Fantasy]",Adventure Children Fantasy,"[Adventure, Children, Fantasy]",Adventure Children Fantasy,Adventure Children Fantasy Adventure Children ...
2,3,Grumpier Old Men (1995),Comedy|Romance,moldy|old,"[Comedy, Romance]",Comedy Romance,"[Comedy, Romance]",Comedy Romance,Comedy Romance Comedy Romance
3,5,Father of the Bride Part II (1995),Comedy,pregnancy|remake,[Comedy],Comedy,[Comedy],Comedy,Comedy Comedy
4,7,Sabrina (1995),Comedy|Romance,remake,"[Comedy, Romance]",Comedy Romance,"[Comedy, Romance]",Comedy Romance,Comedy Romance Comedy Romance


In [76]:
cnt_vec = CountVectorizer()

In [77]:
processed = cnt_vec.fit_transform(movies_with_tags['genres_and_tags_space'])

In [78]:
tfidf = TfidfTransformer()

In [79]:
tfidf_dense = tfidf.fit_transform(processed).todense()

In [80]:
columns = [None for i in range(len(cnt_vec.vocabulary_))]
for k in cnt_vec.vocabulary_:
    columns[cnt_vec.vocabulary_[k]] = k

In [81]:
df_tfidf = pd.DataFrame(tfidf_dense, columns=columns)

In [82]:
df_tfidf.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,0.0,0.398613,0.521641,0.511277,0.282182,0.0,0.0,0.0,0.477459,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.495081,0.0,0.635009,0.0,0.0,0.0,0.0,0.593008,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.765744,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.765744,0.0,0.0,0.0,0.0


In [83]:
movies_with_tfidf = pd.concat((movies, df_tfidf), axis=1)

In [84]:
movies_with_tfidf.columns

Index(['movieId', 'title', 'genres', 'action', 'adventure', 'animation',
       'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'fi',
       'film', 'genres', 'horror', 'imax', 'listed', 'musical', 'mystery',
       'no', 'noir', 'romance', 'sci', 'thriller', 'war', 'western'],
      dtype='object')

In [85]:
fc = ['movieId', 'title', 'action', 'adventure', 'animation',
       'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'fi',
       'film', 'genres', 'horror', 'imax', 'listed', 'musical', 'mystery',
       'no', 'noir', 'romance', 'sci', 'thriller', 'war', 'western']

In [86]:
movies_for_ds = movies_with_tfidf[fc]
movies_for_ds.head()

Unnamed: 0,movieId,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,1,Toy Story (1995),0.0,0.398613,0.521641,0.511277,0.282182,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.0,0.495081,0.0,0.635009,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.765744,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.765744,0.0,0.0,0.0,0.0


In [87]:
ratings = pd.read_csv('ratings.csv')

In [88]:
movies_with_ratings = pd.merge(ratings, movies_for_ds, on='movieId')

In [89]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [90]:
import numpy as np

In [91]:
TARGET_USER = 503

In [101]:
df_for_user = movies_with_ratings[movies_with_ratings['userId']==TARGET_USER]
df_for_user = df_for_user[np.isfinite(df_for_user['action'])]

In [102]:
X, y = df_for_user[[
           'action', 'adventure', 'animation', 'children', 'comedy', 'crime',
           'documentary', 'drama', 'fantasy', 'fi', 'film', 'horror', 'imax',
           'listed', 'musical', 'mystery', 'no', 'noir', 'romance', 'sci',
           'thriller', 'war', 'western']], df_for_user['rating']

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [106]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [107]:
models = [LinearRegression, Lasso, Ridge, RandomForestRegressor, SVR]

In [108]:
for m in models:
        model = m()
        model.fit(X_train, y_train)
        print("{}. r2_train: {:.4f}, r2_test: {:.4f}, mae_train: {:.4f}, mae_test: {:.4f}".format(
            m.__name__, model.score(X_train ,y_train), model.score(X_test, y_test),
            mean_absolute_error(model.predict(X_train), y_train),
            mean_absolute_error(model.predict(X_test), y_test)
        ))

LinearRegression. r2_train: 1.0000, r2_test: -4.2318, mae_train: 0.0000, mae_test: 2.0639
Lasso. r2_train: 0.0000, r2_test: -1.1912, mae_train: 0.7708, mae_test: 1.1875
Ridge. r2_train: 0.9890, r2_test: -0.2152, mae_train: 0.0958, mae_test: 0.9521
RandomForestRegressor. r2_train: 0.8197, r2_test: -1.8700, mae_train: 0.3167, mae_test: 1.4875
SVR. r2_train: 0.6326, r2_test: -1.2929, mae_train: 0.3744, mae_test: 1.2932
