In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
movies = pd.read_csv('/Users/gost1/Desktop/Recommender systems/Downloads/movies.csv')
ratings = pd.read_csv('/Users/gost1/Desktop/Recommender systems/Downloads/ratings.csv')
links = pd.read_csv('/Users/gost1/Desktop/Recommender systems/Downloads/links.csv')
tags = pd.read_csv('/Users/gost1/Desktop/Recommender systems/Downloads/tags.csv')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


## Средние оценки (+ median, variance, etc.) пользователя и фильма

In [6]:
df = movies.merge(ratings.assign(Вratings=ratings['movieId'].astype('float32')), on=['movieId'], how='left')

In [7]:
mean_movie = ratings[['movieId', 'rating']].groupby(['movieId']).mean()
df = df.merge(mean_movie, on='movieId', how='left')
df = df.rename(columns={df.columns[7]: "mean_movie" })

In [8]:
mean_user = ratings[['userId', 'rating']].groupby(['userId']).mean()
df = df.merge(mean_user, on='userId', how='left')
df = df.rename(columns={df.columns[8]: "mean_user" })

In [9]:
median_user = ratings[['userId', 'rating']].groupby(['userId']).median()
df = df.merge(median_user, on='userId', how='left')
df = df.rename(columns={df.columns[9]: "median_user" })

In [10]:
var_user = ratings[['userId', 'rating']].groupby(['userId']).var()
df = df.merge(var_user, on='userId', how='left')
df = df.rename(columns={df.columns[10]: "var_user" })

In [11]:
median_movie = ratings[['movieId', 'rating']].groupby(['movieId']).median()
df = df.merge(median_movie, on='movieId', how='left')
df = df.rename(columns={df.columns[11]: "median_movie" })

In [12]:
var_movie = ratings[['movieId', 'rating']].groupby(['movieId']).var()
df = df.merge(var_movie, on='movieId', how='left')
df = df.rename(columns={df.columns[12]: "var_movie" })

In [13]:
df.drop(['title', 'genres', 'Вratings'], axis=1, inplace=True)

In [14]:
df.head()

Unnamed: 0,movieId,userId,rating_x,timestamp,mean_movie,mean_user,median_user,var_user,median_movie,var_movie
0,1,1.0,4.0,964982700.0,3.92093,4.366379,5.0,0.640077,4.0,0.69699
1,1,5.0,4.0,847435000.0,3.92093,3.636364,4.0,0.980973,4.0,0.69699
2,1,7.0,4.5,1106636000.0,3.92093,3.230263,3.5,1.76782,4.0,0.69699
3,1,15.0,2.5,1510578000.0,3.92093,3.448148,3.5,1.284605,4.0,0.69699
4,1,17.0,4.5,1305696000.0,3.92093,4.209524,4.0,0.258562,4.0,0.69699


In [15]:
df.dropna(inplace=True)

In [16]:
Y = df['rating_x']

In [17]:
df.drop(['rating_x'], axis=1, inplace=True)
X = df

In [18]:
model = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15)

In [19]:
model.fit(X_train, y_train)

LinearRegression()

In [20]:
y_pred = model.predict(X_test)

In [21]:
RMSE = mean_squared_error(y_test, y_pred, squared=False)

In [22]:
print("RMSE =", RMSE)

RMSE = 0.8159890517059791


## TF-IDF на тегах и жанрах

In [23]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [24]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [25]:
movie_genres[2:10]

['Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [26]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [27]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [28]:
X_train_tfidf.toarray()

array([[0.        , 0.41684567, 0.51622547, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.51236121, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.57860574, 0.        , 0.81560738, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [29]:
Tfid_genres = X_train_tfidf.toarray()

In [30]:
Tfid_genres = pd.DataFrame(Tfid_genres)

In [31]:
tags_genres = [g for g in tags.tag.values]

In [32]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tags_genres)

In [33]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [43]:
Tfid_tags = X_train_tfidf.toarray()
Tfid_tags = pd.DataFrame(Tfid_tags)

In [44]:
tags_with_Tfid = pd.merge(tags, Tfid_tags, left_index=True, right_index=True)

In [45]:
movies_with_Tfid = pd.merge(movies, Tfid_genres, left_index=True, right_index=True)

In [46]:
df = movies_with_Tfid.merge(ratings.assign(Вratings=ratings['movieId'].astype('float32')), on=['movieId'], how='left')

In [47]:
df  = pd.merge(left=df, right=tags_with_Tfid, on=['movieId','userId'], how='inner')

In [48]:
df.drop(['title', 'genres', 'Вratings', 'tag'], axis=1, inplace=True)

In [49]:
df.dropna(inplace=True)

Y = df['rating']

df.drop(['rating'], axis=1, inplace=True)
X = df

In [50]:
model = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [51]:
RMSE = mean_squared_error(y_test, y_pred, squared=False)

print("RMSE =", RMSE)

RMSE = 0.8316417332329394
