In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error, mean_squared_error

%matplotlib inline

In [2]:
# Read the four CSV tables from the current working directory into same-named
# DataFrames (file names look like a MovieLens export — confirm provenance).
links, movies, ratings, tags = (
    pd.read_csv('links.csv'),
    pd.read_csv('movies.csv'),
    pd.read_csv('ratings.csv'),
    pd.read_csv('tags.csv'),
)

In [3]:
# Attach every individual rating to its movie (left join on movieId), so a
# movie row is repeated once per rating it received and keeps its original
# index label from `movies`.
movies_with_ratings = movies.join(
    other=ratings.set_index('movieId'),
    on='movieId',
    how='left',
)
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [4]:
# Collapse the per-rating rows into one mean rating per movie, then left-join
# the user tags on movieId. A movie that has several tag rows is repeated once
# per tag, and the tag table's userId / timestamp columns come along too.
movies_with_ratings_and_tags = (
    movies_with_ratings
    .groupby(['movieId', 'title', 'genres'])[['rating']]
    .mean()
    .reset_index()
    .join(tags.set_index('movieId'), on='movieId')
)


In [5]:
# Preview the first five rows of the joined movie / mean-rating / tag table.
movies_with_ratings_and_tags.head(n=5)

Unnamed: 0,movieId,title,genres,rating,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,62.0,magic board game,1528844000.0


In [6]:
# Keep only the movie identity, genres, mean rating and tag columns; the
# userId and timestamp columns brought in by the tag join are dropped here.
# NOTE(review): `tag` is retained, but the modelling cells visible in this
# section only read `genres` and `rating` — confirm whether it is needed.
best_movies = movies_with_ratings_and_tags[['movieId', 'title', 'genres', 'rating','tag']]

In [7]:
def change_string(s):
    """Convert a pipe-delimited genre string into space-separated tokens.

    Spaces and hyphens inside each genre are removed so that every genre
    becomes a single word, e.g. ``'Action|Sci-Fi'`` -> ``'Action SciFi'``.

    Parameters
    ----------
    s : str
        Genres separated by ``'|'``.

    Returns
    -------
    str
        The cleaned genre names joined by single spaces.
    """
    compact_genres = [
        genre.replace(' ', '').replace('-', '')
        for genre in s.split('|')
    ]
    return ' '.join(compact_genres)

In [8]:
# Normalise the genre strings so each genre is one whitespace-separated token.
# `assign` returns a new frame instead of writing a column into `best_movies`
# in place; the in-place attribute assignment raised SettingWithCopyWarning
# because `best_movies` was produced by selecting columns from another frame.
# NOTE: `change_string` strips the very spaces it inserts, so applying this
# cell twice to the same frame would fuse all genres into one token — run it
# once per freshly built `best_movies`.
best_movies = best_movies.assign(genres=best_movies['genres'].apply(change_string))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [9]:
# Keep only rows that have a mean rating (presumably the null ones are movies
# that received no ratings and survived the earlier left join — confirm).
best_movies = best_movies[best_movies['rating'].notnull()]

In [10]:
# Token counts over the space-separated genre strings (one column per genre
# token found by CountVectorizer's default tokenizer).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(best_movies['genres'])

# Re-weight the raw counts with TF-IDF.
# NOTE(review): despite the `X_train_` prefix, both transformers are fitted on
# every row of `best_movies`, i.e. before any train/test split — confirm this
# is intended.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [11]:
# Features: TF-IDF genre matrix. Target: the per-movie mean rating column.
X, y = X_train_tfidf, best_movies['rating']

In [12]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the frame has one row per (movie, tag), so identical
# genre/rating rows of the same movie may fall into both partitions — confirm
# this is acceptable for the reported error.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Ordinary least-squares regression of mean rating on the TF-IDF genre features.
# `fit` returns the estimator itself, so the bare `model` expression below
# displays the same fitted-estimator repr as before.
model = LinearRegression().fit(X_train, y_train)
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [14]:
# Predict mean ratings for the held-out rows.
predictions = model.predict(X_test)

# Report the root of the mean squared error between held-out and predicted ratings.
print(
    'root_mean_squared_error = ',
    np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions)),
)

root_mean_squared_error =  0.7932492480208965
