In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

%matplotlib inline

In [2]:
links = pd.read_csv('ml-latest-small/links.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')

In [3]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
d=datetime.fromtimestamp(tags['timestamp'][1])

**Выделим 10ку самых акивных пользователей по числу оценок, а так же количеству тэгов**

In [8]:
ratings.groupby('userId')['rating'].count().sort_values(ascending=False)[:10]

userId
414    2698
599    2478
474    2108
448    1864
274    1346
610    1302
68     1260
380    1218
606    1115
288    1055
Name: rating, dtype: int64

In [9]:
tags.groupby('userId')['tag'].count().sort_values(ascending=False)[:10]

userId
474    1507
567     432
62      370
599     323
477     280
424     273
537     100
125      48
357      45
318      41
Name: tag, dtype: int64

**Пользователь с номером 474 достаточно активен в обоих наборах данных**

In [10]:
tags474 = tags[tags['userId']==474][['movieId', 'tag', 'timestamp']]
ratings474 = ratings[ratings['userId']==474][['movieId', 'rating', 'timestamp']]

**Сначала векторизуем тэги**

In [11]:
tags474.tag = tags474['tag'].apply(lambda x: ' '.join(x.replace(' ', '').replace('-','').split()))

In [12]:
countvec = CountVectorizer()
tags474vec = countvec.fit_transform(tags474.tag)
tfidf = TfidfTransformer()
tags474tfidf = tfidf.fit_transform(tags474vec)

In [13]:
countvec.vocabulary_

{'pixar': 377,
 'game': 192,
 'pregnancy': 385,
 'remake': 413,
 'politics': 381,
 'president': 388,
 'mafia': 293,
 'janeausten': 261,
 'hollywood': 229,
 'serialkiller': 444,
 'alcoholism': 20,
 'shakespeare': 447,
 'innetflixqueue': 250,
 'kidnapping': 274,
 'highschool': 224,
 'teacher': 500,
 'timetravel': 513,
 'animalmovie': 30,
 'pigs': 376,
 'deathpenalty': 135,
 'nun': 353,
 'twins': 530,
 'emma': 163,
 'southafrica': 467,
 'england': 165,
 'journalism': 270,
 'wedding': 551,
 'heist': 220,
 'adoption': 13,
 'prostitution': 394,
 'writing': 565,
 'music': 334,
 'jekyllandhyde': 266,
 'theater': 509,
 'crime': 124,
 'golf': 203,
 'muppets': 332,
 'scotland': 440,
 'assassination': 41,
 'holocaust': 230,
 'dating': 130,
 'moon': 320,
 'nasa': 338,
 'space': 470,
 'superhero': 496,
 'michaelcrichton': 310,
 'submarine': 491,
 'computers': 116,
 'mademecry': 292,
 'generationx': 196,
 'school': 439,
 'ireland': 256,
 'mentalillness': 306,
 'psychology': 396,
 'stephenking': 488,


In [14]:
tags474tfidf = tags474tfidf.toarray()

In [15]:
for column in range(tags474tfidf.shape[1]):
    col_name = 't'+ str(column)
    tags474[col_name] = pd.Series(tags474tfidf[:,column])
    

**Векторизуем жанры. Заодно выделим год создания фильма в отдельный параметр**

In [16]:
movies['genres'] = movies.apply(lambda x: ' '.join(x['genres'].replace(' ', '').replace('-','').split('|')), axis=1)

In [17]:
def extract_year(x):
    try:
        return int(x.rstrip()[-5:-1])
    except:
        return 0

In [18]:
movies['year'] = movies['title'].apply(extract_year)

In [19]:
movies = pd.get_dummies(movies, columns=['year'])

In [20]:
tfidfvec = TfidfVectorizer()
tfidfgenres = tfidfvec.fit_transform(movies.genres)
tfidfgenres = tfidfgenres.toarray()

In [21]:
for column in range(tfidfgenres.shape[1]):
    col_name = 'g'+ str(column)
    movies[col_name] = pd.Series(tfidfgenres[:,column])

**Добавим средний рейтинг к фильмам**

In [22]:
rating_mean = ratings.groupby('movieId')[['rating']].mean()

In [23]:
movies=movies.join(rating_mean, on='movieId')

In [24]:
movies.rename(columns={'rating':'mean_rating'}, inplace=True)

**Далее все объединяем**

In [25]:
user474 = ratings474.join(tags474.set_index('movieId'), on='movieId', lsuffix='_rating', rsuffix='_tag')

In [26]:
user474 = user474.join(movies.set_index('movieId'), on='movieId')

In [27]:
user474.drop(labels=['tag', 'title', 'genres', 'movieId','timestamp_rating', 'timestamp_tag'], axis=1, inplace=True)

In [28]:
user474.fillna(value=0, inplace=True)

**Подготовка модели**

In [29]:
X = user474.drop(labels=['rating'], axis=1)
y = user474.rating

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [46]:
model = KNeighborsRegressor()

In [47]:
grid = GridSearchCV(model, {'n_neighbors':range(10,15,1), 'p':[1,2]}, cv=5, scoring='neg_mean_squared_error')

In [48]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': range(10, 15), 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [49]:
grid.best_params_

{'n_neighbors': 12, 'p': 2}

In [50]:
mean_squared_error(y_test, grid.predict(X_test))

0.4389379910213243

In [51]:
mean_absolute_error(y_test, grid.predict(X_test))

0.5064534231200898

**Ошибка в 0.5 баллов не должна быть сильно критичной.**