In [67]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

import math

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

# Features engineering

## Genres

In [3]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [4]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [5]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [6]:
tfid = pd.DataFrame(X_train_tfidf.toarray(), columns=count_vect.get_feature_names())

In [7]:
movie_genres = pd.concat([movies, tfid], axis=1)
del(movie_genres['genres'])

In [8]:
movie_genres

Unnamed: 0,movieId,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,1,Toy Story (1995),0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,193583,No Game No Life: Zero (2017),0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,193585,Flint (2017),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


## Tags

In [9]:
# dictionary to then apply it for main dataset
mov_tags = {}


def stack_tags(row):
    global mov_tags
    if row['movieId'] in mov_tags:
        # I will devide tags by $$$
        mov_tags[row['movieId']] += '$$$' + str(row['tag']) 
    else:
        mov_tags[row['movieId']] = str(row['tag'])

    
tags.apply(stack_tags, axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
3678    None
3679    None
3680    None
3681    None
3682    None
Length: 3683, dtype: object

In [10]:
def assign_tags(id):
    if id in mov_tags:
        return mov_tags[id]
    else:
        return ''

movie_genres['tags'] = movie_genres['movieId'].apply(assign_tags)

In [11]:
count_vect = CountVectorizer(token_pattern=r'^[^$]+|(?<=\${3})[^$]+')
X_train_counts = count_vect.fit_transform(movie_genres['tags'])

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

TF-IDF for tags is too huge and if I try to convert it to DataFrame I will get Memory Error. So I don't concat it to dataset right now.

In [12]:
del(movie_genres['tags'])

In [13]:
tags_stat = X_train_tfidf.toarray().tolist()

## Rating stats

In [14]:
rating_stats = ratings.groupby('movieId')['rating'].agg(['std', 'mean', 'count']).reset_index()

In [15]:
# add rating stats to dataset
rating_stats.fillna(0, inplace=True)


def assign_rating(row):
    global movie_genres
    
    i = movie_genres[movie_genres['movieId'] == row['movieId']].index
    
    movie_genres.loc[i, 'std'] = row['std']
    movie_genres.loc[i, 'mean'] = row['mean']
    movie_genres.loc[i, 'views'] = row['count']


rating_stats.apply(assign_rating, axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
9719    None
9720    None
9721    None
9722    None
9723    None
Length: 9724, dtype: object

# Add target

For our task take most active user, so we have more data.

In [16]:
ratings['userId'].value_counts()

414    2698
599    2478
474    2108
448    1864
274    1346
       ... 
406      20
595      20
569      20
431      20
442      20
Name: userId, Length: 610, dtype: int64

And the winner is 414.

In [24]:
user = 414

In [17]:
target_movies = ratings[ratings['userId'] == user]['movieId'].array

# filter out only movies that are ranked by this user
data = movie_genres[movie_genres['movieId'].isin(target_movies)]

In [40]:
def add_target(row):
    global data
    
    i = data[data['movieId'] == row['movieId']].index
    
    data.loc[i, 'y'] = row['rating']

In [39]:
ratings[ratings['userId'] == user].apply(add_target, axis=1)

userId             414.0
movieId              1.0
rating               4.0
timestamp    961438127.0
Name: 62294, dtype: float64
userId             414.0
movieId              2.0
rating               3.0
timestamp    961594981.0
Name: 62295, dtype: float64
userId             414.0
movieId              3.0
rating               4.0
timestamp    961439278.0
Name: 62296, dtype: float64
userId             414.0
movieId              5.0
rating               2.0
timestamp    961437647.0
Name: 62297, dtype: float64
userId             414.0
movieId              6.0
rating               3.0
timestamp    961515642.0
Name: 62298, dtype: float64
userId             414.0
movieId              7.0
rating               3.0
timestamp    961439170.0
Name: 62299, dtype: float64
userId             414.0
movieId              8.0
rating               3.0
timestamp    961594849.0
Name: 62300, dtype: float64
userId             414.0
movieId             10.0
rating               3.0
timestamp    961515863.0
Name

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s



userId             414.0
movieId             44.0
rating               2.0
timestamp    961516249.0
Name: 62319, dtype: float64
userId             414.0
movieId             45.0
rating               3.0
timestamp    961438476.0
Name: 62320, dtype: float64
userId             414.0
movieId             46.0
rating               2.0
timestamp    961514407.0
Name: 62321, dtype: float64
userId             414.0
movieId             47.0
rating               4.0
timestamp    961681857.0
Name: 62322, dtype: float64
userId             414.0
movieId             48.0
rating               3.0
timestamp    961437741.0
Name: 62323, dtype: float64
userId             414.0
movieId             50.0
rating               5.0
timestamp    961681714.0
Name: 62324, dtype: float64
userId             414.0
movieId             52.0
rating               3.0
timestamp    961438413.0
Name: 62325, dtype: float64
userId       4.140000e+02
movieId      5.400000e+01
rating       1.000000e+00
timestamp    1.027521e+09

Name: 62411, dtype: float64
userId             414.0
movieId            262.0
rating               4.0
timestamp    961681228.0
Name: 62412, dtype: float64
userId             414.0
movieId            266.0
rating               5.0
timestamp    961512595.0
Name: 62413, dtype: float64
userId             414.0
movieId            267.0
rating               4.0
timestamp    961439405.0
Name: 62414, dtype: float64
userId             414.0
movieId            273.0
rating               2.0
timestamp    961518708.0
Name: 62415, dtype: float64
userId             414.0
movieId            276.0
rating               3.0
timestamp    961596299.0
Name: 62416, dtype: float64
userId             414.0
movieId            277.0
rating               2.0
timestamp    961518753.0
Name: 62417, dtype: float64
userId             414.0
movieId            280.0
rating               4.0
timestamp    961518040.0
Name: 62418, dtype: float64
userId       4.140000e+02
movieId      2.820000e+02
rating       2.500000e+0

Name: 62517, dtype: float64
userId             414.0
movieId            507.0
rating               3.0
timestamp    961515958.0
Name: 62518, dtype: float64
userId       4.140000e+02
movieId      5.080000e+02
rating       3.500000e+00
timestamp    1.258248e+09
Name: 62519, dtype: float64
userId             414.0
movieId            511.0
rating               2.0
timestamp    961516318.0
Name: 62520, dtype: float64
userId             414.0
movieId            514.0
rating               3.0
timestamp    961595566.0
Name: 62521, dtype: float64
userId             414.0
movieId            516.0
rating               3.0
timestamp    961438762.0
Name: 62522, dtype: float64
userId             414.0
movieId            517.0
rating               2.0
timestamp    961516154.0
Name: 62523, dtype: float64
userId             414.0
movieId            519.0
rating               2.0
timestamp    961515424.0
Name: 62524, dtype: float64
userId             414.0
movieId            521.0
rating               3

Name: 62627, dtype: float64
userId             414.0
movieId            900.0
rating               4.0
timestamp    961513901.0
Name: 62628, dtype: float64
userId             414.0
movieId            902.0
rating               4.0
timestamp    961517591.0
Name: 62629, dtype: float64
userId       4.140000e+02
movieId      9.030000e+02
rating       5.000000e+00
timestamp    1.011105e+09
Name: 62630, dtype: float64
userId       4.140000e+02
movieId      9.040000e+02
rating       5.000000e+00
timestamp    1.011105e+09
Name: 62631, dtype: float64
userId             414.0
movieId            908.0
rating               4.0
timestamp    961516776.0
Name: 62632, dtype: float64
userId             414.0
movieId            909.0
rating               4.0
timestamp    961516716.0
Name: 62633, dtype: float64
userId             414.0
movieId            910.0
rating               5.0
timestamp    961595311.0
Name: 62634, dtype: float64
userId             414.0
movieId            912.0
rating            

Name: 62730, dtype: float64
userId             414.0
movieId           1210.0
rating               5.0
timestamp    961436248.0
Name: 62731, dtype: float64
userId             414.0
movieId           1213.0
rating               5.0
timestamp    961516639.0
Name: 62732, dtype: float64
userId             414.0
movieId           1214.0
rating               5.0
timestamp    961514668.0
Name: 62733, dtype: float64
userId             414.0
movieId           1215.0
rating               4.0
timestamp    961438352.0
Name: 62734, dtype: float64
userId       4.140000e+02
movieId      1.216000e+03
rating       3.000000e+00
timestamp    1.028644e+09
Name: 62735, dtype: float64
userId             414.0
movieId           1217.0
rating               5.0
timestamp    961512311.0
Name: 62736, dtype: float64
userId             414.0
movieId           1218.0
rating               4.0
timestamp    961515566.0
Name: 62737, dtype: float64
userId             414.0
movieId           1220.0
rating               5

Name: 62830, dtype: float64
userId             414.0
movieId           1466.0
rating               3.0
timestamp    961517141.0
Name: 62831, dtype: float64
userId             414.0
movieId           1476.0
rating               4.0
timestamp    961436216.0
Name: 62832, dtype: float64
userId             414.0
movieId           1479.0
rating               3.0
timestamp    961514519.0
Name: 62833, dtype: float64
userId             414.0
movieId           1484.0
rating               4.0
timestamp    961437495.0
Name: 62834, dtype: float64
userId             414.0
movieId           1485.0
rating               3.0
timestamp    961438957.0
Name: 62835, dtype: float64
userId             414.0
movieId           1488.0
rating               3.0
timestamp    961512525.0
Name: 62836, dtype: float64
userId             414.0
movieId           1498.0
rating               4.0
timestamp    961513968.0
Name: 62837, dtype: float64
userId             414.0
movieId           1499.0
rating               2.0
t

Name: 62924, dtype: float64
userId             414.0
movieId           1863.0
rating               2.0
timestamp    961437759.0
Name: 62925, dtype: float64
userId             414.0
movieId           1866.0
rating               2.0
timestamp    961515975.0
Name: 62926, dtype: float64
userId             414.0
movieId           1870.0
rating               3.0
timestamp    998400974.0
Name: 62927, dtype: float64
userId             414.0
movieId           1876.0
rating               3.0
timestamp    961515271.0
Name: 62928, dtype: float64
userId             414.0
movieId           1882.0
rating               3.0
timestamp    961515476.0
Name: 62929, dtype: float64
userId       4.140000e+02
movieId      1.883000e+03
rating       3.000000e+00
timestamp    1.073682e+09
Name: 62930, dtype: float64
userId       4.140000e+02
movieId      1.884000e+03
rating       4.000000e+00
timestamp    1.036506e+09
Name: 62931, dtype: float64
userId       4.140000e+02
movieId      1.885000e+03
rating       4.0

Name: 63012, dtype: float64
userId             414.0
movieId           2100.0
rating               3.0
timestamp    961514035.0
Name: 63013, dtype: float64
userId             414.0
movieId           2103.0
rating               2.0
timestamp    961595149.0
Name: 63014, dtype: float64
userId             414.0
movieId           2105.0
rating               3.0
timestamp    961515229.0
Name: 63015, dtype: float64
userId             414.0
movieId           2106.0
rating               3.0
timestamp    961512595.0
Name: 63016, dtype: float64
userId             414.0
movieId           2109.0
rating               3.0
timestamp    961595734.0
Name: 63017, dtype: float64
userId             414.0
movieId           2110.0
rating               4.0
timestamp    961596045.0
Name: 63018, dtype: float64
userId             414.0
movieId           2114.0
rating               5.0
timestamp    961518026.0
Name: 63019, dtype: float64
userId             414.0
movieId           2115.0
rating               4.0
t

Name: 63115, dtype: float64
userId             414.0
movieId           2402.0
rating               3.0
timestamp    961512658.0
Name: 63116, dtype: float64
userId             414.0
movieId           2403.0
rating               4.0
timestamp    961515889.0
Name: 63117, dtype: float64
userId             414.0
movieId           2404.0
rating               2.0
timestamp    961516398.0
Name: 63118, dtype: float64
userId             414.0
movieId           2405.0
rating               3.0
timestamp    961514438.0
Name: 63119, dtype: float64
userId             414.0
movieId           2406.0
rating               4.0
timestamp    961514035.0
Name: 63120, dtype: float64
userId             414.0
movieId           2407.0
rating               4.0
timestamp    961595806.0
Name: 63121, dtype: float64
userId             414.0
movieId           2408.0
rating               2.0
timestamp    991145036.0
Name: 63122, dtype: float64
userId             414.0
movieId           2409.0
rating               4.0
t

Name: 63217, dtype: float64
userId             414.0
movieId           2736.0
rating               4.0
timestamp    961596009.0
Name: 63218, dtype: float64
userId             414.0
movieId           2745.0
rating               4.0
timestamp    991144487.0
Name: 63219, dtype: float64
userId             414.0
movieId           2746.0
rating               4.0
timestamp    961437549.0
Name: 63220, dtype: float64
userId             414.0
movieId           2751.0
rating               4.0
timestamp    961596281.0
Name: 63221, dtype: float64
userId             414.0
movieId           2759.0
rating               3.0
timestamp    961436657.0
Name: 63222, dtype: float64
userId             414.0
movieId           2761.0
rating               4.0
timestamp    961436800.0
Name: 63223, dtype: float64
userId             414.0
movieId           2762.0
rating               3.0
timestamp    961437029.0
Name: 63224, dtype: float64
userId             414.0
movieId           2763.0
rating               2.0
t

userId       4.140000e+02
movieId      3.098000e+03
rating       4.000000e+00
timestamp    1.038842e+09
Name: 63319, dtype: float64
userId             414.0
movieId           3100.0
rating               4.0
timestamp    961518026.0
Name: 63320, dtype: float64
userId             414.0
movieId           3101.0
rating               4.0
timestamp    961681927.0
Name: 63321, dtype: float64
userId             414.0
movieId           3104.0
rating               5.0
timestamp    961515708.0
Name: 63322, dtype: float64
userId             414.0
movieId           3105.0
rating               3.0
timestamp    961517141.0
Name: 63323, dtype: float64
userId             414.0
movieId           3107.0
rating               4.0
timestamp    961515997.0
Name: 63324, dtype: float64
userId             414.0
movieId           3108.0
rating               3.0
timestamp    961438502.0
Name: 63325, dtype: float64
userId       4.140000e+02
movieId      3.111000e+03
rating       4.000000e+00
timestamp    1.010588e

userId             414.0
movieId           3452.0
rating               3.0
timestamp    966259346.0
Name: 63417, dtype: float64
userId             414.0
movieId           3461.0
rating               3.0
timestamp    961437472.0
Name: 63418, dtype: float64
userId             414.0
movieId           3471.0
rating               4.0
timestamp    999617473.0
Name: 63419, dtype: float64
userId             414.0
movieId           3476.0
rating               3.0
timestamp    961681857.0
Name: 63420, dtype: float64
userId             414.0
movieId           3477.0
rating               3.0
timestamp    961517557.0
Name: 63421, dtype: float64
userId             414.0
movieId           3478.0
rating               2.0
timestamp    961518108.0
Name: 63422, dtype: float64
userId             414.0
movieId           3479.0
rating               4.0
timestamp    961514035.0
Name: 63423, dtype: float64
userId             414.0
movieId           3480.0
rating               4.0
timestamp    961518026.0
Name

Name: 63517, dtype: float64
userId             414.0
movieId           3825.0
rating               2.0
timestamp    965742470.0
Name: 63518, dtype: float64
userId             414.0
movieId           3826.0
rating               2.0
timestamp    978963021.0
Name: 63519, dtype: float64
userId             414.0
movieId           3827.0
rating               2.0
timestamp    989248906.0
Name: 63520, dtype: float64
userId             414.0
movieId           3831.0
rating               3.0
timestamp    980259417.0
Name: 63521, dtype: float64
userId             414.0
movieId           3835.0
rating               2.0
timestamp    968165599.0
Name: 63522, dtype: float64
userId             414.0
movieId           3836.0
rating               5.0
timestamp    965141917.0
Name: 63523, dtype: float64
userId             414.0
movieId           3841.0
rating               2.0
timestamp    968165572.0
Name: 63524, dtype: float64
userId             414.0
movieId           3844.0
rating               2.0
t

Name: 63621, dtype: float64
userId       4.140000e+02
movieId      4.144000e+03
rating       4.000000e+00
timestamp    1.050108e+09
Name: 63622, dtype: float64
userId       4.140000e+02
movieId      4.147000e+03
rating       4.000000e+00
timestamp    1.014733e+09
Name: 63623, dtype: float64
userId       4.140000e+02
movieId      4.148000e+03
rating       2.000000e+00
timestamp    1.095466e+09
Name: 63624, dtype: float64
userId             414.0
movieId           4149.0
rating               3.0
timestamp    982170225.0
Name: 63625, dtype: float64
userId       4.140000e+02
movieId      4.159000e+03
rating       2.000000e+00
timestamp    1.046701e+09
Name: 63626, dtype: float64
userId             414.0
movieId           4161.0
rating               3.0
timestamp    997709153.0
Name: 63627, dtype: float64
userId       4.140000e+02
movieId      4.167000e+03
rating       4.500000e+00
timestamp    1.073680e+09
Name: 63628, dtype: float64
userId             414.0
movieId           4168.0
rating

userId       4.140000e+02
movieId      4.565000e+03
rating       3.000000e+00
timestamp    1.008691e+09
Name: 63724, dtype: float64
userId       4.140000e+02
movieId      4.571000e+03
rating       4.000000e+00
timestamp    1.008691e+09
Name: 63725, dtype: float64
userId       4.140000e+02
movieId      4.572000e+03
rating       3.000000e+00
timestamp    1.016115e+09
Name: 63726, dtype: float64
userId       4.140000e+02
movieId      4.573000e+03
rating       2.000000e+00
timestamp    1.073681e+09
Name: 63727, dtype: float64
userId       4.140000e+02
movieId      4.577000e+03
rating       3.000000e+00
timestamp    1.010588e+09
Name: 63728, dtype: float64
userId       4.140000e+02
movieId      4.583000e+03
rating       4.000000e+00
timestamp    1.037024e+09
Name: 63729, dtype: float64
userId       4.140000e+02
movieId      4.585000e+03
rating       3.000000e+00
timestamp    1.026225e+09
Name: 63730, dtype: float64
userId       4.140000e+02
movieId      4.587000e+03
rating       3.000000e+0

Name: 63823, dtype: float64
userId       4.140000e+02
movieId      4.971000e+03
rating       3.000000e+00
timestamp    1.010588e+09
Name: 63824, dtype: float64
userId       4.140000e+02
movieId      4.973000e+03
rating       5.000000e+00
timestamp    1.027519e+09
Name: 63825, dtype: float64
userId       4.140000e+02
movieId      4.974000e+03
rating       3.000000e+00
timestamp    1.020450e+09
Name: 63826, dtype: float64
userId       4.140000e+02
movieId      4.975000e+03
rating       5.000000e+00
timestamp    1.008691e+09
Name: 63827, dtype: float64
userId       4.140000e+02
movieId      4.978000e+03
rating       3.000000e+00
timestamp    1.023466e+09
Name: 63828, dtype: float64
userId       4.140000e+02
movieId      4.979000e+03
rating       5.000000e+00
timestamp    1.014654e+09
Name: 63829, dtype: float64
userId       4.140000e+02
movieId      4.980000e+03
rating       2.000000e+00
timestamp    1.008691e+09
Name: 63830, dtype: float64
userId       4.140000e+02
movieId      4.985000e

Name: 63911, dtype: float64
userId       4.140000e+02
movieId      5.316000e+03
rating       3.500000e+00
timestamp    1.064942e+09
Name: 63912, dtype: float64
userId       4.140000e+02
movieId      5.321000e+03
rating       2.000000e+00
timestamp    1.064203e+09
Name: 63913, dtype: float64
userId       4.140000e+02
movieId      5.322000e+03
rating       4.000000e+00
timestamp    1.072058e+09
Name: 63914, dtype: float64
userId       4.140000e+02
movieId      5.324000e+03
rating       2.000000e+00
timestamp    1.035309e+09
Name: 63915, dtype: float64
userId       4.140000e+02
movieId      5.325000e+03
rating       4.000000e+00
timestamp    1.026741e+09
Name: 63916, dtype: float64
userId       4.140000e+02
movieId      5.328000e+03
rating       5.000000e+00
timestamp    1.047914e+09
Name: 63917, dtype: float64
userId       4.140000e+02
movieId      5.329000e+03
rating       3.000000e+00
timestamp    1.034003e+09
Name: 63918, dtype: float64
userId       4.140000e+02
movieId      5.337000e

Name: 64002, dtype: float64
userId       4.140000e+02
movieId      5.635000e+03
rating       3.000000e+00
timestamp    1.113230e+09
Name: 64003, dtype: float64
userId       4.140000e+02
movieId      5.636000e+03
rating       2.000000e+00
timestamp    1.089044e+09
Name: 64004, dtype: float64
userId       4.140000e+02
movieId      5.644000e+03
rating       2.500000e+00
timestamp    1.168281e+09
Name: 64005, dtype: float64
userId       4.140000e+02
movieId      5.646000e+03
rating       3.000000e+00
timestamp    1.040049e+09
Name: 64006, dtype: float64
userId       4.140000e+02
movieId      5.650000e+03
rating       5.000000e+00
timestamp    1.034002e+09
Name: 64007, dtype: float64
userId       4.140000e+02
movieId      5.663000e+03
rating       3.500000e+00
timestamp    1.064942e+09
Name: 64008, dtype: float64
userId       4.140000e+02
movieId      5.665000e+03
rating       3.000000e+00
timestamp    1.065052e+09
Name: 64009, dtype: float64
userId       4.140000e+02
movieId      5.666000e

Name: 64101, dtype: float64
userId       4.140000e+02
movieId      6.269000e+03
rating       4.000000e+00
timestamp    1.071506e+09
Name: 64102, dtype: float64
userId       4.140000e+02
movieId      6.280000e+03
rating       2.000000e+00
timestamp    1.089044e+09
Name: 64103, dtype: float64
userId       4.140000e+02
movieId      6.281000e+03
rating       2.500000e+00
timestamp    1.092324e+09
Name: 64104, dtype: float64
userId       4.140000e+02
movieId      6.283000e+03
rating       4.000000e+00
timestamp    1.058145e+09
Name: 64105, dtype: float64
userId       4.140000e+02
movieId      6.286000e+03
rating       4.000000e+00
timestamp    1.128977e+09
Name: 64106, dtype: float64
userId       4.140000e+02
movieId      6.287000e+03
rating       2.000000e+00
timestamp    1.079902e+09
Name: 64107, dtype: float64
userId       4.140000e+02
movieId      6.288000e+03
rating       3.500000e+00
timestamp    1.058631e+09
Name: 64108, dtype: float64
userId       4.140000e+02
movieId      6.294000e

Name: 64197, dtype: float64
userId       4.140000e+02
movieId      6.797000e+03
rating       3.000000e+00
timestamp    1.073682e+09
Name: 64198, dtype: float64
userId       4.140000e+02
movieId      6.807000e+03
rating       4.000000e+00
timestamp    1.065800e+09
Name: 64199, dtype: float64
userId       4.140000e+02
movieId      6.809000e+03
rating       3.000000e+00
timestamp    1.065800e+09
Name: 64200, dtype: float64
userId       4.140000e+02
movieId      6.810000e+03
rating       2.500000e+00
timestamp    1.064942e+09
Name: 64201, dtype: float64
userId       4.140000e+02
movieId      6.811000e+03
rating       3.000000e+00
timestamp    1.065800e+09
Name: 64202, dtype: float64
userId       4.140000e+02
movieId      6.812000e+03
rating       2.000000e+00
timestamp    1.065800e+09
Name: 64203, dtype: float64
userId       4.140000e+02
movieId      6.816000e+03
rating       4.000000e+00
timestamp    1.065800e+09
Name: 64204, dtype: float64
userId       4.140000e+02
movieId      6.820000e

userId       4.140000e+02
movieId      7.981000e+03
rating       3.500000e+00
timestamp    1.128977e+09
Name: 64301, dtype: float64
userId       4.140000e+02
movieId      8.119000e+03
rating       4.500000e+00
timestamp    1.093393e+09
Name: 64302, dtype: float64
userId       4.140000e+02
movieId      8.266000e+03
rating       2.500000e+00
timestamp    1.094942e+09
Name: 64303, dtype: float64
userId       4.140000e+02
movieId      8.360000e+03
rating       4.000000e+00
timestamp    1.100800e+09
Name: 64304, dtype: float64
userId       4.140000e+02
movieId      8.361000e+03
rating       3.000000e+00
timestamp    1.216150e+09
Name: 64305, dtype: float64
userId       4.140000e+02
movieId      8.362000e+03
rating       2.500000e+00
timestamp    1.216150e+09
Name: 64306, dtype: float64
userId       4.140000e+02
movieId      8.364000e+03
rating       4.000000e+00
timestamp    1.100799e+09
Name: 64307, dtype: float64
userId       4.140000e+02
movieId      8.366000e+03
rating       3.000000e+0

userId       4.140000e+02
movieId      3.258700e+04
rating       4.000000e+00
timestamp    1.112630e+09
Name: 64401, dtype: float64
userId       4.140000e+02
movieId      3.300400e+04
rating       3.500000e+00
timestamp    1.128977e+09
Name: 64402, dtype: float64
userId       4.140000e+02
movieId      3.316200e+04
rating       3.500000e+00
timestamp    1.192047e+09
Name: 64403, dtype: float64
userId       4.140000e+02
movieId      3.316600e+04
rating       4.000000e+00
timestamp    1.128977e+09
Name: 64404, dtype: float64
userId       4.140000e+02
movieId      3.342100e+04
rating       3.000000e+00
timestamp    1.152024e+09
Name: 64405, dtype: float64
userId       4.140000e+02
movieId      3.343700e+04
rating       3.500000e+00
timestamp    1.137431e+09
Name: 64406, dtype: float64
userId       4.140000e+02
movieId      3.349300e+04
rating       4.000000e+00
timestamp    1.117044e+09
Name: 64407, dtype: float64
userId       4.140000e+02
movieId      3.349500e+04
rating       2.500000e+0

Name: 64493, dtype: float64
userId       4.140000e+02
movieId      4.392100e+04
rating       3.000000e+00
timestamp    1.157472e+09
Name: 64494, dtype: float64
userId       4.140000e+02
movieId      4.392800e+04
rating       2.500000e+00
timestamp    1.216151e+09
Name: 64495, dtype: float64
userId       4.140000e+02
movieId      4.393600e+04
rating       3.500000e+00
timestamp    1.157132e+09
Name: 64496, dtype: float64
userId       4.140000e+02
movieId      4.402200e+04
rating       2.500000e+00
timestamp    1.489548e+09
Name: 64497, dtype: float64
userId       4.140000e+02
movieId      4.419100e+04
rating       4.000000e+00
timestamp    1.144256e+09
Name: 64498, dtype: float64
userId       4.140000e+02
movieId      4.419500e+04
rating       4.500000e+00
timestamp    1.145550e+09
Name: 64499, dtype: float64
userId       4.140000e+02
movieId      4.419900e+04
rating       4.000000e+00
timestamp    1.147702e+09
Name: 64500, dtype: float64
userId       4.140000e+02
movieId      4.420400e

Name: 64592, dtype: float64
userId       4.140000e+02
movieId      5.092300e+04
rating       2.500000e+00
timestamp    1.216150e+09
Name: 64593, dtype: float64
userId       4.140000e+02
movieId      5.107700e+04
rating       2.500000e+00
timestamp    1.216151e+09
Name: 64594, dtype: float64
userId       4.140000e+02
movieId      5.108000e+04
rating       4.000000e+00
timestamp    1.172770e+09
Name: 64595, dtype: float64
userId       4.140000e+02
movieId      5.108400e+04
rating       3.000000e+00
timestamp    1.199722e+09
Name: 64596, dtype: float64
userId       4.140000e+02
movieId      5.108600e+04
rating       3.000000e+00
timestamp    1.216150e+09
Name: 64597, dtype: float64
userId       4.140000e+02
movieId      5.109100e+04
rating       4.000000e+00
timestamp    1.216147e+09
Name: 64598, dtype: float64
userId       4.140000e+02
movieId      5.125500e+04
rating       3.500000e+00
timestamp    1.216147e+09
Name: 64599, dtype: float64
userId       4.140000e+02
movieId      5.141200e

userId       4.140000e+02
movieId      5.815400e+04
rating       4.000000e+00
timestamp    1.273977e+09
Name: 64688, dtype: float64
userId       4.140000e+02
movieId      5.815600e+04
rating       2.000000e+00
timestamp    1.273978e+09
Name: 64689, dtype: float64
userId       4.140000e+02
movieId      5.816200e+04
rating       3.500000e+00
timestamp    1.249835e+09
Name: 64690, dtype: float64
userId       4.140000e+02
movieId      5.829300e+04
rating       2.000000e+00
timestamp    1.273978e+09
Name: 64691, dtype: float64
userId       4.140000e+02
movieId      5.829500e+04
rating       3.500000e+00
timestamp    1.216145e+09
Name: 64692, dtype: float64
userId       4.140000e+02
movieId      5.855900e+04
rating       4.000000e+00
timestamp    1.251142e+09
Name: 64693, dtype: float64
userId       4.140000e+02
movieId      5.880600e+04
rating       3.000000e+00
timestamp    1.273978e+09
Name: 64694, dtype: float64
userId       4.140000e+02
movieId      5.883900e+04
rating       2.500000e+0

userId       4.140000e+02
movieId      7.183800e+04
rating       2.500000e+00
timestamp    1.273978e+09
Name: 64778, dtype: float64
userId       4.140000e+02
movieId      7.201100e+04
rating       4.500000e+00
timestamp    1.273977e+09
Name: 64779, dtype: float64
userId       4.140000e+02
movieId      7.222600e+04
rating       4.000000e+00
timestamp    1.273977e+09
Name: 64780, dtype: float64
userId       4.140000e+02
movieId      7.237800e+04
rating       2.000000e+00
timestamp    1.273978e+09
Name: 64781, dtype: float64
userId       4.140000e+02
movieId      7.260500e+04
rating       4.000000e+00
timestamp    1.275267e+09
Name: 64782, dtype: float64
userId       4.140000e+02
movieId      7.264100e+04
rating       3.000000e+00
timestamp    1.273977e+09
Name: 64783, dtype: float64
userId       4.140000e+02
movieId      7.273300e+04
rating       4.000000e+00
timestamp    1.489548e+09
Name: 64784, dtype: float64
userId       4.140000e+02
movieId      7.299800e+04
rating       4.000000e+0

Name: 64875, dtype: float64
userId       4.140000e+02
movieId      9.896100e+04
rating       4.000000e+00
timestamp    1.485657e+09
Name: 64876, dtype: float64
userId       4.140000e+02
movieId      9.911200e+04
rating       3.500000e+00
timestamp    1.525559e+09
Name: 64877, dtype: float64
userId       4.140000e+02
movieId      9.911400e+04
rating       4.500000e+00
timestamp    1.485657e+09
Name: 64878, dtype: float64
userId       4.140000e+02
movieId      1.024450e+05
rating       3.500000e+00
timestamp    1.489537e+09
Name: 64879, dtype: float64
userId       4.140000e+02
movieId      1.030480e+05
rating       4.000000e+00
timestamp    1.525563e+09
Name: 64880, dtype: float64
userId       4.140000e+02
movieId      1.033410e+05
rating       4.000000e+00
timestamp    1.511541e+09
Name: 64881, dtype: float64
userId       4.140000e+02
movieId      1.042410e+05
rating       3.000000e+00
timestamp    1.525562e+09
Name: 64882, dtype: float64
userId       4.140000e+02
movieId      1.043740e

Name: 64975, dtype: float64
userId       4.140000e+02
movieId      1.755690e+05
rating       4.000000e+00
timestamp    1.515207e+09
Name: 64976, dtype: float64
userId       4.140000e+02
movieId      1.756610e+05
rating       3.000000e+00
timestamp    1.527978e+09
Name: 64977, dtype: float64
userId       4.140000e+02
movieId      1.763710e+05
rating       5.000000e+00
timestamp    1.511536e+09
Name: 64978, dtype: float64
userId       4.140000e+02
movieId      1.764230e+05
rating       4.000000e+00
timestamp    1.525549e+09
Name: 64979, dtype: float64
userId       4.140000e+02
movieId      1.767510e+05
rating       4.000000e+00
timestamp    1.521844e+09
Name: 64980, dtype: float64
userId       4.140000e+02
movieId      1.775930e+05
rating       4.500000e+00
timestamp    1.521844e+09
Name: 64981, dtype: float64
userId       4.140000e+02
movieId      1.776150e+05
rating       4.500000e+00
timestamp    1.521844e+09
Name: 64982, dtype: float64
userId       4.140000e+02
movieId      1.780610e

62294    None
62295    None
62296    None
62297    None
62298    None
         ... 
64987    None
64988    None
64989    None
64990    None
64991    None
Length: 2698, dtype: object

In [20]:
# now we need to concat tf-idf of tags
tags_to_use = []
for i in data.index:
    tags_to_use.append(tags_stat[i])

In [60]:
X = data.copy()
y = data['y']
del(X['y'])
del(X['title'])

In [54]:
# add tags tf-idf
X = np.append(X,np.array(tags_to_use),axis=1)

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = KNeighborsRegressor(n_neighbors=10)
model.fit(X_train, y_train)

p = model.predict(X_test)

print('RMSE: ', math.sqrt(mean_squared_error(y_test, p)))

RMSE:  0.8744204429855876


In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

p = model.predict(X_test)

print('RMSE: ', math.sqrt(mean_squared_error(y_test, p)))

RMSE:  0.6487903476693017
