# Домашнее задание к лекции «Классификация: Логистическая регрессия и SVM»

In [180]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

pd.set_option('display.max_columns', 500)

Загрузим данные и посмотрим на них:

In [181]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [182]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [183]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [184]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [185]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


Удалим timestamp, он нигде не будет использоваться

In [186]:
ratings.drop(['timestamp'], axis=1, inplace=True)
tags.drop(['timestamp'], axis=1, inplace=True)

Для каждого фильма посчитаем моду, медиану и std по переменной rating и добавим эти значения к дата-сету movies

In [187]:
ratings_mean = ratings[['movieId', 'rating']].groupby(['movieId']).mean().reset_index()
ratings_median = ratings[['movieId', 'rating']].groupby(['movieId']).median().reset_index()
ratings_std = ratings[['movieId', 'rating']].groupby(['movieId']).std().reset_index()

In [188]:
movies = movies.join(ratings_mean.set_index('movieId'), on='movieId').join(
    ratings_median.set_index('movieId'), on='movieId', lsuffix='_mean', rsuffix='_median').join(
    ratings_std.set_index('movieId'), on='movieId')

In [189]:
movies.rename(columns={"rating": "rating_std"}, inplace=True)
movies

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,rating_std
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,4.0,0.834859
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,3.5,0.881713
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,3.0,1.054823
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,3.0,0.852168
4,5,Father of the Bride Part II (1995),Comedy,3.071429,3.0,0.907148
...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,4.000000,4.0,
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,3.500000,3.5,
9739,193585,Flint (2017),Drama,3.500000,3.5,
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,3.500000,3.5,


Теперь в  дата-сет movies для каждого фильма добавим строку с тегами:

In [190]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   userId   3683 non-null   int64 
 1   movieId  3683 non-null   int64 
 2   tag      3683 non-null   object
dtypes: int64(2), object(1)
memory usage: 86.4+ KB


In [191]:
# Смотрим частоту тегов

tag_value_counts = tags.tag.value_counts()

print(tag_value_counts)

In Netflix queue       131
atmospheric             36
superhero               24
thought-provoking       24
surreal                 23
                      ... 
menacing                 1
jake gyllenhaal          1
Boxing story             1
movies about movies      1
star wars                1
Name: tag, Length: 1589, dtype: int64


In [192]:
# Находим теги, которые встречаются меньше 5-ти раз

rare_tags = []

for tag, count in tags.tag.value_counts().iteritems():
    if count < 5:
        rare_tags.append(tag)
        
len(rare_tags)

1421

In [193]:
# Заменяем редкие теги на общий тег other

tags.tag = tags.tag.apply(lambda x: 'other' if(x in rare_tags) else x)

In [194]:
tags.head()

Unnamed: 0,userId,movieId,tag
0,2,60756,funny
1,2,60756,other
2,2,60756,other
3,2,89774,other
4,2,89774,other


In [195]:
# Теперь сгруппируем дата-сет с тегами по фильмам и склеим уникальные теги для каждого фильма в одну строку

tags_group = tags[['movieId', 'tag']].groupby(['movieId'])['tag'].apply(lambda x: ' '.join(list(set(x)))).reset_index()
tags_group

Unnamed: 0,movieId,tag
0,1,other fun
1,2,other fantasy
2,3,other
3,5,pregnancy remake
4,7,remake
...,...,...
1567,183611,other funny
1568,184471,adventure other
1569,187593,other sarcasm
1570,187595,other


In [196]:
# Добавим получившиеся теги в дата-сет movies

movies = movies.join(tags_group.set_index('movieId'), on='movieId')

movies

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,rating_std,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,4.0,0.834859,other fun
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,3.5,0.881713,other fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,3.0,1.054823,other
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,3.0,0.852168,
4,5,Father of the Bride Part II (1995),Comedy,3.071429,3.0,0.907148,pregnancy remake
...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,4.000000,4.0,,
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,3.500000,3.5,,
9739,193585,Flint (2017),Drama,3.500000,3.5,,
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,3.500000,3.5,,


In [197]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   movieId        9742 non-null   int64  
 1   title          9742 non-null   object 
 2   genres         9742 non-null   object 
 3   rating_mean    9724 non-null   float64
 4   rating_median  9724 non-null   float64
 5   rating_std     6278 non-null   float64
 6   tag            1572 non-null   object 
dtypes: float64(3), int64(1), object(3)
memory usage: 532.9+ KB


Теперь конвертируем жанры в строку с пробелами:

In [198]:
movies['genres'] = movies.apply(lambda x: ' '.join(x['genres'].split('|')), axis=1)

movies

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,rating_std,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.920930,4.0,0.834859,other fun
1,2,Jumanji (1995),Adventure Children Fantasy,3.431818,3.5,0.881713,other fantasy
2,3,Grumpier Old Men (1995),Comedy Romance,3.259615,3.0,1.054823,other
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.357143,3.0,0.852168,
4,5,Father of the Bride Part II (1995),Comedy,3.071429,3.0,0.907148,pregnancy remake
...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,4.000000,4.0,,
9738,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,3.500000,3.5,,
9739,193585,Flint (2017),Drama,3.500000,3.5,,
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,3.500000,3.5,,


Присоединим все эти данные к дата-сету ratings

In [199]:
ratings_with_info = ratings.join(movies.set_index('movieId'), on='movieId')

ratings_with_info

Unnamed: 0,userId,movieId,rating,title,genres,rating_mean,rating_median,rating_std,tag
0,1,1,4.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.920930,4.0,0.834859,other fun
1,1,3,4.0,Grumpier Old Men (1995),Comedy Romance,3.259615,3.0,1.054823,other
2,1,6,4.0,Heat (1995),Action Crime Thriller,3.946078,4.0,0.817224,
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery Thriller,3.975369,4.0,0.922429,serial killer mystery twist ending
4,1,50,5.0,"Usual Suspects, The (1995)",Crime Mystery Thriller,4.237745,4.5,0.800921,thriller heist twist ending mindfuck other sus...
...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,Split (2017),Drama Horror Thriller,3.333333,4.0,1.570563,
100832,610,168248,5.0,John Wick: Chapter Two (2017),Action Crime Thriller,4.142857,4.0,0.748013,action other organized crime
100833,610,168250,5.0,Get Out (2017),Horror,3.633333,4.0,0.972234,
100834,610,168252,5.0,Logan (2017),Action Sci-Fi,4.280000,4.5,0.646787,other emotional gritty dark


In [200]:
ratings_with_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   userId         100836 non-null  int64  
 1   movieId        100836 non-null  int64  
 2   rating         100836 non-null  float64
 3   title          100836 non-null  object 
 4   genres         100836 non-null  object 
 5   rating_mean    100836 non-null  float64
 6   rating_median  100836 non-null  float64
 7   rating_std     97390 non-null   float64
 8   tag            48287 non-null   object 
dtypes: float64(4), int64(2), object(3)
memory usage: 6.9+ MB


Прежде чем приступить к обучению TfidfVectorizer, нужно избавиться от пропусков

In [201]:
ratings_with_info.tag.fillna(' ', inplace=True)
ratings_with_info.rating_std.fillna(-1, inplace=True)

In [202]:
ratings_with_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   userId         100836 non-null  int64  
 1   movieId        100836 non-null  int64  
 2   rating         100836 non-null  float64
 3   title          100836 non-null  object 
 4   genres         100836 non-null  object 
 5   rating_mean    100836 non-null  float64
 6   rating_median  100836 non-null  float64
 7   rating_std     100836 non-null  float64
 8   tag            100836 non-null  object 
dtypes: float64(4), int64(2), object(3)
memory usage: 6.9+ MB


Теперь обучим TfidfVectorizer по тегам и жанрам

In [203]:
# По жанрам

v = TfidfVectorizer()
X_genres = v.fit_transform(ratings_with_info.genres)

X_genres

<100836x24 sparse matrix of type '<class 'numpy.float64'>'
	with 292687 stored elements in Compressed Sparse Row format>

In [204]:
print(v.get_feature_names())

['action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'fi', 'film', 'genres', 'horror', 'imax', 'listed', 'musical', 'mystery', 'no', 'noir', 'romance', 'sci', 'thriller', 'war', 'western']


In [205]:
df_genres = pd.DataFrame(X_genres.toarray(), columns=v.get_feature_names())

df_genres

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,film,genres,horror,imax,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,0.000000,0.363885,0.549735,0.508407,0.291944,0.000000,0.0,0.000000,0.470819,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.582902,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.812542,0.000000,0.000000,0.0,0.0
2,0.515013,0.000000,0.000000,0.000000,0.000000,0.657871,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.549516,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.836939,0.0,0.0,0.000000,0.000000,0.547296,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.548050,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.700054,0.0,0.0,0.000000,0.000000,0.457783,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.398976,0.000000,0.000000,0.0,0.0,0.770683,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.496856,0.0,0.0
100832,0.515013,0.000000,0.000000,0.000000,0.000000,0.657871,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.549516,0.0,0.0
100833,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
100834,0.488728,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.616905,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.616905,0.000000,0.0,0.0


In [206]:
res = pd.concat([ratings_with_info, df_genres], axis=1)

res.head()

Unnamed: 0,userId,movieId,rating,title,genres,rating_mean,rating_median,rating_std,tag,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,film,genres.1,horror,imax,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,1,1,4.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.92093,4.0,0.834859,other fun,0.0,0.363885,0.549735,0.508407,0.291944,0.0,0.0,0.0,0.470819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,Grumpier Old Men (1995),Comedy Romance,3.259615,3.0,1.054823,other,0.0,0.0,0.0,0.0,0.582902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.812542,0.0,0.0,0.0,0.0
2,1,6,4.0,Heat (1995),Action Crime Thriller,3.946078,4.0,0.817224,,0.515013,0.0,0.0,0.0,0.0,0.657871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.549516,0.0,0.0
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery Thriller,3.975369,4.0,0.922429,serial killer mystery twist ending,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.836939,0.0,0.0,0.0,0.0,0.547296,0.0,0.0
4,1,50,5.0,"Usual Suspects, The (1995)",Crime Mystery Thriller,4.237745,4.5,0.800921,thriller heist twist ending mindfuck other sus...,0.0,0.0,0.0,0.0,0.0,0.54805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.700054,0.0,0.0,0.0,0.0,0.457783,0.0,0.0


In [207]:
# По тегам

X_tags = v.fit_transform(ratings_with_info.tag)
X_tags

<100836x206 sparse matrix of type '<class 'numpy.float64'>'
	with 148460 stored elements in Compressed Sparse Row format>

In [208]:
print(v.get_feature_names())

['250', 'action', 'adam', 'adolescence', 'adultery', 'adventure', 'africa', 'al', 'alcoholism', 'aliens', 'amnesia', 'and', 'animal', 'animation', 'anime', 'apocalyptic', 'appealing', 'artificial', 'arts', 'assassination', 'astaire', 'atmospheric', 'australia', 'bad', 'baseball', 'based', 'beautiful', 'bible', 'biopic', 'bittersweet', 'black', 'book', 'boxing', 'brad', 'brothers', 'business', 'cerebral', 'charles', 'christmas', 'cinematography', 'civil', 'classic', 'clever', 'coen', 'comedy', 'comic', 'commentary', 'corruption', 'court', 'creepy', 'crime', 'cross', 'cult', 'dark', 'death', 'dialogue', 'dicaprio', 'dickens', 'disability', 'disney', 'disturbing', 'divorce', 'drag', 'dreamlike', 'dressing', 'drugs', 'dystopia', 'emotional', 'ending', 'england', 'existentialism', 'family', 'fantasy', 'ferrell', 'fi', 'film', 'friendship', 'fun', 'funny', 'future', 'gambling', 'ghosts', 'good', 'gothic', 'great', 'gritty', 'hallucinatory', 'heartwarming', 'heist', 'hepburn', 'high', 'hit', 

In [209]:
df_tags = pd.DataFrame(X_tags.toarray(), columns=v.get_feature_names())

df_tags

Unnamed: 0,250,action,adam,adolescence,adultery,adventure,africa,al,alcoholism,aliens,amnesia,and,animal,animation,anime,apocalyptic,appealing,artificial,arts,assassination,astaire,atmospheric,australia,bad,baseball,based,beautiful,bible,biopic,bittersweet,black,book,boxing,brad,brothers,business,cerebral,charles,christmas,cinematography,civil,classic,clever,coen,comedy,comic,commentary,corruption,court,creepy,crime,cross,cult,dark,death,dialogue,dicaprio,dickens,disability,disney,disturbing,divorce,drag,dreamlike,dressing,drugs,dystopia,emotional,ending,england,existentialism,family,fantasy,ferrell,fi,film,friendship,fun,funny,future,gambling,ghosts,good,gothic,great,gritty,hallucinatory,heartwarming,heist,hepburn,high,hit,holocaust,humor,humorous,ii,illness,imdb,in,india,inspirational,intelligence,intelligent,ireland,jason,journalism,judaism,kidnapping,killer,king,lawyers,leonardo,mafia,magic,marriage,martial,men,mental,military,mindfuck,movie,murder,music,mystery,netflix,new,nick,nora,on,opera,organized,other,pacino,paranoia,penalty,philosophical,philosophy,pitt,poignant,police,politics,post,predictable,pregnancy,prostitution,provoking,psychological,psychology,quentin,queue,quirky,race,racism,religion,remade,remake,revenge,robots,rogers,romance,samurai,sandler,sarcasm,satire,school,sci,sequel,serial,sexuality,shakespeare,social,soundtrack,space,spoof,sports,stephen,stylish,stylized,superhero,surreal,suspense,swashbuckler,tarantino,television,tense,terrorism,thought,thriller,time,top,touching,tracy,travel,twins,twist,vietnam,violence,visually,war,wedding,white,will,witty,world,york,zombies
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.945383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.325962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.376827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.463906,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.376827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.372639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.435222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.392115,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.154898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382809,0.0,0.0,0.0,0.0,0.0,0.0,0.456699,0.0,0.0,0.0,0.0,0.0,0.0,0.372639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100832,0.0,0.529442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.534759,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.625242,0.206872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100833,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100834,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.501116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.563755,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.622051,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.210037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [210]:
res = pd.concat([res, df_tags], axis=1)

res.head()

Unnamed: 0,userId,movieId,rating,title,genres,rating_mean,rating_median,rating_std,tag,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,film,genres.1,horror,imax,listed,musical,mystery,no,noir,romance,sci,thriller,war,western,250,action.1,adam,adolescence,adultery,adventure.1,africa,al,alcoholism,aliens,amnesia,and,animal,animation.1,anime,apocalyptic,appealing,artificial,arts,assassination,astaire,atmospheric,australia,bad,baseball,based,beautiful,bible,biopic,bittersweet,black,book,boxing,brad,brothers,business,cerebral,charles,christmas,cinematography,civil,classic,clever,coen,comedy.1,comic,commentary,corruption,court,creepy,crime.1,cross,cult,dark,death,dialogue,dicaprio,dickens,disability,disney,disturbing,divorce,drag,dreamlike,dressing,drugs,dystopia,emotional,ending,england,existentialism,family,fantasy.1,ferrell,fi.1,film.1,friendship,fun,funny,future,gambling,ghosts,good,gothic,great,gritty,hallucinatory,heartwarming,heist,hepburn,high,hit,holocaust,humor,humorous,ii,illness,imdb,in,india,inspirational,intelligence,intelligent,ireland,jason,journalism,judaism,kidnapping,killer,king,lawyers,leonardo,mafia,magic,marriage,martial,men,mental,military,mindfuck,movie,murder,music,mystery.1,netflix,new,nick,nora,on,opera,organized,other,pacino,paranoia,penalty,philosophical,philosophy,pitt,poignant,police,politics,post,predictable,pregnancy,prostitution,provoking,psychological,psychology,quentin,queue,quirky,race,racism,religion,remade,remake,revenge,robots,rogers,romance.1,samurai,sandler,sarcasm,satire,school,sci.1,sequel,serial,sexuality,shakespeare,social,soundtrack,space,spoof,sports,stephen,stylish,stylized,superhero,surreal,suspense,swashbuckler,tarantino,television,tense,terrorism,thought,thriller.1,time,top,touching,tracy,travel,twins,twist,vietnam,violence,visually,war.1,wedding,white,will,witty,world,york,zombies
0,1,1,4.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.92093,4.0,0.834859,other fun,0.0,0.363885,0.549735,0.508407,0.291944,0.0,0.0,0.0,0.470819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.945383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.325962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,Grumpier Old Men (1995),Comedy Romance,3.259615,3.0,1.054823,other,0.0,0.0,0.0,0.0,0.582902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.812542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,6,4.0,Heat (1995),Action Crime Thriller,3.946078,4.0,0.817224,,0.515013,0.0,0.0,0.0,0.0,0.657871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.549516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery Thriller,3.975369,4.0,0.922429,serial killer mystery twist ending,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.836939,0.0,0.0,0.0,0.0,0.547296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.376827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.463906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.376827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,50,5.0,"Usual Suspects, The (1995)",Crime Mystery Thriller,4.237745,4.5,0.800921,thriller heist twist ending mindfuck other sus...,0.0,0.0,0.0,0.0,0.0,0.54805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.700054,0.0,0.0,0.0,0.0,0.457783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.435222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.392115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382809,0.0,0.0,0.0,0.0,0.0,0.0,0.456699,0.0,0.0,0.0,0.0,0.0,0.0,0.372639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Теперь приступим к обучению модели линейной регрессии.

Сначала посмотрим, какое кол-во оценок ставили юзеры

In [211]:
users_rate = res[['userId', 'rating']].groupby(['userId']).count()

users_rate.sort_values(by='rating', ascending=False)

Unnamed: 0_level_0,rating
userId,Unnamed: 1_level_1
414,2698
599,2478
474,2108
448,1864
274,1346
...,...
442,20
569,20
320,20
576,20


Попробуем построить модель для userId == 274

In [212]:
df_274 = res[res['userId'] == 274]

df_274.head()

Unnamed: 0,userId,movieId,rating,title,genres,rating_mean,rating_median,rating_std,tag,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,film,genres.1,horror,imax,listed,musical,mystery,no,noir,romance,sci,thriller,war,western,250,action.1,adam,adolescence,adultery,adventure.1,africa,al,alcoholism,aliens,amnesia,and,animal,animation.1,anime,apocalyptic,appealing,artificial,arts,assassination,astaire,atmospheric,australia,bad,baseball,based,beautiful,bible,biopic,bittersweet,black,book,boxing,brad,brothers,business,cerebral,charles,christmas,cinematography,civil,classic,clever,coen,comedy.1,comic,commentary,corruption,court,creepy,crime.1,cross,cult,dark,death,dialogue,dicaprio,dickens,disability,disney,disturbing,divorce,drag,dreamlike,dressing,drugs,dystopia,emotional,ending,england,existentialism,family,fantasy.1,ferrell,fi.1,film.1,friendship,fun,funny,future,gambling,ghosts,good,gothic,great,gritty,hallucinatory,heartwarming,heist,hepburn,high,hit,holocaust,humor,humorous,ii,illness,imdb,in,india,inspirational,intelligence,intelligent,ireland,jason,journalism,judaism,kidnapping,killer,king,lawyers,leonardo,mafia,magic,marriage,martial,men,mental,military,mindfuck,movie,murder,music,mystery.1,netflix,new,nick,nora,on,opera,organized,other,pacino,paranoia,penalty,philosophical,philosophy,pitt,poignant,police,politics,post,predictable,pregnancy,prostitution,provoking,psychological,psychology,quentin,queue,quirky,race,racism,religion,remade,remake,revenge,robots,rogers,romance.1,samurai,sandler,sarcasm,satire,school,sci.1,sequel,serial,sexuality,shakespeare,social,soundtrack,space,spoof,sports,stephen,stylish,stylized,superhero,surreal,suspense,swashbuckler,tarantino,television,tense,terrorism,thought,thriller.1,time,top,touching,tracy,travel,twins,twist,vietnam,violence,visually,war.1,wedding,white,will,witty,world,york,zombies
39229,274,1,4.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.92093,4.0,0.834859,other fun,0.0,0.363885,0.549735,0.508407,0.291944,0.0,0.0,0.0,0.470819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.945383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.325962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39230,274,2,3.5,Jumanji (1995),Adventure Children Fantasy,3.431818,3.5,0.881713,other fantasy,0.0,0.464933,0.0,0.649587,0.0,0.0,0.0,0.0,0.60156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.94857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39231,274,6,4.0,Heat (1995),Action Crime Thriller,3.946078,4.0,0.817224,,0.515013,0.0,0.0,0.0,0.0,0.657871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.549516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39232,274,8,3.0,Tom and Huck (1995),Adventure Children,2.875,3.0,1.125992,,0.0,0.582019,0.0,0.813175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39233,274,10,4.0,GoldenEye (1995),Action Adventure Thriller,3.496212,3.5,0.859381,,0.544987,0.604027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.581498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Удаляем лишние столбцы

In [213]:
df_274 = df_274.drop(['userId', 'movieId', 'title', 'genres', 'tag'], axis=1)

In [214]:
X = df_274.drop(['rating'], axis=1)
y = df_274.rating

Данных получилось довольно много, попробуем уменьшить размерность

In [215]:
svd = TruncatedSVD(n_components=10, random_state=42)

In [216]:
X = svd.fit_transform(X)

X

array([[ 5.68361158, -0.07279231,  0.40799754, ..., -0.41678731,
         0.17705431,  0.02743227],
       [ 4.99443879, -0.05445664,  0.17412932, ..., -0.45620714,
         0.0440866 , -0.02490224],
       [ 5.69839114,  0.39930574, -0.1789477 , ..., -0.35750224,
         0.19143813, -0.03491533],
       ...,
       [ 5.08683313,  0.39878135, -0.25838443, ...,  0.32923058,
         0.11622272,  0.28844028],
       [ 5.04496022,  0.13775047,  0.03684939, ..., -0.14765807,
        -0.37574704, -0.17068046],
       [ 4.9162667 , -0.64434076,  0.40637942, ...,  0.10075552,
        -0.01630697,  0.10031489]])

И строим модель

In [217]:
def get_score(X, y, random_seed=42, model=None):
    
    if model is None:
        model = LinearRegression()
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed )  
    
    model.fit(X_train, y_train)
    pred_values_train = model.predict(X_train)
    pred_values_test = model.predict(X_test)
    
    print('TRAIN')
    print('Error RMSE: {}'.format(np.sqrt(mean_squared_error(y_train, pred_values_train))))    
    print('Score: {}'.format(model.score(X_train, y_train)))   
    print()
    print('TEST')
    print('Error RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, pred_values_test))))    
    print('Score: {}'.format(model.score(X_test, y_test)))  
    
    return np.sqrt(mean_squared_error(y_test, pred_values_test)), model.score(X_test, y_test)

In [218]:
get_score(X, y)

TRAIN
Error RMSE: 0.5861496928677981
Score: 0.3760581813324624

TEST
Error RMSE: 0.6676124512993434
Score: 0.3117961741987495


(0.6676124512993434, 0.3117961741987495)