In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
links = pd.read_csv('../lecture-1/links.csv')
movies = pd.read_csv('../lecture-1/movies.csv')
ratings = pd.read_csv('../lecture-1/ratings.csv')
tags = pd.read_csv('../lecture-1/tags.csv')

In [3]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


# Домашнее задание (begin)

In [8]:
joined_ratings = ratings.join(movies.set_index('movieId'), on='movieId').reset_index()
joined_ratings_tags = pd.merge(joined_ratings, tags,  how='left', on=['movieId','userId'])
#дропнуть индекс, вытащить из timestamp день недели и месяц, дропнуть timestamp (оба поля), заменить NaN на значение non_tag, timestamp незаполненный заменить на 0

In [9]:
joined_ratings_tags.head()

Unnamed: 0,index,userId,movieId,rating,timestamp_x,title,genres,tag,timestamp_y
0,0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,
1,1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,,
2,2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,,
3,3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,,
4,4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,,


Вытащим из timestamp номер дня недели и номер месяца, заполним NaN "нулями" и дропнем этим столбцы

In [10]:
joined_ratings_tags['rating_month']=pd.to_datetime(joined_ratings_tags['timestamp_x'],unit='s').dt.month
joined_ratings_tags['rating_dayweek']=pd.to_datetime(joined_ratings_tags['timestamp_x'],unit='s').dt.dayofweek
joined_ratings_tags['tag_month']=pd.to_datetime(joined_ratings_tags['timestamp_y'],unit='s').dt.month
joined_ratings_tags['tag_dayweek']=pd.to_datetime(joined_ratings_tags['timestamp_y'],unit='s').dt.dayofweek

In [11]:
joined_ratings_tags['rating_month'] = joined_ratings_tags['rating_month'].fillna(0)
joined_ratings_tags['rating_dayweek'] = joined_ratings_tags['rating_dayweek'].fillna(0)
joined_ratings_tags['tag_month'] = joined_ratings_tags['tag_month'].fillna(0)
joined_ratings_tags['tag_dayweek'] = joined_ratings_tags['tag_dayweek'].fillna(0)

In [12]:
joined_ratings_tags['tag'] = joined_ratings_tags['tag'].fillna('notag')

In [13]:
joined_ratings_tags = joined_ratings_tags.drop(['index', 'timestamp_x', 'timestamp_y'], 1) 

In [14]:
#обработаем столбец с жанрами, чтобы его можно было токенизировать
joined_ratings_tags['genres'] = joined_ratings_tags['genres'].apply(change_string)

In [15]:
#прогоним tfidf на тэгах и жанрах и загоним результаты в новые датафреймы, которые потом объединим с основным датасетом
#tfidfv = TfidfVectorizer()
#tag_tfidf = tfidfv.fit_transform(joined_ratings_tags['tag'])
#genre_tfidf = tfidfv.fit_transform(joined_ratings_tags['genres'])

In [56]:
count_vect1 = CountVectorizer()
count_vect2 = CountVectorizer()
tag_counts = count_vect1.fit_transform(joined_ratings_tags['tag'])
genre_counts = count_vect2.fit_transform(joined_ratings_tags['genres'])
tfidf_transformer1 = TfidfTransformer()
tfidf_transformer2 = TfidfTransformer()
tag_tfidf = tfidf_transformer1.fit_transform(tag_counts)
genre_tfidf = tfidf_transformer2.fit_transform(genre_counts)

In [52]:
df_genre = pd.DataFrame(genre_tfidf.toarray(), columns=count_vect2.get_feature_names())

In [60]:
tag_tfidf

<102677x1709 sparse matrix of type '<class 'numpy.float64'>'
	with 104354 stored elements in Compressed Sparse Row format>

## Хорошо бы разобраться с тэгами, как их загнать в датафрейм

In [61]:
df_tag = pd.DataFrame(tag_tfidf.toarray(), columns=count_vect1.get_feature_names())

MemoryError: 

In [62]:
result_df = pd.concat([joined_ratings_tags, df_genre], axis=1)

In [64]:
result_df.drop('genres', axis=1, inplace=True)

In [65]:
result_df.head()

Unnamed: 0,userId,movieId,rating,title,tag,rating_month,rating_dayweek,tag_month,tag_dayweek,action,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,1,1,4.0,Toy Story (1995),notag,7,6,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,Grumpier Old Men (1995),notag,7,6,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.812591,0.0,0.0,0.0,0.0
2,1,6,4.0,Heat (1995),notag,7,6,0.0,0.0,0.516958,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.54915,0.0,0.0
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),notag,7,6,0.0,0.0,0.0,...,0.0,0.0,0.0,0.837071,0.0,0.0,0.0,0.547094,0.0,0.0
4,1,50,5.0,"Usual Suspects, The (1995)",notag,7,6,0.0,0.0,0.0,...,0.0,0.0,0.0,0.700492,0.0,0.0,0.0,0.457828,0.0,0.0


In [69]:
user_stat = result_df.groupby('userId')[['rating']].agg(['mean','count','median','std','var']).reset_index()
user_stat.columns = ['userId', 'rating_mean','rating_count','rating_median','rating_std','rating_var']
user_stat.head()

Unnamed: 0,userId,rating_mean,rating_count,rating_median,rating_std,rating_var
0,1,4.366379,232,5.0,0.800048,0.640077
1,2,4.128571,35,4.0,0.834397,0.696218
2,3,2.435897,39,0.5,2.090642,4.370783
3,4,3.555556,216,4.0,1.314204,1.727132
4,5,3.636364,44,4.0,0.990441,0.980973


In [70]:
movie_stat = result_df.groupby('movieId')[['rating']].agg(['mean','count','median','std','var']).reset_index()
movie_stat.columns = ['movieId', 'movie_mean','movie_count','movie_median','movie_std','movie_var']
movie_stat.head()

Unnamed: 0,movieId,movie_mean,movie_count,movie_median,movie_std,movie_var
0,1,3.92093,215,4.0,0.834859,0.69699
1,2,3.441964,112,3.5,0.876997,0.769124
2,3,3.245283,53,3.0,1.049829,1.102141
3,4,2.357143,7,3.0,0.852168,0.72619
4,5,3.04,50,3.0,0.924938,0.85551


In [71]:
result_df = pd.merge(result_df, user_stat,  how='left', on=['userId'])
result_df = pd.merge(result_df, movie_stat,  how='left', on=['movieId'])
result_df.head()

Unnamed: 0,userId,movieId,rating,title,tag,rating_month,rating_dayweek,tag_month,tag_dayweek,action,...,rating_mean,rating_count,rating_median,rating_std,rating_var,movie_mean,movie_count,movie_median,movie_std,movie_var
0,1,1,4.0,Toy Story (1995),notag,7,6,0.0,0.0,0.0,...,4.366379,232,5.0,0.800048,0.640077,3.92093,215,4.0,0.834859,0.69699
1,1,3,4.0,Grumpier Old Men (1995),notag,7,6,0.0,0.0,0.0,...,4.366379,232,5.0,0.800048,0.640077,3.245283,53,3.0,1.049829,1.102141
2,1,6,4.0,Heat (1995),notag,7,6,0.0,0.0,0.516958,...,4.366379,232,5.0,0.800048,0.640077,3.946078,102,4.0,0.817224,0.667856
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),notag,7,6,0.0,0.0,0.0,...,4.366379,232,5.0,0.800048,0.640077,3.980392,204,4.0,0.922947,0.85183
4,1,50,5.0,"Usual Suspects, The (1995)",notag,7,6,0.0,0.0,0.0,...,4.366379,232,5.0,0.800048,0.640077,4.252404,208,4.5,0.800057,0.640091


In [72]:
result_df.drop('title', axis=1, inplace=True)
result_df.drop('tag', axis=1, inplace=True) #предполагается, что тэги мы преобразовали в столбцы

In [81]:
result_df['movie_std'] = result_df['movie_std'].fillna(0)
result_df['movie_var'] = result_df['movie_var'].fillna(0)

In [87]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [82]:
X = result_df.loc[:, result_df.columns != 'rating']
y = result_df['rating']

In [83]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102677 entries, 0 to 102676
Data columns (total 37 columns):
userId            102677 non-null int64
movieId           102677 non-null int64
rating            102677 non-null float64
rating_month      102677 non-null int64
rating_dayweek    102677 non-null int64
tag_month         102677 non-null float64
tag_dayweek       102677 non-null float64
action            102677 non-null float64
adventure         102677 non-null float64
animation         102677 non-null float64
children          102677 non-null float64
comedy            102677 non-null float64
crime             102677 non-null float64
documentary       102677 non-null float64
drama             102677 non-null float64
fantasy           102677 non-null float64
filmnoir          102677 non-null float64
horror            102677 non-null float64
imax              102677 non-null float64
musical           102677 non-null float64
mystery           102677 non-null float64
nogenreslisted 

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 777)

In [85]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print('R2 score LinearRegression ', r2_score(lr_pred, y_test))
print('mean square error LinearRegression ', mean_squared_error(lr_pred, y_test))
print('Variance score ', lr.score(X_test, y_test))

R2 score LinearRegression  -0.42936874881628184
mean square error LinearRegression  0.6337916214925956
Variance score  0.4084487374183655


In [88]:
rfr =  RandomForestRegressor(n_estimators=30, max_depth=5, min_samples_leaf=20, max_features=0.5, n_jobs=-1)
rfr.fit(X_train, y_train)
rfr_pred = rfr.predict(X_test)
print('R2 score svr_rbf ', r2_score(rfr_pred, y_test))
print('mean square error svr_rbf ', mean_squared_error(rfr_pred, y_test))
print('Variance score ', rfr.score(X_test, y_test))

R2 score svr_rbf  -0.6865207502319737
mean square error svr_rbf  0.6352768373353813
Variance score  0.407062506870008


# Домашнее задание (end)

In [18]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [19]:
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [20]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [21]:
count_vect.vocabulary_

{'adventure': 1,
 'animation': 2,
 'children': 3,
 'comedy': 4,
 'fantasy': 8,
 'romance': 15,
 'drama': 7,
 'action': 0,
 'crime': 5,
 'thriller': 17,
 'horror': 10,
 'mystery': 13,
 'scifi': 16,
 'war': 18,
 'musical': 12,
 'documentary': 6,
 'imax': 11,
 'western': 19,
 'filmnoir': 9,
 'nogenreslisted': 14}

In [22]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [23]:
tfidf_transformer_1 = TfidfVectorizer() #по сути равен CountVectorizer + TfidfTransformer
X_train_tfidf_1 = tfidf_transformer_1.fit_transform(movie_genres)

In [19]:
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf_1)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=-1, n_neighbors=7, p=2, radius=1.0)

In [45]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [21]:
res

(array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608]]),
 array([[6774, 9096, 5636, 6723, 3376, 7496, 9717]], dtype=int64))

In [22]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
6774,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy
5636,27368,Asterix & Obelix: Mission Cleopatra (Astérix &...,Adventure|Comedy|Fantasy
6723,58972,Nim's Island (2008),Adventure|Comedy|Fantasy
3376,4591,Erik the Viking (1989),Adventure|Comedy|Fantasy
7496,82854,Gulliver's Travels (2010),Adventure|Comedy|Fantasy
9717,188833,The Man Who Killed Don Quixote (2018),Adventure|Comedy|Fantasy


In [24]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [25]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [26]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [27]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [28]:
movies_with_tags[movies_with_tags.title == 'Toy Story (1995)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0


In [29]:
movies_with_tags.tag.unique()

array(['pixar', 'fun', 'fantasy', ..., 'star wars', 'gintama', 'remaster'],
      dtype=object)

In [30]:
movies_with_tags.dropna(inplace=True)

In [31]:
movies_with_tags.title.unique().shape

(1572,)

In [32]:
tag_strings = []
movies = []

for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies.append(movie)

HBox(children=(IntProgress(value=0, max=1572), HTML(value='')))




In [33]:
tag_strings[:5]

['artistic Funny humorous inspiring intelligent quirky romance ZooeyDeschanel',
 'lawyers',
 'creepy suspense',
 'Shakespearesortof',
 'dogs remake']

In [34]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [35]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [36]:
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='manhattan',
         metric_params=None, n_jobs=-1, n_neighbors=10, p=2, radius=1.0)

In [63]:
for i in range(len(movies)):
    if 'Magnolia (1999)' == movies[i]:
        print(i)

822


In [40]:
tag_strings[823]

'baseball'

In [37]:
test = change_string('pixar pixar fun')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [38]:
res

(array([[0., 0., 1., 1., 1., 1., 1., 1., 1., 1.]]),
 array([[661, 822, 947, 955, 954, 953, 951, 950, 949, 959]], dtype=int64))

In [39]:
for i in res[1][0]:
    print(movies[i])

In a Lonely Place (1950)
Magnolia (1999)
Neon Genesis Evangelion: Death & Rebirth (Shin seiki Evangelion Gekijô-ban: Shito shinsei) (1997)
Night and Day (1946)
Nicholas Nickleby (2002)
Niagara (1953)
Never Been Kissed (1999)
Network (1976)
Net, The (1995)
Night of the Hunter, The (1955)
