In [1]:
import pandas as pd
import numpy as np
import igraph as ig

import plotly.express as px

from matplotlib import pyplot as plt

from helpers import flatten_list
from build_graphml_file import get_user_movie_graph, get_movie_prop_graph

import nltk
from nltk.corpus import stopwords

In [2]:
random_state = 42
stopwords = stopwords.words('english')

In [3]:
genome_tag_vec = pd.read_csv("cleaned/genome_tag_vec.csv")
imdb_data = pd.read_csv("cleaned/imdb_data.csv")
movies = pd.read_csv("cleaned/movies.csv")
# tags = pd.read_csv("data/tags.csv")
test = pd.read_csv("data/test.csv")
train = pd.read_csv("data/train.csv")

In [4]:
genome_tag_vec.head()

Unnamed: 0,movieId,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
0,1,0.02875,0.02375,0.0625,0.07575,0.14075,0.14675,0.0635,0.20375,0.202,...,0.0405,0.01425,0.0305,0.035,0.14125,0.05775,0.039,0.02975,0.08475,0.022
1,2,0.04125,0.0405,0.06275,0.08275,0.091,0.06125,0.06925,0.096,0.0765,...,0.0525,0.01575,0.0125,0.02,0.12225,0.03275,0.021,0.011,0.10525,0.01975
2,3,0.04675,0.0555,0.02925,0.087,0.0475,0.04775,0.046,0.14275,0.0285,...,0.06275,0.0195,0.02225,0.023,0.122,0.03475,0.017,0.018,0.091,0.01775
3,4,0.03425,0.038,0.0405,0.031,0.065,0.03575,0.029,0.0865,0.032,...,0.05325,0.028,0.01675,0.03875,0.182,0.0705,0.01625,0.01425,0.0885,0.015
4,5,0.043,0.05325,0.038,0.041,0.054,0.06725,0.02775,0.0765,0.0215,...,0.0535,0.0205,0.01425,0.0255,0.19225,0.02675,0.01625,0.013,0.087,0.016


In [5]:
imdb_data.head()

Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,30000000.0,toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,65000000.0,board game|adventurer|fight|game
2,3,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,25000000.0,boat|lake|neighbor|rivalry
3,4,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,16000000.0,black american|husband wife relationship|betra...
4,5,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,30000000.0,fatherhood|doberman|dog|mansion


In [6]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men,Comedy|Romance,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II,Comedy,1995.0


In [7]:
# tags.head()

In [8]:
test.head()

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144
2,1,5767
3,1,6711
4,1,7318


In [9]:
train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [10]:
distrib = train.rating.value_counts(normalize=True)

fig = px.bar(distrib, color=distrib.index)

fig.show()

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler, RobustScaler

from sklearn.metrics.pairwise import cosine_similarity

from scipy import sparse

from sklearn.decomposition import TruncatedSVD

from sklearn.cluster import KMeans

In [12]:
full_combine = movies.merge(imdb_data, on='movieId', how='left')
normal_year_mean = full_combine[full_combine.year != 0].year.mean()
full_combine['year'].replace(0, normal_year_mean, inplace=True)

# movies.year.fillna(int(movies.year.median()), inplace=True)
# movies.genres.fillna("<unknown>", inplace=True)
full_combine.title = full_combine.title.str.strip().str.replace(r"(.*), The$", r"The \1", regex=True)
full_combine.title_cast.fillna("", inplace=True)
full_combine.director.fillna("", inplace=True)
full_combine.runtime.fillna(int(full_combine.runtime.median()), inplace=True)
full_combine.budget.fillna(int(full_combine.budget.median()), inplace=True)
full_combine.plot_keywords.fillna("", inplace=True)

full_combine['cast_size'] = full_combine.title_cast.str.split('|').apply(len)
full_combine['genre_count'] = full_combine.genres.str.split('|').apply(len)

movie_groups = train.groupby('movieId')
full_combine['rating_mean'] = movie_groups.rating.mean()
full_combine['rating_std'] = movie_groups.rating.std()
full_combine['rating_iqr'] =  movie_groups.rating.quantile(0.75) - movie_groups.rating.quantile(0.25)
full_combine['rating_count'] = movie_groups.rating.count()

full_combine.fillna(0, inplace=True)

In [13]:
features = []
full_combine.columns

Index(['movieId', 'title', 'genres', 'year', 'title_cast', 'director',
       'runtime', 'budget', 'plot_keywords', 'cast_size', 'genre_count',
       'rating_mean', 'rating_std', 'rating_iqr', 'rating_count'],
      dtype='object')

In [14]:
# title_vectrz = TfidfVectorizer(min_df=20, ngram_range=(1, 3), stop_words=stopwords, norm=None)
# title_vec = title_vectrz.fit_transform(full_combine.title)
# print("Title Tokens:", len(title_vectrz.get_feature_names()))
# features.extend(title_vectrz.get_feature_names())

In [15]:
genre_vectrz = TfidfVectorizer(token_pattern=r"[A-z\-]+", min_df=2, norm=None)
genre_vec = genre_vectrz.fit_transform(full_combine.genres)
print("Genre Tokens:", len(genre_vectrz.get_feature_names()))
features.extend(genre_vectrz.get_feature_names())

Genre Tokens: 20


In [16]:
# cast_vectrz = TfidfVectorizer(token_pattern=r"[^\|]+", min_df=50, norm=None)
# cast_vec = cast_vectrz.fit_transform(full_combine.title_cast)
# print("Cast Tokens:", len(cast_vectrz.get_feature_names()))
# features.extend(cast_vectrz.get_feature_names())

In [17]:
# director_vectrz = TfidfVectorizer(token_pattern=r".+", min_df=5, stop_words=['see full summary'], norm=None)
# director_vec = director_vectrz.fit_transform(full_combine.director)
# print("Director Tokens:", len(director_vectrz.get_feature_names()))
# features.extend(director_vectrz.get_feature_names())

In [18]:
plot_vectrz = TfidfVectorizer(token_pattern=r"[^\|]+", min_df=20, stop_words=stopwords, norm=None)
plot_vec = plot_vectrz.fit_transform(full_combine.plot_keywords)
print("Plot KW Tokens:", len(plot_vectrz.get_feature_names()))
features.extend(plot_vectrz.get_feature_names())

Plot KW Tokens: 439


In [19]:
# gtag_vec = genome_tag_vec.drop('movieId', axis=1).values
# gtag_vec_sm = sparse.coo_matrix(gtag_vec, shape=(10000,1128))


In [20]:
extra_features = ["year", 'rating_mean', 'rating_std', 'rating_iqr', 'rating_count']
features.extend(extra_features)

extra_features = full_combine[extra_features]

scaler = MinMaxScaler()
transformed = scaler.fit_transform(extra_features)
std_extra_sparse = sparse.csr_matrix(transformed)

tfidf_vecs = sparse.hstack([
    # title_vec,
    genre_vec,
    # cast_vec,
    # director_vec,
    plot_vec,
]).tocsr()

vecs = sparse.hstack([tfidf_vecs, std_extra_sparse])
norm = Normalizer(copy=True)
norm_vecs = norm.transform(vecs)
norm_vecs.shape

(62423, 464)

In [21]:
a = norm_vecs.getrow(17067).toarray().flatten()
b = norm_vecs.getrow(14628).toarray().flatten()

f = pd.DataFrame(index=features)
f['a'] = a
f['b'] = b
f['diff'] = np.abs(f.a - f.b)

f[f.a != f.b].sort_values('diff', ascending=False)

Unnamed: 0,a,b,diff
based on comic book,0.0,0.566415,0.566415
thriller,0.0,0.191897,0.191897
superhero,0.670019,0.537044,0.132976
imax,0.544148,0.436153,0.107994
sci-fi,0.310074,0.248535,0.061539
adventure,0.298624,0.239358,0.059267
action,0.252571,0.202445,0.050127
year,0.076569,0.060483,0.016086


In [22]:
def most_similar_to(movieId):
    movie_idx = movies[movies['movieId'] == movieId].index[0]

    return norm_vecs.dot(norm_vecs.getrow(movie_idx).transpose())

avengers = 89745
avengers2 = 122892
amazing_spiderman = 95510
fault_in_stars = 111921

mst = most_similar_to(avengers).toarray().flatten()
n = 10
top_n = mst.argsort(axis=0)[-n:][::-1]
bot_n = mst.argsort(axis=0)[:n][::-1]

for similar_idx in top_n:
    title = full_combine.iloc[similar_idx].title
    sim = round(mst[similar_idx], 10)
    print(sim, similar_idx, title)

print('-' * 20)

for similar_idx in bot_n:
    title = full_combine.iloc[similar_idx].title
    sim = mst[similar_idx]
    print(sim, title)
# most_similar = most_similar_to(avengers).toarray().argmax()
# movie_data.iloc[most_similar]

1.0 17067 The Avengers
0.8624198267 10002 Batman Begins
0.8389877328 28066 Power/Rangers
0.8370835155 3692 X-Men
0.8046544477 5241 Spider-Man
0.8014666 14628 Iron Man 2
0.7665353997 19678 Iron Man 3
0.764033161 24172 Super Capers
0.7423435796 19885 Pacific Rim
0.7423429644 21607 Godzilla
--------------------
0.002133029211248556 Roundhay Garden Scene
0.001954029887012206 Man Walking Around a Corner
0.001910125214377244 Dying Swan, The (Umirayushchii lebed)
0.0017722046933492286 The Black Devil
0.001353019361113644 Buffalo Running
0.0013467001345045465 Twilight of a Woman's Soul (Sumerki zhenskoi dushi)
0.001311982434090117 Old Man Drinking a Glass of Beer
0.0009106128721082572 Athlete Swinging a Pick
0.0006014172925917367 Sallie Gardner at a Gallop
0.0 Passage de Venus


In [23]:
movie_indexes = full_combine.movieId.copy()
experiment = train.copy().drop('timestamp', axis=1)

mid_to_idx = pd.Series(movie_indexes.index.values, index=movie_indexes).to_dict()

In [24]:
def get_similarity(id1, id2):
    a = vecs[mid_to_idx[id1], :]
    b = vecs[mid_to_idx[id2], :]
    return a.dot(b.transpose()).toarray().flatten()[0]

def batch_similarity(id1, id_list: list):
    a = vecs[mid_to_idx[id1], :]
    idxs = [mid_to_idx[i] for i in id_list]
    bs = vecs[idxs, :]
    return bs.dot(a.transpose()).toarray().flatten()

In [25]:
# _test = train.groupby('userId').sample(frac=0.25)
# _train = train[~train.index.isin(_test.index.values)]
def skip_diag_strided(A):
    m = A.shape[0]
    strided = np.lib.stride_tricks.as_strided
    s0,s1 = A.strides
    return strided(A.ravel()[1:], shape=(m-1,m), strides=(s0+s1,s1)).reshape(m,-1)

def remove_diag(x):
    x_no_diag = np.ndarray.flatten(x)
    x_no_diag = np.delete(x_no_diag, range(0, len(x_no_diag), len(x) + 1), 0)
    x_no_diag = x_no_diag.reshape(len(x), len(x) - 1)
    return x_no_diag

In [26]:
mean_movie_rating_by_movie_id = train.groupby('movieId').rating.mean()

In [27]:
combined_data = pd.concat([train, test])
combined_data.drop("timestamp", axis=1, inplace=True)
combined_data.sort_values(['userId', 'rating'], inplace=True)
combined_data.reset_index(inplace=True, drop=True)
combined_data

Unnamed: 0,userId,movieId,rating
0,1,8685,1.0
1,1,2068,2.5
2,1,7939,2.5
3,1,7937,3.0
4,1,4308,3.0
...,...,...,...
15000052,162541,4079,
15000053,162541,4467,
15000054,162541,4980,
15000055,162541,5689,


In [28]:
from joblib import Parallel, delayed
from multiprocessing import cpu_count

def applyParallel(grouped, func):
    results = Parallel(n_jobs=cpu_count())(delayed(func)(group) for _, group in grouped)
    return pd.concat(results)

In [29]:
def predict_ratings(data):
    history = data[data.rating.notna()]
    to_predict = data[data.rating.isna()]

    if len(history) == 1:
        print(data.userId.unique())
        mean_guesses = mean_movie_rating_by_movie_id.loc[to_predict.movieId.values].values
        print(mean_guesses)
        to_predict['pred'] = mean_guesses
        return pd.concat([history, to_predict])

    if len(to_predict) == 0:
        return data

    idxs = [mid_to_idx[i] for i in data.movieId]

    historic_ratings = history.rating.values

    similarity_matrix = remove_diag(cosine_similarity(norm_vecs[idxs, :]))
    sim_matrix = similarity_matrix[len(history):, :len(history)]
    
    sim_totals = sim_matrix.sum(axis=1)
    ratings = np.tile(history.rating, (len(to_predict.rating),1))
    weighted_sums = np.einsum('ij,ij->i', sim_matrix, ratings)

    to_predict['pred'] = weighted_sums / sim_totals
    
    return pd.concat([history, to_predict])

In [30]:
%%time
parallel_preds = applyParallel(combined_data.groupby('userId'), predict_ratings)

CPU times: user 2min 2s, sys: 17.7 s, total: 2min 20s
Wall time: 3min 24s


In [31]:
%%time
predictions_serial = combined_data.groupby('userId').apply(predict_ratings)

[53640]
[2.75215443 2.97679325 2.76315789 3.7418251  3.74937411]
[55362]
[4.2780728  4.04721613 4.31135496 4.12053043 4.04209353 4.15425312]
[79087]
[3.55076969 2.75755068 3.95141928 3.72488976 4.11325255 4.04466623
 3.41178059 3.56951622 2.63598999 3.05463872 3.77619048]
[105714]
[3.06937584 3.99877072 3.89964607 3.60034752 3.81352541 3.97647059
 3.75946283 3.51939351 3.33562823]
[121071]
[3.07925791 3.50635619 3.40513861 3.67543449 3.18970852 3.20113524
 3.36215745 2.80259572 3.62929233]
[127098]
[3.99411938 3.51356327 3.09928385 2.95521511 3.66397244 4.41762359
 3.32575679 3.70742183 4.14417199]
CPU times: user 5min 30s, sys: 20.1 s, total: 5min 50s
Wall time: 6min 36s


In [None]:
predictions.reset_index(drop=True)

In [None]:
predictions[predictions.userId == 53640]

In [None]:
submission = predictions.copy()
submission.drop(submission[submission.rating.notna()].index, inplace=True)
submission["rating"] = submission["pred"]

In [None]:
submission['Id'] = submission.userId.astype(str) + '_' + submission.movieId.astype(str)

In [None]:
%%time 
submission[['Id', 'rating']].to_csv("submission.csv", index=False, chunksize=100_000)