In [1]:
import pandas as pd
import numpy as np
import igraph as ig

import plotly.express as px

from matplotlib import pyplot as plt

from helpers import flatten_list, applyParallel
from build_graphml_file import get_user_movie_graph, get_movie_prop_graph

import nltk
from nltk.corpus import stopwords

In [2]:
random_state = 42
stopwords = stopwords.words('english')

In [3]:
genome_tag_vec = pd.read_csv("cleaned/genome_tag_vec.csv")
imdb_data = pd.read_csv("cleaned/imdb_data.csv")
movies = pd.read_csv("cleaned/movies.csv")
# tags = pd.read_csv("data/tags.csv")
test = pd.read_csv("data/test.csv")
train = pd.read_csv("data/train.csv")

In [4]:
genome_tag_vec.head()

Unnamed: 0,movieId,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
0,1,0.02875,0.02375,0.0625,0.07575,0.14075,0.14675,0.0635,0.20375,0.202,...,0.0405,0.01425,0.0305,0.035,0.14125,0.05775,0.039,0.02975,0.08475,0.022
1,2,0.04125,0.0405,0.06275,0.08275,0.091,0.06125,0.06925,0.096,0.0765,...,0.0525,0.01575,0.0125,0.02,0.12225,0.03275,0.021,0.011,0.10525,0.01975
2,3,0.04675,0.0555,0.02925,0.087,0.0475,0.04775,0.046,0.14275,0.0285,...,0.06275,0.0195,0.02225,0.023,0.122,0.03475,0.017,0.018,0.091,0.01775
3,4,0.03425,0.038,0.0405,0.031,0.065,0.03575,0.029,0.0865,0.032,...,0.05325,0.028,0.01675,0.03875,0.182,0.0705,0.01625,0.01425,0.0885,0.015
4,5,0.043,0.05325,0.038,0.041,0.054,0.06725,0.02775,0.0765,0.0215,...,0.0535,0.0205,0.01425,0.0255,0.19225,0.02675,0.01625,0.013,0.087,0.016


In [5]:
imdb_data.head()

Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,30000000.0,toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,65000000.0,board game|adventurer|fight|game
2,3,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,25000000.0,boat|lake|neighbor|rivalry
3,4,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,16000000.0,black american|husband wife relationship|betra...
4,5,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,30000000.0,fatherhood|doberman|dog|mansion


In [6]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men,Comedy|Romance,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II,Comedy,1995.0


In [7]:
# tags.head()

In [8]:
test.head()

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144
2,1,5767
3,1,6711
4,1,7318


In [9]:
train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [10]:
distrib = train.rating.value_counts(normalize=True)

fig = px.bar(distrib, color=distrib.index)

fig.show()

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler, RobustScaler

from sklearn.metrics.pairwise import cosine_similarity

from scipy import sparse

from sklearn.decomposition import TruncatedSVD

from sklearn.cluster import KMeans

In [12]:
full_combine = (
    movies
    .merge(imdb_data,      on="movieId", how="left")
    .merge(genome_tag_vec, on="movieId", how="left")
    .set_index("movieId", drop=True)
)

normal_year_mean = full_combine[full_combine.year != 0].year.mean()
full_combine['year'].replace(0, normal_year_mean, inplace=True)

# movies.year.fillna(int(movies.year.median()), inplace=True)
# movies.genres.fillna("<unknown>", inplace=True)
full_combine.title = full_combine.title.str.strip().str.replace(r"(.*), The$", r"The \1", regex=True)
full_combine.title_cast.fillna("", inplace=True)
full_combine.director.fillna("", inplace=True)
full_combine.runtime.fillna(int(full_combine.runtime.median()), inplace=True)
full_combine.budget.fillna(int(full_combine.budget.median()), inplace=True)
full_combine.plot_keywords.fillna("", inplace=True)

full_combine['cast_size'] = full_combine.title_cast.str.split('|').apply(len)
full_combine['genre_count'] = full_combine.genres.str.split('|').apply(len)

movie_groups = train.set_index('movieId', drop=True).groupby('movieId')
full_combine['rating_mean'] = movie_groups.rating.mean()
full_combine['rating_std'] = movie_groups.rating.std().fillna(0)

q3 = movie_groups.rating.quantile(0.75)
q1 = movie_groups.rating.quantile(0.25)

full_combine['rating_iqr'] =  q3 - q1
full_combine['rating_count'] = movie_groups.apply(len)

full_combine.fillna(0, inplace=True)

full_combine['movieId'] = full_combine.index.values
full_combine.reset_index(inplace=True, drop=True)
full_combine

Unnamed: 0,title,genres,year,title_cast,director,runtime,budget,plot_keywords,007,007 (series),...,wwii,zombie,zombies,cast_size,genre_count,rating_mean,rating_std,rating_iqr,rating_count,movieId
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,30000000.0,toy|rivalry|cowboy|cgi animation,0.02875,0.02375,...,0.02975,0.08475,0.02200,15,5,3.889971,0.924249,1.0,23062.0,1
1,Jumanji,Adventure|Children|Fantasy,1995.0,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,65000000.0,board game|adventurer|fight|game,0.04125,0.04050,...,0.01100,0.10525,0.01975,15,3,3.263414,0.966744,1.0,9654.0,2
2,Grumpier Old Men,Comedy|Romance,1995.0,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,25000000.0,boat|lake|neighbor|rivalry,0.04675,0.05550,...,0.01800,0.09100,0.01775,15,2,3.132325,1.016368,1.0,4727.0,3
3,Waiting to Exhale,Comedy|Drama|Romance,1995.0,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,16000000.0,black american|husband wife relationship|betra...,0.03425,0.03800,...,0.01425,0.08850,0.01500,15,3,2.878099,1.073617,2.0,968.0,4
4,Father of the Bride Part II,Comedy,1995.0,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,30000000.0,fatherhood|doberman|dog|mansion,0.04300,0.05325,...,0.01300,0.08700,0.01600,15,1,3.059165,0.995982,1.5,4648.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62418,We,Drama,2018.0,,,98.0,13000000.0,,0.00000,0.00000,...,0.00000,0.00000,0.00000,1,1,0.000000,0.000000,0.0,0.0,209157
62419,Window of the Soul,Documentary,2001.0,,,98.0,13000000.0,,0.00000,0.00000,...,0.00000,0.00000,0.00000,1,1,3.000000,0.000000,0.0,1.0,209159
62420,Bad Poems,Comedy|Drama,2018.0,,,98.0,13000000.0,,0.00000,0.00000,...,0.00000,0.00000,0.00000,1,2,0.000000,0.000000,0.0,0.0,209163
62421,A Girl Thing,unknown,2001.0,,,98.0,13000000.0,,0.00000,0.00000,...,0.00000,0.00000,0.00000,1,1,3.000000,0.000000,0.0,1.0,209169


In [46]:
features = []
full_combine.columns

Index(['title', 'genres', 'year', 'title_cast', 'director', 'runtime',
       'budget', 'plot_keywords', '007', '007 (series)',
       ...
       'wwii', 'zombie', 'zombies', 'cast_size', 'genre_count', 'rating_mean',
       'rating_std', 'rating_iqr', 'rating_count', 'movieId'],
      dtype='object', length=1143)

In [47]:
title_vectrz = TfidfVectorizer(min_df=25, ngram_range=(1, 3), stop_words=stopwords, norm=None)
title_vec = title_vectrz.fit_transform(full_combine.title)
print("Title Tokens:", len(title_vectrz.get_feature_names()))
features.extend(title_vectrz.get_feature_names())

Title Tokens: 954


In [48]:
genre_vectrz = TfidfVectorizer(token_pattern=r"[A-z\-]+", min_df=2, norm=None)
genre_vec = genre_vectrz.fit_transform(full_combine.genres)
print("Genre Tokens:", len(genre_vectrz.get_feature_names()))
features.extend(genre_vectrz.get_feature_names())

Genre Tokens: 20


In [49]:
cast_vectrz = TfidfVectorizer(token_pattern=r"[^\|]+", min_df=25, norm=None)
cast_vec = cast_vectrz.fit_transform(full_combine.title_cast)
print("Cast Tokens:", len(cast_vectrz.get_feature_names()))
features.extend(cast_vectrz.get_feature_names())

Cast Tokens: 465


In [50]:
director_vectrz = TfidfVectorizer(token_pattern=r".+", min_df=10, stop_words=['see full summary'], norm=None)
director_vec = director_vectrz.fit_transform(full_combine.director)
print("Director Tokens:", len(director_vectrz.get_feature_names()))
features.extend(director_vectrz.get_feature_names())

Director Tokens: 30


In [51]:
plot_vectrz = TfidfVectorizer(token_pattern=r"[^\|]+", min_df=10, stop_words=stopwords, norm=None)
plot_vec = plot_vectrz.fit_transform(full_combine.plot_keywords)
print("Plot KW Tokens:", len(plot_vectrz.get_feature_names()))
features.extend(plot_vectrz.get_feature_names())

Plot KW Tokens: 1001


In [52]:
genome_features = [col for col in genome_tag_vec.columns if col != 'movieId']
features.extend(genome_features)
sparse_genome = sparse.csr_matrix(full_combine[genome_features])

In [53]:
extra_features = ["year", "runtime", "budget", "rating_mean", "rating_std", "rating_iqr", "rating_count"]
features.extend(extra_features)

extra_features = full_combine[extra_features]

scaler = MinMaxScaler()
transformed = scaler.fit_transform(extra_features)
std_extra_sparse = sparse.csr_matrix(transformed)

tfidf_vecs = sparse.hstack([
    title_vec,
    genre_vec,
    cast_vec,
    director_vec,
    plot_vec,
]).tocsr()

vecs = sparse.hstack([tfidf_vecs, sparse_genome, std_extra_sparse])
norm = Normalizer(copy=True)
norm_vecs = norm.transform(vecs)
norm_vecs.shape

(62423, 3605)

In [54]:
a = norm_vecs.getrow(17067).toarray().flatten()
b = norm_vecs.getrow(14628).toarray().flatten()

f = pd.DataFrame(index=features)
f['a'] = a
f['b'] = b
f['diff'] = np.abs(f.a - f.b)

f[f.a != f.b].sort_values('diff', ascending=False)

Unnamed: 0,a,b,diff
alien invasion,0.308608,0.000000,3.086083e-01
mark ruffalo,0.285379,0.000000,2.853786e-01
stellan skarsgård,0.273822,0.000000,2.738218e-01
based on comic book,0.000000,0.263021,2.630209e-01
mickey rourke,0.000000,0.263021,2.630209e-01
...,...,...,...
stand-up comedy,0.002555,0.002553,2.630175e-06
aardman studios,0.000545,0.000546,1.893940e-06
cheerleading,0.000855,0.000853,1.174136e-06
american civil war,0.000419,0.000419,3.051645e-07


In [55]:
def most_similar_to(movieId):
    movie_idx = movies[movies['movieId'] == movieId].index[0]

    return norm_vecs.dot(norm_vecs.getrow(movie_idx).transpose())

avengers = 89745
avengers2 = 122892
amazing_spiderman = 95510
fault_in_stars = 111921

mst = most_similar_to(avengers).toarray().flatten()
n = 10
top_n = mst.argsort(axis=0)[-n:][::-1]
bot_n = mst.argsort(axis=0)[:n][::-1]

for similar_idx in top_n:
    title = full_combine.iloc[similar_idx].title
    sim = round(mst[similar_idx], 10)
    print(sim, similar_idx, title)

print('-' * 20)

for similar_idx in bot_n:
    title = full_combine.iloc[similar_idx].title
    sim = mst[similar_idx]
    print(sim, title)
# most_similar = most_similar_to(avengers).toarray().argmax()
# movie_data.iloc[most_similar]

1.0 17067 The Avengers
0.6506146377 14628 Iron Man 2
0.4841823834 19678 Iron Man 3
0.4656577916 12324 Iron Man
0.3978722067 16312 Thor
0.3825096741 18006 Battleship
0.3613001218 21348 Captain America: The Winter Soldier
0.3604275148 25070 Guardians of the Galaxy 2
0.3561891629 2549 Superman
0.3552878818 25069 Thor: Ragnarok
--------------------
0.000712199004382333 Grandma and the Bad Boys
0.0007074352950732604 Be Silent, Sorrow, Be Silent
0.0006898165235561085 Bob Kick, the Mischievous Kid
0.0006807916275889754 Indian Day School
0.0006578197303551492 The Magic Book
0.0005892557717065172 Je vous aime
0.0005820514409189286 Misfortune Never Comes Alone
0.0005733452629870708 Optocht van voorstanders van het Esperanto
0.0005106092886706594 Fifth Avenue, New York
0.00046285286154854787 Bang! Bang! You're Dead! (Our Man in Marrakesh) (Bang, Bang, Bang! Marrakesh) (I Spy, You Spy)


In [56]:
movie_indexes = full_combine.movieId.copy()
experiment = train.copy().drop('timestamp', axis=1)

mid_to_idx = pd.Series(movie_indexes.index.values, index=movie_indexes).to_dict()

In [57]:
def remove_diag(x):
    x_no_diag = np.ndarray.flatten(x)
    x_no_diag = np.delete(x_no_diag, range(0, len(x_no_diag), len(x) + 1), 0)
    x_no_diag = x_no_diag.reshape(len(x), len(x) - 1)
    return x_no_diag

In [58]:
mean_movie_rating_by_movie_id = train.groupby('movieId').rating.mean()

In [None]:
combined_data = pd.concat([train, test])
combined_data.drop("timestamp", axis=1, inplace=True)
combined_data.sort_values(['userId', 'rating'], inplace=True)
combined_data.reset_index(inplace=True, drop=True)
combined_data

In [59]:
def predict_ratings(data):
    history = data[data.rating.notna()]
    to_predict = data[data.rating.isna()]

    if len(history) == 1:
        print(data.userId.unique())
        mean_guesses = mean_movie_rating_by_movie_id.loc[to_predict.movieId.values].values
        print(mean_guesses)
        to_predict['pred'] = mean_guesses
        return pd.concat([history, to_predict])

    if len(to_predict) == 0:
        return data

    idxs = [mid_to_idx[i] for i in data.movieId]

    historic_ratings = history.rating.values

    similarity_matrix = remove_diag(cosine_similarity(norm_vecs[idxs, :]))
    sim_matrix = similarity_matrix[len(history):, :len(history)]
    
    sim_totals = sim_matrix.sum(axis=1)
    ratings = np.tile(history.rating, (len(to_predict.rating),1))
    weighted_sums = np.einsum('ij,ij->i', sim_matrix, ratings)

    to_predict['pred'] = weighted_sums / sim_totals
    
    return pd.concat([history, to_predict])

In [None]:
%%time
parallel_preds = applyParallel(combined_data.groupby("userId"), predict_ratings)

In [60]:
experiment = train.copy()
experiment.drop('timestamp', inplace=True, axis=1)

experiment['original'] = experiment['rating']
experiment['rating'] = experiment.rating.sample(frac=0.85)

In [61]:
%%time
experimental_preds = applyParallel(experiment.groupby("userId"), predict_ratings)

CPU times: user 7min 18s, sys: 51 s, total: 8min 9s
Wall time: 17min 17s


In [62]:
p = experimental_preds[experimental_preds.pred.notna()].copy()
p['err'] = np.abs(p.original - p.pred)

In [63]:
mae_by_movie = p.groupby('movieId').err.mean().sort_values(ascending=False)
extreme_movies = pd.DataFrame(mae_by_movie[mae_by_movie > 3.])
extreme_movies['count'] = train.groupby('movieId').apply(len)
extreme_movies['rating_mean'] = full_combine.set_index('movieId', drop=True).rating_mean
extreme_movies

Unnamed: 0_level_0,err,count,rating_mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
154933,4.045369,3,2.333333
26008,3.923890,4,2.500000
199738,3.878695,3,2.833333
193037,3.762070,1,0.500000
162436,3.761181,1,0.500000
...,...,...,...
55844,3.011783,26,3.307692
154890,3.010147,16,3.500000
135703,3.001704,1,0.500000
42312,3.001571,15,3.066667


In [64]:
np.sqrt(np.mean(np.power(p.original - p.pred, 2)))

0.9615530138753207

In [None]:
mae_by_user = p.groupby('userId').err.mean().sort_values(ascending=False)
extreme_users = mae_by_user[mae_by_user > 3.]
extreme_users

In [None]:
def make_submission_file(df, filename="submission.csv"):
    df.reset_index(drop=True)
    df.drop(df[df.rating.notna()].index, inplace=True)
    df["rating"] = df["pred"]
    df["Id"] = df.userId.astype(str) + "_" + df.movieId.astype(str)
    df[["Id", "rating"]].to_csv(filename, index=False, chunksize=100_000)