In [1]:
import pandas as pd
import numpy as np
import igraph as ig

import plotly.express as px

from matplotlib import pyplot as plt

from helpers import flatten_list
from build_graphml_file import get_user_movie_graph, get_movie_prop_graph

import nltk
from nltk.corpus import stopwords

ModuleNotFoundError: No module named 'helpers'

In [None]:
random_state = 42
# Resolve the corpus through the package path instead of the imported name
# `stopwords`: this cell rebinds that name to a plain list, so calling
# `stopwords.words(...)` would raise AttributeError the second time the cell runs.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# Tables read from the "cleaned/" directory.
movies = pd.read_csv("cleaned/movies.csv")
imdb_data = pd.read_csv("cleaned/imdb_data.csv")
genome_tag_vec = pd.read_csv("cleaned/genome_tag_vec.csv")
# tags = pd.read_csv("data/tags.csv")

# Rating tables read from the "data/" directory.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [None]:
# Preview the first rows of `genome_tag_vec`.
genome_tag_vec.head()

In [None]:
# Preview the first rows of `imdb_data`.
imdb_data.head()

In [None]:
# Preview the first rows of `movies`.
movies.head()

In [None]:
# tags.head()

In [None]:
# Preview the first rows of `test`.
test.head()

In [None]:
# Preview the first rows of `train`.
train.head()

In [None]:
# Share of training rows per distinct rating value.
distrib = train['rating'].value_counts(normalize=True)

# One bar per rating value, coloured by that value.
fig = px.bar(distrib, color=distrib.index)
fig.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler, RobustScaler

from sklearn.metrics.pairwise import cosine_similarity

from scipy import sparse

from sklearn.decomposition import TruncatedSVD

from sklearn.cluster import KMeans

In [None]:
# Join the IMDB metadata onto the movie table; the left join keeps every row of `movies`.
full_combine = movies.merge(imdb_data, on='movieId', how='left')

# A year of 0 is replaced by the mean of the non-zero years
# (presumably 0 marks a missing release year — TODO confirm).
# Results are assigned back to the column: `inplace=True` on a column selection
# operates on a temporary and does not update the frame under copy-on-write.
normal_year_mean = full_combine[full_combine.year != 0].year.mean()
full_combine['year'] = full_combine['year'].replace(0, normal_year_mean)

# movies.year.fillna(int(movies.year.median()), inplace=True)
# movies.genres.fillna("<unknown>", inplace=True)
# Move a trailing ", The" to the front of the title ("X, The" -> "The X").
full_combine['title'] = full_combine['title'].str.strip().str.replace(r"(.*), The$", r"The \1", regex=True)
full_combine['title_cast'] = full_combine['title_cast'].fillna("")
full_combine['director'] = full_combine['director'].fillna("")
full_combine['runtime'] = full_combine['runtime'].fillna(int(full_combine['runtime'].median()))
full_combine['budget'] = full_combine['budget'].fillna(int(full_combine['budget'].median()))
full_combine['plot_keywords'] = full_combine['plot_keywords'].fillna("")

# Count only non-empty names: "".split('|') is [''], which would otherwise
# report a cast of 1 for movies that have no cast information at all.
full_combine['cast_size'] = full_combine['title_cast'].str.split('|').apply(
    lambda names: sum(1 for name in names if name)
)
full_combine['genre_count'] = full_combine.genres.str.split('|').apply(len)

# Per-movie rating statistics computed from the training ratings.
movie_groups = train.groupby('movieId')
ratings_by_movie = movie_groups.rating
rating_stats = {
    'rating_mean': ratings_by_movie.mean(),
    'rating_std': ratings_by_movie.std(),
    'rating_iqr': ratings_by_movie.quantile(0.75) - ratings_by_movie.quantile(0.25),
    'rating_count': ratings_by_movie.count(),
}
# The statistics are indexed by movieId while `full_combine` carries a plain
# row-number index, so a direct column assignment would align "row i" with
# "movieId i". Mapping through the movieId column attaches each statistic to
# the correct movie.
for stat_name, per_movie in rating_stats.items():
    full_combine[stat_name] = full_combine['movieId'].map(per_movie)

# Remaining gaps (e.g. movies without any training rating) become 0.
full_combine = full_combine.fillna(0)

In [None]:
# Labels for the columns of the stacked feature matrix; later cells append to
# this list with features.extend(...), so re-run this cell before re-running them.
features = []
full_combine.columns

In [None]:
# title_vectrz = TfidfVectorizer(min_df=20, ngram_range=(1, 3), stop_words=stopwords, norm=None)
# title_vec = title_vectrz.fit_transform(full_combine.title)
# print("Title Tokens:", len(title_vectrz.get_feature_names()))
# features.extend(title_vectrz.get_feature_names())

In [None]:
# `A-Za-z` rather than `A-z`: the ASCII range A-z also contains the
# characters [ \ ] ^ _ and backtick, which are not meant to be part of a genre token.
genre_vectrz = TfidfVectorizer(token_pattern=r"[A-Za-z\-]+", min_df=2, norm=None)
genre_vec = genre_vectrz.fit_transform(full_combine.genres)

# `vocabulary_` maps each term to its column index, so sorting the terms by
# that index yields the column labels. This avoids `get_feature_names()`,
# which was removed in scikit-learn 1.2.
genre_terms = sorted(genre_vectrz.vocabulary_, key=genre_vectrz.vocabulary_.get)
print("Genre Tokens:", len(genre_terms))
features.extend(genre_terms)

In [None]:
# cast_vectrz = TfidfVectorizer(token_pattern=r"[^\|]+", min_df=50, norm=None)
# cast_vec = cast_vectrz.fit_transform(full_combine.title_cast)
# print("Cast Tokens:", len(cast_vectrz.get_feature_names()))
# features.extend(cast_vectrz.get_feature_names())

In [None]:
# director_vectrz = TfidfVectorizer(token_pattern=r".+", min_df=5, stop_words=['see full summary'], norm=None)
# director_vec = director_vectrz.fit_transform(full_combine.director)
# print("Director Tokens:", len(director_vectrz.get_feature_names()))
# features.extend(director_vectrz.get_feature_names())

In [None]:
# Every run of characters between '|' separators is one token.
plot_vectrz = TfidfVectorizer(token_pattern=r"[^\|]+", min_df=20, stop_words=stopwords, norm=None)
plot_vec = plot_vectrz.fit_transform(full_combine.plot_keywords)

# `vocabulary_` maps each term to its column index, so sorting the terms by
# that index yields the column labels. This avoids `get_feature_names()`,
# which was removed in scikit-learn 1.2.
plot_terms = sorted(plot_vectrz.vocabulary_, key=plot_vectrz.vocabulary_.get)
print("Plot KW Tokens:", len(plot_terms))
features.extend(plot_terms)

In [None]:
# gtag_vec = genome_tag_vec.drop('movieId', axis=1).values
# gtag_vec_sm = sparse.coo_matrix(gtag_vec, shape=(10000,1128))


In [None]:
# Keep the column names under their own name: the original reused
# `extra_features` for both the name list and the selected frame, so the cell
# could not be executed a second time.
extra_feature_names = ["year", 'rating_mean', 'rating_std', 'rating_iqr', 'rating_count']
features.extend(extra_feature_names)

extra_features = full_combine[extra_feature_names]

# Scale the numeric columns to [0, 1] before stacking them next to the TF-IDF blocks.
scaler = MinMaxScaler()
transformed = scaler.fit_transform(extra_features)
std_extra_sparse = sparse.csr_matrix(transformed)

tfidf_vecs = sparse.hstack([
    # title_vec,
    genre_vec,
    # cast_vec,
    # director_vec,
    plot_vec,
]).tocsr()

# `vecs` is row-indexed by later cells; hstack does not promise a format that
# supports indexing (older scipy returns COO), so convert explicitly as is
# already done for `tfidf_vecs`.
vecs = sparse.hstack([tfidf_vecs, std_extra_sparse]).tocsr()

# L2-normalise every row so that a dot product of two rows is their cosine similarity.
norm = Normalizer(copy=True)
norm_vecs = norm.transform(vecs)
norm_vecs.shape

In [None]:
# Dense copies of two rows of the normalised feature matrix.
a = norm_vecs.getrow(17067).toarray().flatten()
b = norm_vecs.getrow(14628).toarray().flatten()

# Side-by-side table of both rows, labelled with the feature names.
f = pd.DataFrame({'a': a, 'b': b}, index=features)
f['diff'] = (f.a - f.b).abs()

# Show only the features on which the two rows differ, largest gap first.
f[f.a != f.b].sort_values('diff', ascending=False)

In [None]:
def most_similar_to(movieId):
    """Return the similarity of every movie to the movie with id ``movieId``.

    The rows of ``norm_vecs`` are built from ``full_combine``, so the row is
    looked up by position in ``full_combine`` rather than by index label in
    ``movies``.

    Parameters
    ----------
    movieId : the id to look up in ``full_combine['movieId']``.

    Returns
    -------
    Sparse column matrix with one row per row of ``norm_vecs``; entry ``i`` is
    the dot product of row ``i`` with the row of the requested movie.

    Raises
    ------
    IndexError
        If ``movieId`` does not occur in ``full_combine``.
    """
    positions = np.flatnonzero(full_combine['movieId'].values == movieId)
    if positions.size == 0:
        raise IndexError(f"movieId {movieId!r} not found in full_combine")

    # First match, mirroring the original `.index[0]`.
    movie_idx = int(positions[0])
    return norm_vecs.dot(norm_vecs.getrow(movie_idx).transpose())

# movieIds used for spot checks.
avengers = 89745
avengers2 = 122892
amazing_spiderman = 95510
fault_in_stars = 111921

mst = most_similar_to(avengers).toarray().flatten()
n = 10

# Sort once (ascending) and take both ends; each end is listed in descending order.
ranking = mst.argsort(axis=0)
top_n = ranking[-n:][::-1]
bot_n = ranking[:n][::-1]

for similar_idx in top_n:
    title = full_combine['title'].iloc[similar_idx]
    sim = round(mst[similar_idx], 10)
    print(sim, similar_idx, title)

print('-' * 20)

for similar_idx in bot_n:
    title = full_combine['title'].iloc[similar_idx]
    sim = mst[similar_idx]
    print(sim, title)
# most_similar = most_similar_to(avengers).toarray().argmax()
# movie_data.iloc[most_similar]

In [None]:
# movieId column of the combined table, keeping the table's index labels.
movie_indexes = full_combine['movieId'].copy()
# Training ratings without the timestamp column.
experiment = train.copy().drop(columns='timestamp')

# movieId -> index label in `full_combine`; later cells use the value as a row
# number into the feature matrices.
mid_to_idx = pd.Series(data=movie_indexes.index.values, index=movie_indexes).to_dict()

In [None]:
def get_similarity(id1, id2):
    """Return the cosine similarity between the movies ``id1`` and ``id2``.

    Rows are taken from ``norm_vecs`` (L2-normalised), so the dot product is a
    cosine similarity, consistent with the other similarity code in this
    notebook. The raw ``vecs`` rows are not normalised and would give
    unbounded dot products.

    Raises
    ------
    KeyError
        If either id is missing from ``mid_to_idx``.
    """
    a = norm_vecs[mid_to_idx[id1], :]
    b = norm_vecs[mid_to_idx[id2], :]
    return a.dot(b.transpose()).toarray().flatten()[0]

def batch_similarity(id1, id_list: list):
    """Return the cosine similarity between ``id1`` and every movie in ``id_list``.

    Returns
    -------
    1-D numpy array with one value per entry of ``id_list``, in the same
    order; an empty array when ``id_list`` is empty.

    Raises
    ------
    KeyError
        If ``id1`` or any entry of ``id_list`` is missing from ``mid_to_idx``.
    """
    a = norm_vecs[mid_to_idx[id1], :]
    idxs = [mid_to_idx[i] for i in id_list]
    if not idxs:
        # Nothing to compare against; avoid fancy-indexing with an empty list.
        return np.zeros(0)
    bs = norm_vecs[idxs, :]
    return bs.dot(a.transpose()).toarray().flatten()

In [None]:
# _test = train.groupby('userId').sample(frac=0.25)
# _train = train[~train.index.isin(_test.index.values)]
def skip_diag_strided(A):
    """Return a square 2-D array with its main diagonal removed.

    Row ``i`` of the ``(m, m-1)`` result holds row ``i`` of ``A`` without the
    element ``A[i, i]``.

    Raises
    ------
    ValueError
        If ``A`` is not a square 2-D array.
    """
    # The stride arithmetic below walks the flattened buffer, so the strides
    # must describe that same buffer. For a non-C-contiguous input (e.g. a
    # transpose) `ravel()` returns a re-laid-out copy and the input's strides
    # would address the wrong, possibly out-of-bounds, memory.
    A = np.ascontiguousarray(A)
    if A.ndim != 2 or A.shape[0] != A.shape[1]:
        raise ValueError(f"expected a square 2-D array, got shape {A.shape}")

    m = A.shape[0]
    if m <= 1:
        # No off-diagonal elements exist.
        return np.empty((m, 0), dtype=A.dtype)

    strided = np.lib.stride_tricks.as_strided
    s0, s1 = A.strides
    # Starting one element in, every window of m+1 elements begins right after
    # a diagonal entry; keeping m of them per window drops the diagonal.
    return strided(A.ravel()[1:], shape=(m - 1, m), strides=(s0 + s1, s1)).reshape(m, -1)

def remove_diag(x):
    """Drop the main diagonal of the square array ``x``.

    Returns an array of shape ``(len(x), len(x) - 1)`` in which row ``i``
    holds row ``i`` of ``x`` without ``x[i, i]``.
    """
    side = len(x)
    flat = np.ndarray.flatten(x)
    # In the flattened array the diagonal entries sit side + 1 positions apart.
    diagonal_positions = range(0, len(flat), side + 1)
    return np.delete(flat, diagonal_positions, 0).reshape(side, side - 1)

In [None]:
# Mean training rating per movie, indexed by movieId.
mean_movie_rating_by_movie_id = train.groupby('movieId')['rating'].mean()

In [None]:
# Stack train and test, drop the timestamp, order by user then rating, and renumber the rows.
combined_data = (
    pd.concat([train, test])
    .drop("timestamp", axis=1)
    .sort_values(['userId', 'rating'])
    .reset_index(drop=True)
)
combined_data

In [None]:
%%time

def predict_ratings(data):
    """Predict the missing ratings of one user.

    Rows of ``data`` with a rating form the user's history; rows whose rating
    is NaN are the ones to predict. Each prediction is the similarity-weighted
    mean of the user's known ratings, using cosine similarity between the
    ``norm_vecs`` rows of the movies involved.

    Fallback to the movie's mean training rating (or, for a movie without
    training ratings, the mean of all movie means) is used when
      * the user has at most one known rating, or
      * the target movie has zero total similarity to the user's history.

    Parameters
    ----------
    data : DataFrame with at least the columns ``movieId`` and ``rating``.

    Returns
    -------
    ``data`` itself when nothing needs predicting; otherwise the history rows
    followed by the rows to predict, the latter with a ``pred`` column.
    """
    has_rating = data.rating.notna()
    history = data[has_rating]
    # Copy so that adding the `pred` column never writes into a slice of `data`.
    to_predict = data[~has_rating].copy()

    if len(to_predict) == 0:
        return data

    # reindex (instead of .loc) tolerates movies that never occur in train.
    baseline = (
        mean_movie_rating_by_movie_id
        .reindex(to_predict.movieId.values)
        .fillna(mean_movie_rating_by_movie_id.mean())
        .values
    )

    # With no history the weighted mean is 0/0; with a single rating it would
    # just echo that rating for every movie.
    if len(history) <= 1:
        to_predict['pred'] = baseline
        return pd.concat([history, to_predict])

    history_rows = [mid_to_idx[i] for i in history.movieId]
    target_rows = [mid_to_idx[i] for i in to_predict.movieId]

    # Only the (to_predict x history) block of the similarity matrix is needed.
    sim_matrix = cosine_similarity(norm_vecs[target_rows, :], norm_vecs[history_rows, :])

    sim_totals = sim_matrix.sum(axis=1)
    weighted_sums = sim_matrix @ history.rating.values

    # Rows with a zero similarity total are replaced by the baseline just below.
    with np.errstate(divide='ignore', invalid='ignore'):
        weighted_means = weighted_sums / sim_totals
    to_predict['pred'] = np.where(sim_totals != 0, weighted_means, baseline)

    return pd.concat([history, to_predict])

# Run the per-user prediction on every user's rows and stitch the results together.
predictions = (
    combined_data
    .groupby('userId')
    .apply(predict_ratings)
)

In [None]:
# Display only: the result is not assigned, so `predictions` keeps its current index.
predictions.reset_index(drop=True)

In [None]:
# Spot check: show all rows of a single user (userId 53640).
predictions[predictions.userId == 53640]

In [None]:
# Index labels of the rows that already carry a rating; these are not submitted.
rated_labels = predictions.index[predictions.rating.notna().to_numpy()]
submission = predictions.drop(index=rated_labels)

# The predicted value becomes the submitted rating.
submission["rating"] = submission["pred"]

In [None]:
# Submission key in the form "<userId>_<movieId>".
user_part = submission['userId'].astype(str)
movie_part = submission['movieId'].astype(str)
submission['Id'] = user_part + '_' + movie_part

In [None]:
%%time 
# Write only the two submitted columns, without the index, in chunks of 100,000 rows.
submission[['Id', 'rating']].to_csv(
    "submission.csv",
    index=False,
    chunksize=100_000,
)