In [1]:
%load_ext autoreload
%autoreload 2

# These importing lines pull in the Recommender_Lib from folders above this file
from DataAugmentation_Lib import *

# These importing lines pull in standard-library and environment-installed third-party libs
import json
import pandas as pd
import numpy as np

In [2]:
'''
Load the raw MovieLens files into DataFrames. The 1M .dat files have no header row, so each
one is given column headers after reading; the 1M ratings and movies reuse the 25M column
names so they match the .csv 25 mil data sets.
'''
def _read_1M_dat(path, columns, **read_kwargs):
    '''
    Read one headerless 1M .dat file and label its columns.
    Extra keyword arguments are forwarded to pd.read_table.
    '''
    frame = pd.read_table(
        path,
        sep=MOVIE_LENS_1M_DELIM, # sep="::"
        header=None, # the data has no col headers
        engine="python", # python engine accepts a > 1 length sep ("::") without a warning
        **read_kwargs)
    frame.columns = columns
    return frame


ratings_1M_df = _read_1M_dat(PATH_TO_MOVIE_LENS_1M_RATINGS, MOVIE_LENS_25M_RATINGS_COLS)

ratings_25M_df = pd.read_csv(PATH_TO_MOVIE_LENS_25M_RATINGS)

users_df = _read_1M_dat(PATH_TO_MOVIE_LENS_1M_USERS, MOVIE_LENS_1M_USERS_COLS)

# default utf8 encoding fails during read of movies data.
movies_1M_df = _read_1M_dat(
    PATH_TO_MOVIE_LENS_1M_MOVIES,
    MOVIE_LENS_25M_MOVIES_COLS,
    encoding="ISO-8859-1")

movies_25M_df = pd.read_csv(PATH_TO_MOVIE_LENS_25M_MOVIES)

In [3]:
'''
Combine the movies DFs. Need to change the 1M movie IDs so we can tell them apart later by
adding the highest movie ID in the 25M to each. Then concat the movies in the 1M DF whose
titles are not in the 25M DF to the end of the 25M DF.

WARNING: this cell shifts the 1M IDs in place, so running it twice without re-loading the
raw DFs shifts them twice.
'''
max_25M_movie_id = movies_25M_df[MOVIE_LENS_25M_MOVIE_ID_COL].max()


def add_max_movie_id(movie_id):
    '''Shift a 1M movie ID by the highest 25M movie ID so the two ID ranges do not overlap.'''
    return movie_id + max_25M_movie_id


movies_1M_df[MOVIE_LENS_25M_MOVIE_ID_COL] = movies_1M_df[MOVIE_LENS_25M_MOVIE_ID_COL].map(
    add_max_movie_id)
ratings_1M_df[MOVIE_LENS_25M_MOVIE_ID_COL] = ratings_1M_df[MOVIE_LENS_25M_MOVIE_ID_COL].map(
    add_max_movie_id)

# NOTE(review): 1M movies whose title already exists in the 25M DF are dropped from movies_df
# below, but their rows in ratings_1M_df keep the shifted movie ID, which then has no matching
# movie in movies_df. Confirm that ignoring those ratings downstream is intended.
title_is_new = ~movies_1M_df[MOVIE_LENS_25M_MOVIE_TITLE_COL].isin(
    movies_25M_df[MOVIE_LENS_25M_MOVIE_TITLE_COL])
movies_df = pd.concat(
    [movies_25M_df, movies_1M_df[title_is_new]],
    ignore_index=True)

In [4]:
'''
Now need to change the 1M user IDs in all DFs that have userIds so that I can tell them apart
from the 25M user IDs. Then combine them and fill in the blanks with 0s.

The 1M IDs are shifted by the highest 25M user ID (the same approach used for the movie IDs).
Shifting by the number of distinct 25M users only avoids collisions when the 25M IDs are
exactly 1..N; shifting by the maximum is collision-free either way and gives the same result
when the IDs are 1..N.
'''
user_ids_25M = ratings_25M_df[MOVIE_LENS_25M_USER_ID_COL].unique()
total_25M_users = len(user_ids_25M)
# An empty ratings DF has no maximum; fall back to a shift of 0 (what the count would give).
max_25M_user_id = user_ids_25M.max() if total_25M_users else 0


def add_max_user_id(x):
    '''Shift a 1M user ID past the highest 25M user ID.'''
    return x + max_25M_user_id


users_df[MOVIE_LENS_25M_USER_ID_COL] = users_df[MOVIE_LENS_25M_USER_ID_COL].map(
    add_max_user_id)
ratings_1M_df[MOVIE_LENS_25M_USER_ID_COL] = ratings_1M_df[MOVIE_LENS_25M_USER_ID_COL].map(
    add_max_user_id)

# The 25M data only provides user IDs, so those users get a row with just the ID column;
# every other users_df column is blank for them and is filled with 0.
users_df = pd.concat(
    [pd.DataFrame({MOVIE_LENS_25M_USER_ID_COL: user_ids_25M}), users_df],
    ignore_index=True).fillna(0)
ratings_df = pd.concat([ratings_25M_df, ratings_1M_df], ignore_index=True)

In [5]:
'''
Row counts of the combined DFs plus the 1..total_users array that is handed to the
sparse matrix maker later on.
'''
total_users, total_movies = len(users_df), len(movies_df)
reindexer = np.arange(1, total_users + 1)
'''
The SparseMatrixMaker is given a bad-data check: ratings must stay within 0.0 - 5.0.
Per the original author, an exception is thrown if a value falls outside that range,
because something is broken.
'''
def _rating_out_of_range(d):
    '''True when a rating value lies outside the 0.0 - 5.0 range.'''
    return d < 0.0 or d > 5.0


sparse_mat_maker = SparseMatrixMaker(bad_data_detector=(True, _rating_out_of_range))
print("Total users: ", total_users, "\nTotal movies: ", total_movies)

Sparse row count =  0  and sparse col count =  0
Total users:  168581 
Total movies:  62967


In [6]:
'''
Now we have 3 DFs, two of which correspond to user ratings: users_df, ratings_df.
movies_df has all the movie IDs, so every movie ID is visited once and the ratings for it
are recorded as sparse matrix coordinates.
It takes a while to run, longer if you give it quiet=False so that it reports progress
to not appear stalled. Afterwards the sparse coords and data are saved as a numpy file.
'''
def _store_ratings_for(movie_id):
    '''
    Record the ratings of one movie in sparse_mat_maker.
    Per the original notes, the maker builds the sparse matrix coordinates into 3 long
    arrays and the call just returns the movie_id it is given.
    '''
    return sparse_mat_maker.store_ratings_for_mov_id_as_sparse_coords(
        movie_id,
        ratings_df,
        reindexer,
        quiet=True)


movie_ids_series = movies_df[MOVIE_LENS_25M_MOVIE_ID_COL].map(_store_ratings_for)

sparse_mat_maker.save_sparse_mat_coords(PATH_TO_PROCESSED_RATINGS_DATA)
print("Finshed and saved ratings matrix.")

Row coords len:  5466512 
Col coords len:  5466512 
Data len:  5466512
Row coords len:  10040512 
Col coords len:  10040512 
Data len:  10040512
Row coords len:  12943583 
Col coords len:  12943583 
Data len:  12943583
Row coords len:  14960782 
Col coords len:  14960782 
Data len:  14960782
Row coords len:  16261268 
Col coords len:  16261268 
Data len:  16261268
Row coords len:  17209729 
Col coords len:  17209729 
Data len:  17209729
Row coords len:  18077694 
Col coords len:  18077694 
Data len:  18077694
Row coords len:  18840487 
Col coords len:  18840487 
Data len:  18840487
Row coords len:  19186392 
Col coords len:  19186392 
Data len:  19186392
Row coords len:  19576086 
Col coords len:  19576086 
Data len:  19576086
Row coords len:  20262225 
Col coords len:  20262225 
Data len:  20262225
Row coords len:  21033047 
Col coords len:  21033047 
Data len:  21033047
Row coords len:  21609053 
Col coords len:  21609053 
Data len:  21609053
Row coords len:  22008403 
Col coords len

In [8]:
'''
Goal is to have the number 0 be "not specified" or "no rating" and
need to encode F vs M numerically, so using 1 vs -1, instead of 1 vs 0.
Also, there are some zip codes that are like 12345-6789 and I want them to just be able
to be straight numbers, so truncate the -6789 from the zip codes that are like that.
After that, we finally have our users matrix, so save it as a numpy.
'''
# Users taken from the 25M ratings have no gender; their gender cell holds the 0 filler from
# the earlier fillna(0). Mapping "anything that is not F" to -1 would label all of them as M,
# so only the known codes are translated and every other value stays 0 ("not specified").
# The numeric keys keep already-encoded values stable if this cell is run again.
gender_codes = {"F": 1, "M": -1, 1: 1, -1: -1}
users_df[MOVIE_LENS_1M_GENDER_COL] = users_df[MOVIE_LENS_1M_GENDER_COL].map(
    lambda g: gender_codes.get(g, 0))

users_df[MOVIE_LENS_1M_ZIPCODE_COL] = users_df[MOVIE_LENS_1M_ZIPCODE_COL].map(drop_zipcode_tail)

# Every column must be convertible to int32 at this point, otherwise to_numpy raises.
np.savez(PATH_TO_PROCESSED_USERS_DATA, users_matrix=users_df.to_numpy(dtype=np.int32))

In [6]:
'''
Convert the '|'-separated genres of each movie into a one-hot encoding representation
over MOVIE_LENS_GENRES and store it back in the same genres column.
'''
def _one_hot_genres(gs):
    '''One-hot encode a single movie's genres string.'''
    return genres_to_one_hot(gs, '|', MOVIE_LENS_GENRES)


movies_df[MOVIE_LENS_25M_MOVIE_GENRES_COL] = movies_df[MOVIE_LENS_25M_MOVIE_GENRES_COL].map(
    _one_hot_genres)

In [7]:
'''
The movie titles have years tacked on the ends of them, in parentheses.
Take these off, then insert them as their own column of "numbers" for the matrix
of numbers we are building.
'''
year_accumulator = []


def _split_off_year(title):
    '''
    Hand one title to accumulate_year together with the shared year list.
    Presumably the year is appended to year_accumulator and the title is returned
    without it -- confirm against accumulate_year.
    '''
    return accumulate_year(year_accumulator, title)


movies_df[MOVIE_LENS_25M_MOVIE_TITLE_COL] = movies_df[MOVIE_LENS_25M_MOVIE_TITLE_COL].apply(
    _split_off_year)
years_df = pd.DataFrame({YEAR_COL: year_accumulator})
movies_df = pd.concat([movies_df, years_df], axis=1, join="inner")

'''
Found some bad chars in the year column of our movies_df.
Filter these out manually so we have just a column of strings that can be converted to ints without issue.
'''
def _clean_year(year):
    '''Strip the known junk from one year string and keep only the part before any dash.'''
    for junk in ("L'Associe", ')', '('):
        year = year.replace(junk, "")
    # Year ranges appear with either a hyphen or an en dash; keep the first year.
    return year.split('-')[0].split('–')[0]


movies_df[YEAR_COL] = movies_df[YEAR_COL].map(_clean_year)

In [8]:
'''
Word embedding comes next. Build the Glove object (trained on Twitter, per the original
notes) that the tag and title embedding cells use.
'''
# Length of each embedded word vector; the available sizes are 25, 50, 100, 200
embedded_vector_len = 25
glove = Glove(PATH_TO_GLOVE, embedded_vector_len)

In [12]:
'''
Load the tags and clean them so that they are only words separated by spaces.
Of all the tag rows that share a movie ID only one is kept (drop_duplicates default: the first).
The tag with the most words later determines how many vectors are needed for each movie.
'''
tags_df = (
    pd.read_csv(PATH_TO_MOVIE_LENS_25M_TAGS)
    .assign(**{
        MOVIE_LENS_25M_TAG_COL:
            lambda raw_tags: raw_tags[MOVIE_LENS_25M_TAG_COL].map(glove.clean_str)})
    .drop_duplicates(subset=[MOVIE_LENS_25M_MOVIE_ID_COL]))

In [13]:
'''
List the distinct word counts that occur among the tags. Some tags are really long, and an
equal number of vectors has to be made for each movie.
'''
def _token_count(tag):
    '''Number of whitespace-separated tokens in a tag.'''
    return len(tag.split())


glove_vects_per_movie = tags_df[MOVIE_LENS_25M_TAG_COL].map(_token_count).unique()
glove_vects_per_movie

array([ 1,  2,  4,  3,  9,  6,  5, 15,  7, 16, 10,  8])

In [14]:
'''
Make a version of the tags that is indexed by movie ID and has one row per movie in
movies_df; movies without tags get a row that is just the empty string.
'''
movie_id_reindexer = movies_df[MOVIE_LENS_25M_MOVIE_ID_COL].to_numpy()
tags_df_mut = (
    tags_df
    .set_index(MOVIE_LENS_25M_MOVIE_ID_COL)
    .reindex(movie_id_reindexer, fill_value=""))

In [15]:
'''
Embed the tag of each movie. To keep the number of glove embeddings down, only the first
num_of_toks_to_embed tokens of a tag are embedded.
Some of this is just embedding "", which per the original notes returns an embedding of all 0s.
'''
num_of_toks_to_embed = 6


def _embed_tag(tag):
    '''Embed one tag string with the shared glove object.'''
    return glove.embed_str(tag, num_of_toks_to_embed)


embedded_col = tags_df_mut[MOVIE_LENS_25M_TAG_COL].map(_embed_tag)

In [16]:
'''
Index movies_df by movie ID and join the embedded tag column onto it.
'''
movies_df = (
    movies_df
    .set_index(MOVIE_LENS_25M_MOVIE_ID_COL)
    .join(embedded_col))

In [17]:
'''
Clean the movie titles, then embed them the same way as the tags.
'''
cleaned_titles = movies_df[MOVIE_LENS_25M_MOVIE_TITLE_COL].map(glove.clean_str)
movies_df[MOVIE_LENS_25M_MOVIE_TITLE_COL] = cleaned_titles.map(
    lambda title: glove.embed_str(title, num_of_toks_to_embed))

In [18]:
'''
Turn the movie ID index back into a regular column and display the frame:
it is all numbers now.
'''
movies_df = movies_df.reset_index(drop=False)
movies_df

Unnamed: 0,movieId,title,genres,year,tag
0,1,"[[-0.5956, -0.67774, 0.63825, -0.55081, 0.3310...","[0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",1995,"[[-0.16845, 0.39337, 0.24115, -1.2381, 0.06361..."
1,2,"[[0.10984, -0.72454, 1.212, -0.16188, -0.77879...","[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",1995,"[[-0.36034, -0.16173, 0.52871, 0.1684, -1.0275..."
2,3,"[[-0.63078, 0.23414, 1.0839, -0.4605, 0.36985,...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",1995,"[[0.42466, -0.23493, 0.67394, -0.51295, 0.6706..."
3,4,"[[-0.94693, 1.202, 0.68523, -0.13582, -0.76029...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...",1995,"[[0.57621, -0.0097165, -0.8488, -0.4566, 0.643..."
4,5,"[[-1.2006, 0.59454, 0.27821, 0.86424, 0.021296...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1995,"[[-0.86323, -0.13674, -1.2718, 0.67397, 1.0864..."
...,...,...,...,...,...
62962,213101,"[[-0.95787, -1.298, 0.021577, 0.48332, 0.71122...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",1954,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
62963,213106,"[[-0.95826, 0.34048, 0.51397, -0.22547, -0.069...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",1973,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
62964,213107,"[[-0.42776, -0.097852, 0.54833, 0.20915, -0.18...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",1943,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
62965,213110,"[[-1.9711, 0.44933, 0.57468, 0.39935, -0.04219...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",1987,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [None]:
'''
Map over all movies rows, turn each one into a row of the sparse matrix, then save.
'''
def _is_inf(x):
    '''Bad-data check for the movies matrix: flags a value equal to positive infinity.'''
    return x == np.inf


def _row_to_sparse_coords(series):
    '''Hand one movies_df row to the sparse matrix maker.'''
    return sparse_mat_maker.make_movies_df_row_element_into_sparse_coords(series, quiet=True)


sparse_mat_maker.reset((True, _is_inf))
movie_ids_series = movies_df.apply(_row_to_sparse_coords, axis=1)
sparse_mat_maker.save_sparse_mat_coords(PATH_TO_PROCESSED_MOVIES_DATA)

'''
Now there are 3 matrices:
movies.npz, ratings.npz, users.npz
movies.npz and ratings.nps are saved in sparse matrix format while users is not.
'''

In [28]:
'''
The other files that load the matrices need to know some of the values computed here,
so write them to a specs json: the movie IDs, the user IDs and the column count of the
movies matrix.
'''
num_columns_embedded = 2
num_cols_one_encoded = 1
# Columns that stay a single value each (not embedded, not one-hot encoded)
plain_col_cnt = len(movies_df.columns) - num_columns_embedded - num_cols_one_encoded
# Each embedded column contributes num_of_toks_to_embed vectors of embedded_vector_len values
embedded_col_cnt = num_columns_embedded * (embedded_vector_len * num_of_toks_to_embed)
movie_mat_col_cnt = plain_col_cnt + (embedded_col_cnt + MOVIE_LENS_NUM_GENRES)

with open(PATH_TO_PROCESSED_DATA_SPECS, "w") as out_file:
    specs_json = json.dumps(
        {
            SPEC_MOVIE_IDS: movies_df[MOVIE_LENS_25M_MOVIE_ID_COL].to_list(),
            SPEC_USER_IDS: users_df[MOVIE_LENS_25M_USER_ID_COL].to_list(),
            SPEC_MOVIES_MATRIX_COLUMN_COUNT: movie_mat_col_cnt
        },
        indent=4)
    out_file.write(specs_json)