# MovieLens Data Analysis


In [65]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix, save_npz
import json


In [68]:
# Input and output directories. Both keep their trailing slash because
# file names are appended to them directly.
path = "/kaggle/input/movielens/data/"
opath = "/kaggle/working/"

# Locations of the three source files
movies_file = f"{path}ml-32m/movies.csv"
tags_file = f"{path}tag-genome-2021/raw/tags.json"
scores_file = f"{path}tag-genome-2021/scores/glmer.csv"

movies = pd.read_csv(movies_file)
tags = pd.read_json(tags_file, lines=True)  # file holds one JSON object per line
scores = pd.read_csv(scores_file)

In [13]:
# Peek at the first five movie records
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
# Numeric summary, column overview and dimensions of the movies table.
# info() prints its report and returns None, which is why the displayed
# tuple contains a None between the statistics and the shape.
movies_stats = movies.describe()
movies_info = movies.info()
movies_stats, movies_info, movies.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  87585 non-null  int64 
 1   title    87585 non-null  object
 2   genres   87585 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


(             movieId
 count   87585.000000
 mean   157651.365519
 std     79013.402099
 min         1.000000
 25%    112657.000000
 50%    165741.000000
 75%    213203.000000
 max    292757.000000,
 None,
 (87585, 3))

In [22]:
# First five tags together with the table dimensions
(tags.head(n=5), tags.shape)

(              tag   id
 0         aardman   22
 1  secret service  112
 2      hillarious  167
 3       christian  270
 4           mummy  362,
 (1094, 2))

In [25]:
# First five score rows together with the table dimensions
(scores.head(n=5), scores.shape)

(        tag  item_id     score
 0  airplane        2  0.042391
 1  airplane        3  0.050673
 2  airplane        4  0.033161
 3  airplane        5  0.047030
 4  airplane        6  0.051117,
 (10551655, 3))

## Renaming column names for consistency

In [31]:
# Give all three tables the same snake_case key names
# (movie_id / tag_id) so they can be joined on them later.
movie_columns = {"movieId": "movie_id", "title": "movie"}
tag_columns = {"id": "tag_id"}
score_columns = {"item_id": "movie_id"}

movies = movies.rename(columns=movie_columns)
tags = tags.rename(columns=tag_columns)
scores = scores.rename(columns=score_columns)

In [32]:
# Check the renamed columns on all three tables
tuple(table.head(5) for table in (movies, tags, scores))

(   movie_id                               movie  \
 0         1                    Toy Story (1995)   
 1         2                      Jumanji (1995)   
 2         3             Grumpier Old Men (1995)   
 3         4            Waiting to Exhale (1995)   
 4         5  Father of the Bride Part II (1995)   
 
                                         genres  
 0  Adventure|Animation|Children|Comedy|Fantasy  
 1                   Adventure|Children|Fantasy  
 2                               Comedy|Romance  
 3                         Comedy|Drama|Romance  
 4                                       Comedy  ,
               tag  tag_id
 0         aardman      22
 1  secret service     112
 2      hillarious     167
 3       christian     270
 4           mummy     362,
         tag  movie_id     score
 0  airplane         2  0.042391
 1  airplane         3  0.050673
 2  airplane         4  0.033161
 3  airplane         5  0.047030
 4  airplane         6  0.051117)

## Adding tag_ids to scores

In [33]:
# Inner join - attach each score row's tag_id by looking the tag name up in `tags`.
#
# * Dropping any existing `tag_id` first keeps the cell idempotent: without it,
#   re-running the cell would produce `tag_id_x` / `tag_id_y` columns and break
#   every later cell that reads `scores['tag_id']`.
# * validate="many_to_one" makes pandas raise a MergeError if a tag name occurs
#   more than once in `tags`, which would otherwise silently duplicate score rows.
# * An inner join discards score rows whose tag has no match in `tags`; the
#   count is reported so that loss is visible instead of silent.
n_scores_before = len(scores)
scores = (
    scores
    .drop(columns="tag_id", errors="ignore")
    .merge(tags, how="inner", on="tag", validate="many_to_one")
)
n_scores_dropped = n_scores_before - len(scores)
if n_scores_dropped:
    print(f"warning: {n_scores_dropped} score rows had no matching tag and were dropped")

In [35]:
# Confirm the tag_id column was added and check the row count
(scores.head(n=5), scores.shape)

(        tag  movie_id     score  tag_id
 0  airplane         2  0.042391  108266
 1  airplane         3  0.050673  108266
 2  airplane         4  0.033161  108266
 3  airplane         5  0.047030  108266
 4  airplane         6  0.051117  108266,
 (10551655, 4))

## Building movies lookup index

In [39]:
# Distinct movie ids that appear in the scores table, in ascending order
scored_movie_ids = scores["movie_id"].unique()
covered_movies = np.sort(scored_movie_ids)

covered_movies

array([     1,      2,      3, ..., 107141, 107348, 108932])

In [51]:
# Lookup table: position of each covered movie along the matrix rows.
# row_index simply numbers the entries of covered_movies from 0 upwards.
movie_index = pd.DataFrame({"movie_id": covered_movies})
movie_index.insert(0, "row_index", np.arange(len(covered_movies), dtype=np.int32))

movie_index, movie_index.shape

(      row_index  movie_id
 0             0         1
 1             1         2
 2             2         3
 3             3         4
 4             4         5
 ...         ...       ...
 9729       9729    106920
 9730       9730    107069
 9731       9731    107141
 9732       9732    107348
 9733       9733    108932
 
 [9734 rows x 2 columns],
 (9734, 2))

## Building tags lookup index

In [48]:
# Lookup table: position of each tag along the matrix columns.
# col_index numbers the tags in the order they appear in `tags`.
tag_index = tags[["tag_id", "tag"]].copy()
tag_index.insert(0, "col_index", np.arange(len(tags), dtype=np.int32))

In [49]:
# Inspect the tag lookup table and its dimensions
(tag_index, tag_index.shape)

(      col_index  tag_id             tag
 0             0      22         aardman
 1             1     112  secret service
 2             2     167      hillarious
 3             3     270       christian
 4             4     362           mummy
 ...         ...     ...             ...
 1089       1089  109147   disappointing
 1090       1090  109249         lesbian
 1091       1091  109333        morality
 1092       1092  109336     new orleans
 1093       1093  109660         bad cgi
 
 [1094 rows x 3 columns],
 (1094, 3))

## Sparse Matrix Format

In [56]:
# Lookup dictionaries: external id -> position in the matrix
movie_rows = dict(zip(movie_index.movie_id, movie_index.row_index))
tag_cols = dict(zip(tag_index.tag_id, tag_index.col_index))

# A repeated id would overwrite its earlier dictionary entry, so the position
# assigned to the first occurrence would never receive a value. Fail early.
assert len(movie_rows) == len(movie_index), "duplicate movie_id in movie_index"
assert len(tag_cols) == len(tag_index), "duplicate tag_id in tag_index"

# Mapping ids to matrix coordinates. Series.map leaves NaN for ids that are
# missing from the dictionary; check for that explicitly so the failure names
# the real problem instead of surfacing as a NaN-to-integer cast error.
row_positions = scores["movie_id"].map(movie_rows)
col_positions = scores["tag_id"].map(tag_cols)
unmapped = row_positions.isna() | col_positions.isna()
if unmapped.any():
    raise ValueError(
        f"{int(unmapped.sum())} score rows reference a movie_id or tag_id "
        "that is missing from movie_index / tag_index"
    )

# Coordinate and value arrays for the sparse matrix
row = row_positions.astype("int32").to_numpy()
col = col_positions.astype("int32").to_numpy()
val = scores["score"].astype("float32").to_numpy()

# Release the full-length intermediates; kernel memory persists across cells.
del row_positions, col_positions, unmapped

In [59]:
# Matrix dimensions: one row per entry of movie_index, one column per entry of tag_index
M = len(movie_index)
T = len(tag_index)

# Assemble the (value, (row, col)) triplets into a COO matrix
entries = (val, (row, col))
coo = coo_matrix(entries, shape=(M, T), dtype="float32")

In [62]:
# Convert to CSR. Note that tocsr() sums any duplicate (row, col) entries.
# `coo` is built as float32 and the conversion keeps the dtype, so the former
# `.astype("float32")` only produced a second full copy of the matrix; the
# assertion below guards the dtype without that copy.
# NOTE(review): the recorded run stored 10,551,655 of 9,734 x 1,094 cells
# (about 99%), so a dense float32 array would likely be smaller than CSR;
# kept sparse here to preserve the saved file format.
csr = coo.tocsr()
assert csr.dtype == np.float32, f"expected float32 matrix, got {csr.dtype}"


In [64]:
# Show the summary repr of both sparse matrices
(csr, coo)

(<Compressed Sparse Row sparse matrix of dtype 'float32'
 	with 10551655 stored elements and shape (9734, 1094)>,
 <COOrdinate sparse matrix of dtype 'float32'
 	with 10551655 stored elements and shape (9734, 1094)>)

In [66]:
# Persist the CSR matrix to the output directory in SciPy's .npz format
matrix_file = f"{opath}tag_matrix.npz"
save_npz(matrix_file, csr)


In [69]:
# Write both lookup tables as CSV, leaving out the pandas index column
exports = (
    (movie_index, f"{opath}movie_index.csv"),
    (tag_index, f"{opath}tag_index.csv"),
)
for table, destination in exports:
    table.to_csv(destination, index=False)

In [70]:
# Describe the exported matrix (dimensions, stored entries, fill ratio)
# and write that description as JSON to the output directory.
n_rows, n_cols = csr.shape
stored_entries = csr.nnz

meta = {
    "source": "genome_glmer",
    "matrix_dtype": "float32",
    "n_movies": int(n_rows),
    "n_tags": int(n_cols),
    "nnz": int(stored_entries),
    # fraction of matrix cells that hold a stored value
    "density": float(stored_entries) / float(n_rows * n_cols),
}

meta_file = f"{opath}meta.json"
with open(meta_file, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)