In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler

In [2]:
rating_data = pd.read_csv("../Data/ratings.csv")
movies_data = pd.read_csv("../Data/movies.csv")
tags_data = pd.read_csv("../Data/tags.csv")

In [3]:
movies_data.insert(2, "year",  movies_data["title"].str.extract(r"\((\d{4})\)"))
movies_data["title"] = movies_data["title"].str.replace(r"(\s*\(\d{4})\)", "", regex=True)

In [4]:
tags_data["tag"] = tags_data["tag"].astype(str) 
tags = tags_data.groupby(["movieId"], as_index=False).aggregate({"tag" : " ".join})
tags["tag"] = tags["tag"].str.replace(" ", "|")
movies_data = movies_data.merge(tags, on="movieId", how="left")
movies_data["tag"] = movies_data["tag"].fillna("")
movies_data["tf_idf_input"] = movies_data["genres"] + " " + movies_data["tag"]

movies_data = movies_data.drop("tag", axis=1)

movies_data

Unnamed: 0,movieId,title,year,genres,tf_idf_input
0,1,Toy Story,1995,Adventure|Animation|Children|Comedy|Fantasy,Adventure|Animation|Children|Comedy|Fantasy an...
1,2,Jumanji,1995,Adventure|Children|Fantasy,Adventure|Children|Fantasy animals|based|on|a|...
2,3,Grumpier Old Men,1995,Comedy|Romance,Comedy|Romance sequel|moldy|old|old|age|old|me...
3,4,Waiting to Exhale,1995,Comedy|Drama|Romance,Comedy|Drama|Romance characters|chick|flick|gi...
4,5,Father of the Bride Part II,1995,Comedy,Comedy family|pregnancy|wedding|4th|wall|aging...
...,...,...,...,...,...
86532,288967,State of Siege: Temple Attack,2021,Action|Drama,Action|Drama
86533,288971,Ouija Japan,2021,Action|Horror,Action|Horror
86534,288975,The Men Who Made the Movies: Howard Hawks,1973,Documentary,Documentary
86535,288977,Skinford: Death Sentence,2023,Crime|Thriller,Crime|Thriller


In [5]:
genres = ["Action", "Adventure", "Animation", "Children", "Comedy", 
          "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", 
          "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", 
          "War", "Western", "(no genres listed)"]

df = movies_data

vectorizer = CountVectorizer()
vectorizer.fit(genres)
df["BoW_input"] = df["genres"].apply( lambda row: vectorizer.transform([row]).toarray())

df

Unnamed: 0,movieId,title,year,genres,tf_idf_input,BoW_input
0,1,Toy Story,1995,Adventure|Animation|Children|Comedy|Fantasy,Adventure|Animation|Children|Comedy|Fantasy an...,"[[0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,..."
1,2,Jumanji,1995,Adventure|Children|Fantasy,Adventure|Children|Fantasy animals|based|on|a|...,"[[0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,..."
2,3,Grumpier Old Men,1995,Comedy|Romance,Comedy|Romance sequel|moldy|old|old|age|old|me...,"[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,4,Waiting to Exhale,1995,Comedy|Drama|Romance,Comedy|Drama|Romance characters|chick|flick|gi...,"[[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,..."
4,5,Father of the Bride Part II,1995,Comedy,Comedy family|pregnancy|wedding|4th|wall|aging...,"[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
...,...,...,...,...,...,...
86532,288967,State of Siege: Temple Attack,2021,Action|Drama,Action|Drama,"[[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,..."
86533,288971,Ouija Japan,2021,Action|Horror,Action|Horror,"[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,..."
86534,288975,The Men Who Made the Movies: Howard Hawks,1973,Documentary,Documentary,"[[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,..."
86535,288977,Skinford: Death Sentence,2023,Crime|Thriller,Crime|Thriller,"[[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [6]:
rating_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119
...,...,...,...,...
33832157,330975,8340,2.0,1091583256
33832158,330975,8493,2.5,1091585709
33832159,330975,8622,4.0,1091581777
33832160,330975,8665,3.0,1091581765


In [7]:
time_tags = tags_data.drop(["userId", "tag"],axis=1)
# time_tags.rename({"timestamp":"tag_timestamp"}, inplace=True)
time_rating = rating_data.drop(["userId", "rating"], axis=1)
# time_rating.rename({"timestamp":"rating_timestamp"}, inplace=True)
print(len(time_rating),len(time_tags))

time_data = pd.concat([time_tags, time_rating])



33832162 2328315


In [8]:
time_data.groupby(["movieId"]).max()
time_data.groupby(["movieId"]).min()
print(time_data.groupby(["movieId"]).sum())
print(time_data.groupby(["movieId"]).count() < 2)

              timestamp
movieId                
1        92800244625366
2        35984924322279
3        15536719885246
4         2893165593814
5        16054159106026
...                 ...
288967       1689748357
288971       1689798322
288975       1689812351
288977       1689815902
288983       1689834886

[86537 rows x 1 columns]
         timestamp
movieId           
1            False
2            False
3            False
4            False
5            False
...            ...
288967        True
288971        True
288975        True
288977        True
288983        True

[86537 rows x 1 columns]


In [9]:
scaler = MinMaxScaler()
time_data["scaled time"] = scaler.fit_transform(time_data.groupby("movieId"))

time_data

  if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"):
  if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


KeyboardInterrupt: 