In [None]:
# import libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Notebook display configuration.
from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Use a larger default font for all matplotlib figures.
plt.rcParams["font.size"] = 14

In [None]:
# Load the three MovieLens-1M tables. The files have no header row, so the
# column names are supplied here. A multi-character separator ("::") is only
# supported by pandas' python parser engine; requesting it explicitly avoids
# the ParserWarning pandas emits when it has to fall back from the C engine.
df_ratings = pd.read_csv(
    "./Dataset/ML-1M/ratings.dat",
    sep="::",
    names=["userId", "itemId", "rating", "timestamp"],
    engine="python",
)
# NOTE(review): the stock ML-1M movies.dat is ISO-8859-1 encoded (accented
# characters in some titles) and fails to decode as UTF-8 — confirm this
# matches the local copy of the file.
df_movies = pd.read_csv(
    "./Dataset/ML-1M/movies.dat",
    sep="::",
    names=["movieId", "title", "genres"],
    engine="python",
    encoding="latin-1",
)
# NOTE(review): "ocupation" is misspelled; the name is kept unchanged because
# other cells may already reference this column.
df_users = pd.read_csv(
    "./Dataset/ML-1M/users.dat",
    sep="::",
    names=["userId", "gender", "age", "ocupation", "zip-code"],
    engine="python",
)

In [None]:
# Preview the first five rows of each table. Each statement is a separate
# top-level expression, so which of them are rendered depends on the
# notebook's ast_node_interactivity setting.
df_ratings.head(5)
df_movies.head(5)
df_users.head(5)

In [None]:
# Dataset dimensions, all derived from the ratings table.
# Distinct users / items that appear in at least one rating.
nb_users = df_ratings["userId"].nunique()
nb_items = df_ratings["itemId"].nunique()
# Each row of df_ratings is one rating, so the number of ratings is the row
# count of df_ratings (not of df_users, which would count users instead).
nb_ratings = len(df_ratings)

In [None]:
# Sparsity = share of the (user x item) matrix that has no rating.
matrix_cells = float(nb_users * nb_items)
sparsity = 1 - nb_ratings / matrix_cells
print(sparsity)

In [None]:
# Print the headline figures computed in the previous cells.
print("Basic Overview:")
for label, value in (
    ("- Nb of users:", nb_users),
    ("- Nb of items:", nb_items),
    ("- Nb of ratings:", nb_ratings),
    ("- Data sparsity:", sparsity),
):
    print(label, value)

In [None]:
# Series indexed by userId; each value is the list of itemIds that user rated.
items_consumed_by_user = (
    df_ratings
    .groupby("userId")["itemId"]
    .apply(list)
)

print(items_consumed_by_user, items_consumed_by_user.shape, sep="\n")

In [None]:
# userId -> number of items that user has rated.
users_hitory = {
    user_id: len(items_consumed_by_user[user_id])
    for user_id in df_ratings["userId"].unique()
}

In [None]:
# Per-user rating counts in ascending order, drawn as one dot per user.
distribution = sorted(users_hitory.values())
plt.plot(distribution, ".", color="blue")
plt.ylabel("Amount of movies watched")
plt.xlabel("Users")
plt.show()

In [None]:
# Per-user rating counts as an integer array, for summary statistics.
users_hitory_nd = np.array(list(map(int, users_hitory.values())))

print("Specific information:")
for label, value in (
    ("- Mean: ", users_hitory_nd.mean()),
    ("- Median: ", np.median(users_hitory_nd)),
    ("- Min: ", users_hitory_nd.min()),
    ("- Max: ", users_hitory_nd.max()),
    ("- std: ", users_hitory_nd.std()),
):
    print(label, value)

In [None]:
# Series indexed by itemId; each value is the list of userIds who rated it.
users_who_watched = (
    df_ratings
    .groupby("itemId")["userId"]
    .apply(list)
)

In [None]:
# Show the first five items together with the users who rated them.
users_who_watched.head(5)

In [None]:
# itemId -> number of users who rated that item.
# The count must come from users_who_watched (indexed by itemId). Looking the
# item id up in items_consumed_by_user, which is indexed by userId, would
# silently return the history length of an unrelated user.
items_popularity = {
    item_id: len(users_who_watched[item_id])
    for item_id in df_ratings["itemId"].unique()
}

In [None]:
# Per-item rating counts in ascending order, drawn as one dot per item.
distribution = sorted(items_popularity.values())
plt.plot(distribution, ".", color="blue")
# This is the item-popularity plot: the x axis enumerates items and the
# y axis is how many users rated each one.
plt.ylabel("Amount of users who rated the item")
plt.xlabel("Items")
plt.show()

In [None]:
# Per-item counts as an integer array, for summary statistics.
items_popularity_nd = np.array(list(map(int, items_popularity.values())))

print("Specific information:")
for label, value in (
    ("- Mean: ", items_popularity_nd.mean()),
    ("- Median: ", np.median(items_popularity_nd)),
    ("- Min: ", items_popularity_nd.min()),
    ("- Max: ", items_popularity_nd.max()),
    ("- std: ", items_popularity_nd.std()),
):
    print(label, value)