In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse

from cdhf.data import Data

# Location of the JSON export, relative to the notebook's working directory.
input_path = "../input/mmdata.json"

# Build the dataset wrapper, then ask it to load everything it knows about.
data = Data(input_path)
data.load_all()

In [None]:
# Channels need strictly more than this many membership rows to be kept.
MIN_CHANNEL_MEMBERS = 5

# One row per channel-membership record; every attribute of the record becomes
# a column (channel_id, user_id and msg_count are the ones used below).
df = pd.DataFrame.from_records([vars(cm) for cm in data.channel_members])

# Index each row by a "<channel_id>-<user_id>" key without mutating in place.
df = df.assign(index=df["channel_id"] + "-" + df["user_id"]).set_index("index")

# Count membership rows per channel and keep only sufficiently populated channels.
df_grouped_users = df.groupby(["channel_id"]).count()
members_per_channel = df_grouped_users["user_id"]
allowed_channels = df_grouped_users[members_per_channel > MIN_CHANNEL_MEMBERS].index.array

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the unfiltered frame (avoids SettingWithCopyWarning).
df = df[df["channel_id"].isin(allowed_channels)].copy()

# Map the string identifiers to dense integer codes usable as matrix indices.
df["u_id"] = df["user_id"].astype("category").cat.codes
df["c_id"] = df["channel_id"].astype("category").cat.codes

# Sorted lists of all user codes and channel codes, plus the message count of
# every remaining membership row.
users = list(np.sort(df["u_id"].unique()))
channels = list(np.sort(df["c_id"].unique()))
interactions = list(df["msg_count"])

# Row (user) and column (channel) coordinates for each membership row.
rows = df["u_id"].astype(int)
cols = df["c_id"].astype(int)

# Construct a sparse users x channels matrix whose stored values are the
# message counts.
data_sparse = sparse.csr_matrix(
    (interactions, (rows, cols)),
    shape=(len(users), len(channels)),
)

## Overall Sparsity Measure - OS
$$\begin{equation*} OS=1-\frac {N_{e}}{N_{u}\times N_{i}} \tag{8}\end{equation*}$$

where $N_{e}$ denotes the number of evaluations, $N_{u}$ the number of users and $N_{i}$ the number of items.

In [None]:
# Overall sparsity (OS), expressed as a percentage: one minus the ratio of
# recorded interactions to the total number of user/channel pairs.
observed_pairs = len(interactions)
possible_pairs = len(users) * len(channels)
sparsity = (1 - observed_pairs / possible_pairs) * 100
print(f'Sparsity {sparsity:3.2f}%')

## Users Specific Sparsity Measure - USS

$$ \begin{equation*} USS_{u}=1-\displaystyle \frac {n_{u}}{max _{u\in U}(n_{u})} \tag{9}\end{equation*} $$

where $n_{u}$ is the number of ratings given by user $u$ and $\max_{u\in U}(n_{u})$ is the maximum number of evaluations given by a single user in $U$, the set of all users.

In [None]:
# Per-user row counts: after count(), each column holds the number of non-null
# entries for that user, so "c_id" is the number of channel rows per user.
user_sparsity_df = df.groupby("u_id").count()

# The user with the most rows defines the reference maximum.
user_sparsity_max = user_sparsity_df["c_id"].max()

# User-specific sparsity (USS) as a percentage relative to that maximum.
user_sparsity_df["USS"] = 100 * (1 - user_sparsity_df["c_id"] / user_sparsity_max)
display(user_sparsity_df["USS"].describe())


## Items Specific Sparsity Measure - ISS

$$ \begin{equation*} ISS_{i}=1-\displaystyle \frac {n_{i}}{max _{i\in I}(n_{i})} \tag{10}\end{equation*} $$

where $n_{i}$ is the number of evaluations given to item $i$, $I$ is the set of all items, and $\max_{i\in I}(n_{i})$ is the maximum number of evaluations given to an item in $I$. Like the $USS$, the $ISS$ is a relative measure, but with respect to items.

In [None]:
# Per-channel row counts: after count(), each column holds the number of
# non-null entries for that channel, so "u_id" is the number of user rows per channel.
item_sparsity_df = df.groupby("c_id").count()

# The channel with the most rows defines the reference maximum.
item_rating_max = item_sparsity_df["u_id"].max()

# Item-specific sparsity (ISS) as a percentage relative to that maximum.
item_sparsity_df["ISS"] = 100 * (1 - item_sparsity_df["u_id"] / item_rating_max)
display(item_sparsity_df["ISS"].describe())

In [None]:
# Report how many distinct user codes and channel codes remain after filtering;
# these are the two dimensions passed as `shape` to the sparse matrix.
print(f"Number of users is {len(users)}")
print(f"Number of channels is {len(channels)}")
# Show the sparse matrix's own rich/plain representation.
display(data_sparse)