In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from nb_007a import *

# MovieLens

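Data available from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip. Download the archive and extract it under `data/` so that the CSV files are found in `data/ml-latest-small/`.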

In [None]:
# Folder the CSV files below are read from (expected to hold the extracted MovieLens archive).
PATH = Path('data/ml-latest-small/')

Ratings table: maps each (user, movie) pair to a rating.

In [None]:
# Load the ratings table into a DataFrame and preview its first rows.
ratings = pd.read_csv(PATH/'ratings.csv')
ratings.head()

Table to get the titles of the movies.

In [None]:
# Load the movies table into a DataFrame and preview its first rows.
movies = pd.read_csv(PATH/'movies.csv')
movies.head()

In [None]:
# Column names of the ratings table; `ColabFilteringDataset.from_df` falls back to the
# first three columns as user, item and rating when no names are given.
ratings.columns

In [None]:
# Check the type of the object returned by `pd.read_csv`.
type(movies)

In [None]:
#export
def categorify(col):
    "Encode a dataframe column as categories: returns (unique values, value->index dict, int64 index array)."
    categories = col.unique()
    # Each distinct value gets its position in `categories` (order of first appearance).
    lookup = dict(zip(categories, range(len(categories))))
    codes = np.array([lookup[value] for value in col], dtype=np.int64)
    return categories, lookup, codes

In [None]:
#export
@dataclass
class ColabFilteringDataset():
    "Dataset of ((user_id, item_id), rating) samples for collaborative filtering."
    # `user` / `item`: the unique raw values found in the source columns.
    # `user_dict` / `item_dict`: raw value -> contiguous integer index.
    # `user_ids` / `item_ids`: one integer index per rating row.
    # `ratings`: one target per rating row (a float32 numpy array when built by `from_df`).
    user:Collection
    user_dict:dict
    user_ids:Collection
    item:Collection
    item_dict:dict
    item_ids:Collection
    ratings:np.ndarray

    def __len__(self):
        "Number of rating rows in this dataset."
        return len(self.ratings)

    def __getitem__(self, idx):
        "Return `((user_id, item_id), rating)` for row `idx`."
        return (self.user_ids[idx], self.item_ids[idx]), self.ratings[idx]

    @classmethod
    def from_df(cls, rating_df, pct_val=0.2, user_name=None, item_name=None, rating_name=None):
        "Randomly split `rating_df` into a `(train, valid)` pair of datasets, with a `pct_val` fraction of rows in valid."
        # Column names default to the first three columns: user, item, rating.
        if user_name is None:   user_name = rating_df.columns[0]
        if item_name is None:   item_name = rating_df.columns[1]
        if rating_name is None: rating_name = rating_df.columns[2]
        user,user_dict,user_ids = categorify(rating_df[user_name])
        item,item_dict,item_ids = categorify(rating_df[item_name])
        ratings = np.array(rating_df[rating_name], dtype=np.float32)
        # Unseeded shuffle: the split differs between runs unless the caller seeds numpy first.
        idx = np.random.permutation(len(ratings))
        cut = int(pct_val * len(ratings))
        val_idx,trn_idx = idx[:cut],idx[cut:]
        # Both datasets share the full user/item vocabularies, so indices agree between them.
        return (cls(user, user_dict, user_ids[trn_idx], item, item_dict, item_ids[trn_idx], ratings[trn_idx]),
                cls(user, user_dict, user_ids[val_idx], item, item_dict, item_ids[val_idx], ratings[val_idx]))

    @classmethod
    def from_csv(cls, csv_name, pct_val=0.2, user_name=None, item_name=None, rating_name=None):
        "Read `csv_name` with pandas and build the `(train, valid)` datasets exactly as `from_df` does."
        df = pd.read_csv(csv_name)
        # Forward by keyword: `from_df` takes `pct_val` before `user_name`, so a positional
        # call in this method's own argument order would swap the two.
        return cls.from_df(df, pct_val=pct_val, user_name=user_name, item_name=item_name, rating_name=rating_name)

In [None]:
# Build the (train, valid) datasets with the default 20% validation share.
# NOTE(review): `from_df` shuffles with `np.random.permutation` and no seed is set in this
# notebook, so the rows that land in each split change from run to run.
train_ds, valid_ds = ColabFilteringDataset.from_df(ratings)

In [None]:
# Row counts: the full ratings table, then the train and validation splits.
len(ratings), len(train_ds), len(valid_ds)

In [None]:
# Batch size handed to `DataBunch.create` below.
bs = 64
# Bundle the two datasets for training.
# NOTE(review): `num_workers=0` presumably keeps data loading in the main process; confirm against `DataBunch.create`.
data = DataBunch.create(train_ds, valid_ds, bs=bs, num_workers=0)

In [None]:
#export
def get_embedding(ni,nf):
    "Build an `nn.Embedding` with `ni` rows of size `nf`, its weights drawn uniformly from [-0.05, 0.05]."
    bound = 0.05
    embedding = nn.Embedding(ni, nf)
    # Overwrite the default initialisation in place.
    weights = embedding.weight.data
    weights.uniform_(-bound, bound)
    return embedding

In [None]:
#export
class EmbeddingDotBias(nn.Module):
    "Dot product of user and item embeddings plus per-user and per-item biases, squashed into a score range."
    def __init__(self, n_factors, n_users, n_items, min_score, max_score):
        super().__init__()
        self.min_score = min_score
        self.max_score = max_score
        # Factor embeddings first, then the single-column bias embeddings.
        self.u_weight = get_embedding(n_users, n_factors)
        self.i_weight = get_embedding(n_items, n_factors)
        self.u_bias = get_embedding(n_users, 1)
        self.i_bias = get_embedding(n_items, 1)

    def forward(self, users, items):
        "Predict a score for each (user, item) index pair."
        user_factors, item_factors = self.u_weight(users), self.i_weight(items)
        # Summing the element-wise product over dim 1 gives one dot product per pair.
        logits = (user_factors * item_factors).sum(1)
        logits = logits + self.u_bias(users).squeeze() + self.i_bias(items).squeeze()
        # Sigmoid maps to (0, 1); rescale to (min_score, max_score).
        score_range = self.max_score - self.min_score
        return torch.sigmoid(logits) * score_range + self.min_score

In [None]:
# Size of each user/item embedding vector.
n_factors = 50
# Embedding table sizes come from the dataset vocabularies; the model's sigmoid output is
# rescaled with min_score=0 and max_score=5.
model = EmbeddingDotBias(n_factors, len(train_ds.user), len(train_ds.item), 0, 5) 

In [None]:
# Pair the data with the model, then train with mean squared error as the loss function.
learn = Learner(data, model)
learn.loss_fn = F.mse_loss

In [None]:
# NOTE(review): presumably runs a learning-rate range test whose results are plotted in the
# next cell; `lr_find` is defined outside this notebook — confirm.
learn.lr_find()

In [None]:
# NOTE(review): presumably plots loss against learning rate as recorded by `lr_find` — confirm.
learn.recorder.plot()

In [None]:
# NOTE(review): presumably 5 epochs with a peak learning rate of 1e-2, using weight decay 0.1;
# confirm the positional arguments against `Learner.fit_one_cycle`.
learn.fit_one_cycle(5, 1e-2, wd=0.1)

In [None]:
# NOTE(review): 0.77 is presumably an MSE value copied by hand from the training output
# above, with the square root giving an RMSE; it will go stale if the run changes — confirm.
math.sqrt(0.77)