# Processing LastFM dataset
* Uses LastFM data available [here](http://ocelma.net/MusicRecommendationDataset/lastfm-1K.html) at the time of writing.
* 10 percent sampling
* Generate file assigning uid to original uid: uid_to_uid.csv
* Generate file assigning iid to original artist-song pair: iid_to_artistsong.csv
* Generate 3 folders: train, validation, test
* Each of the folders should in the end have X.npy, y.npy, seq_lens.npy, user_ids.npy (the last one is not explicitly needed for training but may be useful for debugging)
* The order of things is as follows:
    * Assign unique ids to users and items (keep track of original values - make one column for artist+song)
    * Convert time to unix epochs
    * Remove users with fewer than 12 interactions (so that at least 3 remain after the 10 percent sampling)
    * Sort each user's interaction by time so that the first thing that happened is also placed first, break ties deterministically
    * Add delta_t by removing the first interaction for each user
    * Make remaining items to be sequential - record to iid_to_artistsong.csv
        * Do that only after removing items that are not in train. This has to be done as 5% of items are removed from val-test - they would be random noise that would also lead to possibly noticeable memory waste in the embedding matrix. Thus from the original DataFrame all unique iid-artist-song combinations are obtained, joined together with the train_df from which all the unique items used in the experiments are obtained. A new factorised column is created and the conversion from these new indices to relevant artist-song pairs is made. The new table is then joined with the train_df, val_df and test_df again on iid and thus new index is supplied to these DataFrames.
    * Split into train-validation-test with overhangs of one item (for label)
        * Apply same logic as in dataset.py
    * for each subset:
        * Remove items from validation and test if they are not present in train
        * Split into X,y
        * Place into numpy arrays 20 interactions at a time, apply padding if needed
        * Obtain seq_lens and user_ids
        * Save

## Imports 

In [None]:
import pandas as pd
import numpy as np
import time
import datetime
from pathlib import Path
from IPython.display import display, HTML
from itertools import compress
import sys
import os
from importlib import reload

## Settings

In [None]:
project_root = Path("/Users/nknyazev/Documents/Delft/Thesis/temporal") # Specify your own project root
data_root = project_root.joinpath("data")
code_root = project_root.joinpath("code")
input_path = data_root.joinpath("original/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv")
output_dir = data_root.joinpath("processed/final/lastfm_10_pc/")
input_columns = ["og_user", "og_time", "artist_code", "artist", "song_code", "song"]

## Additional imports from own modules 

In [None]:
sys.path.append(str(code_root))
import model.utils.datasplit
reload(model.utils.datasplit)
from model.utils.datasplit import train_val_test_split_train_overlapping, remove_unseen_items_in_train, generate_big_hop_numpy_files

## Load dataset as pandas DataFrame
Because of the way the file is encoded, pandas discards some lines when reading it.
The file is therefore read manually, line by line.

In [None]:
# Parse the raw TSV manually: one list of tab-separated fields per input line.
# The encoding is given explicitly so that decoding does not depend on the
# platform's locale default (assumes the dump is UTF-8 - TODO confirm).
# NOTE(review): rstrip() without arguments also strips trailing tabs, so a row
# whose last field is empty ends up with fewer fields than input_columns.
with open(input_path, encoding="utf-8") as input_file:
    lines = [line.rstrip().split("\t") for line in input_file]

In [None]:
df = pd.DataFrame(lines, columns=input_columns)

### Clear up memory

In [None]:
del(lines)

### Preview of the data

In [None]:
df.head()

## Assign unique ids to users and songs

### uid

In [None]:
# Numeric user id: drop the first five characters of the original id and parse the rest as an int
# (presumably the ids look like "user_000001" - verify against the raw file).
df["uid"] = [int(x[5:]) for x in df["og_user"]]

### iid 

In [None]:
# One integer id per distinct (artist, song) pair: the group number assigned by groupby.
df["iid"] = df.groupby(["artist", "song"]).ngroup()

### Intermediary results

In [None]:
df.head()

In [None]:
print("Number of unique users: {}".format(df["uid"].nunique()))
print("Number of unique items: {}".format(df["iid"].nunique()))

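## Remove users with fewer than 12 interactions
With 10 percent sampling a user with 12 interactions is left with 3, which is the minimum needed:<br/>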
One interaction is needed for delta_t<br/>
One interaction is needed to produce a label<br/>
At least one more interaction is needed to have an entry in X

### All interactions grouped by user id

In [None]:
grouped = df.groupby("uid")

### Find indices for all interactions for all users who have too few interactions

In [None]:
# For every user with fewer than 12 interactions: the row labels of that user's interactions
cols_for_users_under_12 = [
    row_labels for row_labels in grouped.groups.values() if len(row_labels) < 12
]
# Flat list of the DataFrame row labels that have to be dropped
flattened = []
for row_labels in cols_for_users_under_12:
    flattened.extend(row_labels)
print("Found {} users summing to {} interactions".format(len(cols_for_users_under_12), len(flattened)))

### Remove the specified rows from the DataFrame

In [None]:
df = df.drop(flattened)

## Sort each user's interactions in time (oldest first). Break ties non-randomly

### All interactions grouped by user id

In [None]:
grouped = df.groupby("uid")

### Convert time into unix epoch format

In [None]:
# Function to convert iso date to unix epoch
def iso_to_epoch(iso):
    """Convert an ISO-8601 UTC timestamp string into unix epoch seconds.

    Args:
        iso: Timestamp formatted as ``%Y-%m-%dT%H:%M:%SZ``; the trailing ``Z`` marks UTC.

    Returns:
        Whole seconds since 1970-01-01T00:00:00Z as an int.

    Raises:
        ValueError: If ``iso`` does not match the expected format.
    """
    datetime_object = datetime.datetime.strptime(iso, '%Y-%m-%dT%H:%M:%SZ')
    # strptime yields a naive datetime, and timestamp() would interpret a naive value as
    # local time. Attaching UTC explicitly makes the result independent of the machine's
    # timezone and of daylight-saving changes, so no fixed offset correction is needed.
    return int(datetime_object.replace(tzinfo=datetime.timezone.utc).timestamp())

In [None]:
# Unix timestamp column: iso_to_epoch applied to every original time string, in row order
epoch_col = list(map(iso_to_epoch, df["og_time"]))
df["t"] = epoch_col

In [None]:
display(df.head())
print("Time is sorted but in the reverse order")

### Drop unnecessary columns 

In [None]:
df = df.drop(columns=["og_user", "og_time", "artist_code", "song_code"])

### Reordering each user

In [None]:
# While this reordering is faster,
# in case of zero gaps items are (likely) entered from most recent to the oldest and sorting using index will
# be different from inverting each user's history.
# df = df.reset_index().sort_values(by=["uid", "t", "index"]).drop(columns="index").reset_index(drop=True)

# Reverse each user's history by relabelling rows: every user's block of index labels is
# reversed, the reversed labels are assigned back to the rows, and the frame is sorted by label.
# NOTE(review): `df.index = new_index_col` assigns labels positionally, so this presumably relies on
# the rows already being grouped by user in the same order in which groupby iterates - confirm.
grouped = df.groupby("uid")
new_index_col = []
for uid, items in grouped:
    # This user's index labels, last row first
    new_indices = items.index[::-1]
    new_index_col.extend(new_indices)
    # Progress report for uids with remainder 1 modulo 100
    if uid % 100 == 1:
        print("Processed user {}.".format(uid))
df.index = new_index_col
# Sorting by the relabelled index inverts the row order within each user
# (given that the labels were increasing within a user beforehand)
df = df.sort_index()

### Verify the time is now sorted

In [None]:
df.head()

### OPTIONAL: Sampling is done so that each user retains 10% of the steps where predictions are made. To minimise discarding of users: if a user has 12 items and we have to pick 3 (one used for the delta_t shift, one for features and the last for the label), one could view this as having 10 possible starting points for this sequence of three - sufficient to sample 10%.

In [None]:
# Fixed seed so that the sampled windows are reproducible
np.random.seed(1234)
grouped = df.groupby("uid")
indices_to_keep = []
for uid, interactions in grouped:
    n_total = len(interactions)
    # Keep 10% of the interactions beyond the first two, plus those two
    n_kept = 2 + int(np.floor(0.1 * (n_total - 2)))

    # Pick a random contiguous window of n_kept rows that still fits into the history
    latest_start = n_total - n_kept
    window_start = np.random.randint(0, latest_start+1, 1)[0]
    window_end = window_start + n_kept
    indices_to_keep.extend(interactions.index[window_start:window_end])

In [None]:
df = df.loc[indices_to_keep]

### Resulting average user sequence length

In [None]:
np.mean([len(x) for uid, x in df.groupby("uid")])

## Calculate delta_t's 

### Array to keep track of indices of 1st interaction for each user - these indices will be removed


In [None]:
remove_interaction_indices = []

### Array to keep track of time deltas

In [None]:
time_deltas = []

### Process each user - this should produce the same total number of interactions but each user's first interaction will have NaN in time_deltas

In [None]:
# Users with more than two interactions lose only their first interaction (it has no
# predecessor and therefore a NaN delta); all other users are removed entirely.
grouped = df.groupby("uid")
for uid, interactions in grouped:
    if len(interactions) <= 2:
        remove_interaction_indices.extend(interactions.index)
        print("Removed interactions directly.")
    else:
        remove_interaction_indices.append(interactions.index[0])
        timestamps = interactions["t"]
        # Difference to the previous interaction of the same user; NaN for the first one
        time_delta_with_na = timestamps - timestamps.shift(1)
        time_deltas.extend(time_delta_with_na)
    if uid % 50 == 1:
        print("Completed user {}.".format(uid))

### Remove nan's

In [None]:
# Keep every delta that is not NaN, preserving the original order
time_deltas_wo_na = [delta for delta in time_deltas if not np.isnan(delta)]

### Sanity Check: len of original df - number of nan's = len of new df

In [None]:
assert len(df) - len(remove_interaction_indices) == len(time_deltas_wo_na)

### Remove interactions without time deltas 

In [None]:
df = df.drop(remove_interaction_indices)

### Add time deltas to output

In [None]:
df["dt"] = np.array(time_deltas_wo_na, dtype=np.int32)

In [None]:
# Percentile levels in ascending order: 0.5, every integer from 1 to 99, and 99.5
percentiles_to_consider = [0.5] + list(range(1, 100)) + [99.5]
# Map each level to the corresponding percentile of the time deltas
percentiles = {}
for level in percentiles_to_consider:
    percentiles[level] = np.percentile(df["dt"], level)

In [None]:
# Report every percentile in a readable unit together with the number of interactions
# on its outer side (at or below it for levels under 50, at or above it otherwise)
for level, value in percentiles.items():
    if value < 60:
        delta_t = str(int(value)) + " seconds"
    else:
        delta_t = str(round(value/60, 1)) + " minutes (" + str(round(value/60/60, 1)) + " hours)"
    if level < 50:
        count = np.sum(df["dt"] <= value)
    else:
        count = np.sum(df["dt"] >= value)
    print("Percentile: {} - {}. {} interactions".format(level, delta_t, count))

### See what the current progress looks like - can manually inspect that everything so far was completed correctly

In [None]:
df.head()

## Split data into three dataframes: train, validation, test - 0.9, 0.05, 0.05 of each user's sequence respectively

`train_val_test_split_train_overlapping` from `model.utils.datasplit` of this repo.

In [None]:
train_df, val_df, test_df = train_val_test_split_train_overlapping(df=df[["uid", "iid", "dt"]], 
                                                                   col_names=["uid", "iid", "dt"],
                                                                  split=[0.9, 0.05, 0.05])

In [None]:
print("Original DataFrame Length - {}\nResulting DataFrame lengths:\nTrain - {}\nValidation - {}\nTest - {}\nTotal lengths - {}".format(len(df), len(train_df), len(val_df), len(test_df), len(train_df)+len(val_df)+len(test_df)))

## For validation/test remove interactions with items not present in train

In [None]:
og_val_items = set(val_df["iid"])
og_ts_items = set(test_df["iid"])
og_val_ts_items = og_val_items.union(og_ts_items)

In [None]:
val_df = remove_unseen_items_in_train(train_df=train_df, test_df=val_df)
test_df = remove_unseen_items_in_train(train_df=train_df, test_df=test_df)

In [None]:
# Distinct items currently present in the validation and test DataFrames
val_items = set(val_df["iid"])
ts_items = set(test_df["iid"])
val_ts_items = val_items.union(ts_items)
# Number of distinct items that are in og_val_ts_items but no longer in validation/test
items_removed = len(og_val_ts_items)-len(val_ts_items)
items_in_original_df = df["iid"].nunique()
# The items are compared for validation and test, so the message names those two subsets
print("Removed {} unique items from validation and test, which is {} of the original dataset's items.".format(items_removed,round(items_removed/items_in_original_df, 2)))

## Need to create linkage between indices in train_df and original df
* Get unique item id's from train_df
* Join these unique id's with original dataframe's iid-artist-song slice
* Factorize on iid
* Save factorized iid, artist, song as csv
* Join factorized iid, old iid with train_df, val_df, test_df. Then drop old iid column from each.

### DataFrame with unique items as indices, uid, dt as values

In [None]:
unique_iid_train_df = train_df.groupby("iid").first()

### DataFrame with unique items as indices, artist, song name as values

In [None]:
unique_iid_artist_song = df[["iid", "artist", "song"]].groupby("iid").first()

In [None]:
len(unique_iid_artist_song.index)

### Merge the two above on the iid index

In [None]:
unique_iid_artist_song_uid_t = unique_iid_train_df.join(unique_iid_artist_song)[["artist", "song"]]

### Create new column with factorized iid

In [None]:
len(unique_iid_artist_song_uid_t.index)

In [None]:
unique_iid_artist_song_uid_t["new_iid"] = pd.factorize(unique_iid_artist_song_uid_t.index)[0]

In [None]:
unique_iid_artist_song_uid_t.tail()

### Save data to a separate file containing explanations what artist-song pair each item id stands for

#### Specify output path

In [None]:
iid_to_artistsong_path = output_dir.joinpath("iid_to_artistsong.csv")

#### Save

In [None]:
# Make sure the target directory exists first - to_csv does not create missing folders
iid_to_artistsong_path.parent.mkdir(parents=True, exist_ok=True)
# Rows are written as new_iid, artist, song - without a header line and without the index
unique_iid_artist_song_uid_t.to_csv(iid_to_artistsong_path, columns=["new_iid", "artist", "song"], header=False, index=False)

### Join each of the train/validation/test DataFrames on iid

In [None]:
train_df = train_df.join(unique_iid_artist_song_uid_t[["new_iid"]], on="iid").drop("iid", axis=1).rename(columns={"new_iid": "iid"})

In [None]:
val_df = val_df.join(unique_iid_artist_song_uid_t[["new_iid"]], on="iid").drop("iid", axis=1).rename(columns={"new_iid": "iid"})

#### Verify before-after

In [None]:
test_df.tail()

In [None]:
test_df = test_df.join(unique_iid_artist_song_uid_t[["new_iid"]], on="iid").drop("iid", axis=1).rename(columns={"new_iid": "iid"})

In [None]:
test_df.tail()

In [None]:
# Write the last time gap to csv - needed for session based analysis
# Only users with more than one row in test_df are included
sufficient_interaction_mask = test_df.groupby("uid").uid.transform("count") > 1
# The columns are selected with a list: selecting several GroupBy columns with a bare
# tuple of names was deprecated and is rejected by pandas 2.x
test_df[sufficient_interaction_mask].groupby("uid")[["uid", "dt"]].tail(1).to_csv(output_dir.joinpath("last_dt.csv"), index=False, header=True)

## Create X, y, seq_lens and user_ids out of these three DataFrames

In [None]:
train_array = []
val_array = []
test_array = []
dfs = [train_df, val_df, test_df]
arrays = [train_array, val_array, test_array]

### Iterate over each of the three DataFrames and create 4 numpy arrays that are added to a list

In [None]:
# Pair every DataFrame with the list that collects its arrays
for dataframe, target in zip(dfs, arrays):
    X, y, seq_lens = generate_big_hop_numpy_files(dataframe, features=["uid", "iid", "dt"], save=False)
    # Fourth entry is X[:,0,0] - presumably the uid feature at the first step of every
    # sequence; depends on the layout produced by generate_big_hop_numpy_files
    target.extend([X, y, seq_lens, X[:,0,0]])

    

## Save these into output_dir/{subset} as .npy files

In [None]:
subset_names = ["train", "validation", "test"]
file_types = ["X", "y", "seq_lens", "user_ids"]
file_names = [file_type + ".npy" for file_type in file_types]
# One folder per subset; every array of a subset goes into its own .npy file
for subset_name, subset_arrays in zip(subset_names, arrays):
    target_folder = output_dir.joinpath(subset_name)
    try:
        # parents=True also creates output_dir itself when it does not exist yet
        target_folder.mkdir(parents=True)
    except FileExistsError:
        print("Folder {} already exists.".format(str(target_folder)))
    for file_name, array in zip(file_names, subset_arrays):
        file_path = str(target_folder.joinpath(file_name))
        print("Writing {}".format(file_path))
        np.save(file_path, array)
        