In [300]:
import matplotlib.pyplot as plt
import torch.utils.data as data
import torch.optim as optim
import torch.nn as nn
import pandas as pd
import numpy as np
import torch


In [301]:
torch.cuda.set_device(0)
device = torch.device("cuda")
torch.cuda.is_available()


True

In [302]:
df_track_storage = pd.read_json("data/track_storage.jsonl", lines=True)
df_sessions = pd.read_json("data/sessions.jsonl", lines=True)
df_artists = pd.read_json("data/artists.jsonl", lines=True)
df_tracks = pd.read_json("data/tracks.jsonl", lines=True)
df_users = pd.read_json("data/users.jsonl", lines=True)


In [303]:
df = pd.merge(
    df_sessions,
    df_tracks,
    left_on="track_id",
    right_on="id"
).drop(["id"], axis=1)

df = df_sessions[["timestamp", "track_id"]]
df['timestamp'] = pd.to_datetime(df['timestamp']).dt.date
df = df.groupby(['track_id', 'timestamp']).size().reset_index(name='count')

df.columns


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['timestamp'] = pd.to_datetime(df['timestamp']).dt.date


Index(['track_id', 'timestamp', 'count'], dtype='object')

In [305]:
def prepare_data(dataset, lookback=7):
    X, y = [], []
    for i in range(len(dataset)-lookback):
        feature = dataset[i:i+lookback]
        target = dataset[i+1:i+lookback+1]
        X.append(feature)
        y.append(target)
    return X, y


X, y = [], []

for trakc_id in list(df["track_id"].unique()[:10]):
    df_track_tmp = df.loc[df["track_id"] == track_id].sort_values(by="timestamp").drop(["track_id", "timestamp"], axis=1)
    df_track_tmp = df_track_tmp.values.astype('float32')
    X_tmp, y_tmp = prepare_data(df_track_tmp)
    X += X_tmp
    y += y_tmp

X = torch.tensor(X)
y = torch.tensor(y)


In [306]:
def create_dataset(dataset, lookback):
    X, y = [], []
    for i in range(len(dataset)-lookback):
        feature = dataset[i:i+lookback]
        target = dataset[i+1:i+lookback+1]
        X.append(feature)
        y.append(target)
    return torch.tensor(X), torch.tensor(y)


if False:
    track_id = "000xYdQfIZ4pDmBGzQalKU"
    df_track_tmp = df.loc[df["track_id"] == track_id].sort_values(by="timestamp").drop(["track_id", "timestamp"], axis=1)
    df_track_tmp = df_track_tmp.values.astype('float32')

    X, y = create_dataset(df_track_tmp, lookback=7)

    train_size = int(len(X) * 0.8)
    test_size = len(X) - train_size
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]


train_size = int(len(X) * 0.8)
test_size = len(X) - train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]


In [307]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

(torch.Size([96, 7, 1]),
 torch.Size([96, 7, 1]),
 torch.Size([24, 7, 1]),
 torch.Size([24, 7, 1]))

In [308]:
class SuperMusicModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=50, num_layers=1, batch_first=True)
        self.linear = nn.Linear(50, 1)
    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.linear(x)
        return x
 

model = SuperMusicModel().cuda()
optimizer = optim.Adam(model.parameters())
loss_fn = nn.MSELoss()

BATCH_SIZE = 8

train_dataset = data.TensorDataset(X_train, y_train)
train_loader = data.DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)

test_dataset = data.TensorDataset(X_test, y_test)
test_loader = data.DataLoader(test_dataset, shuffle=True, batch_size=BATCH_SIZE)


In [309]:
model.train()

epochs_losses = []
for epoch in range(1000):
    epoch_lossess = []
    for x, y in train_loader:

        x = x.cuda()  # .to(device)
        y = y.cuda()  # .to(device)

        preds = model(x)

        optimizer.zero_grad()

        loss = loss_fn(preds, y)
        loss.backward()

        epoch_lossess.append(loss.item())
        optimizer.step()

    loss_mean = np.array(epoch_lossess).mean()
    epochs_losses.append(loss_mean)

loss_mean = np.array(epochs_losses).mean()
loss_mean


0.14031983588822186

# Walidacja przy założeniu, że model ma przewidzieć ostatnie 6 dni oraz 1 przyszły

In [310]:
model.eval()

with torch.no_grad():
    correct, all_ = 0, 0
    for X, labels in test_loader:

        X = X.to(device)
        labels = labels.to(device)

        preds = model(X)

        for t in torch.round(preds) == labels:
            if sum(t) == 7:
                correct += 1
            all_ += 1

try:
    print(f'Valid accuracy: {round(correct/all_, 2)}')
except TypeError:
    print(f'Valid accuracy: {correct/all_, 2}')


Valid accuracy: 0.08


# Walidacja przy założeniu, że model ma przewidzieć 1 przyszły dzień

In [311]:
model.eval()

with torch.no_grad():
    correct, all_ = 0, 0
    for X, labels in test_loader:

        X = X.to(device)
        labels = labels.to(device)

        preds = model(X)

        for t in torch.round(preds) == labels:
            if t[-1] == True:
                correct += 1
            all_ += 1

try:
    print(f'Valid accuracy: {round(correct/all_, 2)}')
except TypeError:
    print(f'Valid accuracy: {correct/all_, 2}')


Valid accuracy: 1.0


### Warto zwrócić uwagę na fakt, że model został do tej pory wytrenowany tylko na 10 utworach oraz nie jest dostrojony - co zostanie poprawione w najbliższych dniach