# Trial Summary 

Data preparation:
- Filter out genes that have fewer than 10 spots with non-zero expression
- Apply a log transformation to the expression values

Model:
- Neural Matrix Factorization
- RMSE training loss excluding zeros

Results:
- Valid RMSE: 
- Test RMSE:

# Imports

In [1]:
from os import path, listdir
from copy import deepcopy
import stlearn as st
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.optim as optim
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

import trainer_nmf as trainer
import data_nmf as get_data
from models import get_model
import tester_nmf as tester
from loss import *
from results_analysis import *

In [2]:
# Notebook-wide runtime configuration: plot font size, RNG seeding, compute device.
plt.rcParams.update({'font.size': 12})

# Seed the global NumPy and PyTorch generators so that anything drawing from
# them (e.g. parameter initialisation) repeats across "Restart & Run All".
# torch.manual_seed sets the seed on all devices, including CUDA.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


# Load Data 

In [3]:
# Data-preparation settings; all four are forwarded to get_data.main below.
# NOTE(review): the Trial Summary mentions a threshold of 10 non-zero spots,
# while min_cells is 177 here — confirm which value is intended.
min_counts, min_cells = 500, 177
apply_log = True
batch_size = 128  # also recorded in model_params

In [4]:
# Build the train / validation / test loaders from the settings defined above.
# get_data.main returns four values; the last one is not used in this notebook.
# NOTE(review): the logged "Train shape" equals the full "Data shape" —
# presumably held-out entries are masked rather than removed; confirm in data_nmf.
loader_options = dict(
    min_counts=min_counts,
    min_cells=min_cells,
    apply_log=apply_log,
    batch_size=batch_size,
    device=device,
)
dl_train, dl_valid, dl_test, _ = get_data.main(**loader_options)

  utils.warn_names_duplicates("var")


# spots: 1185 | # genes: 32285
New shape after filtering: (1185, 6279)
Log transformation step is finished in adata.X
Data shape: (7440615, 3)
Number of genes: 6279
Number of spots: 1185
Train shape:(7440615, 3)
Valid shape:(383754, 3)
Test shape:(426393, 3)
Start creating the DataSets
Start creating the DataLoaders
Finish loading the data


# Modelling

## Set HyperParameters

In [5]:
# Trial-level training settings (passed to get_model / trainer.train below).
model_name = 'NMF'
max_epochs = 150
early_stopping = 10  # presumably the patience, in epochs — confirm in trainer_nmf

# Hyperparameters handed to get_model. 'optimizer' must name a class in
# torch.optim because it is resolved with getattr(optim, ...) when the
# optimizer is built; 'learning_rate' becomes that optimizer's `lr`.
model_params = dict(
    learning_rate=0.1,
    optimizer="SGD",
    latent_dim=40,
    batch_size=batch_size,
)

## Build Model 

In [6]:
# Instantiate the model from its name, hyperparameters and the training loader.
model = get_model(model_name, model_params, dl_train)

# Look the optimizer class up by name in torch.optim, then bind it to the model.
optimizer_cls = getattr(optim, model_params['optimizer'])
optimizer = optimizer_cls(model.parameters(), lr=model_params['learning_rate'])

# Loss object shared by the training and test cells
# (per the Trial Summary: RMSE excluding zeros).
criterion = NON_ZERO_RMSELoss()

## Train Model 

In [7]:
# Fit the model. The validation loader is supplied through the trainer's
# `dl_test` argument; the held-out test loader is only used in the Test section.
# The call hands back the model together with a validation loss.
model, valid_loss = trainer.train(
    # what to train and how to optimise it
    model=model,
    model_name=model_name,
    optimizer=optimizer,
    criterion=criterion,
    # data
    dl_train=dl_train,
    dl_test=dl_valid,
    # run control
    max_epochs=max_epochs,
    early_stopping=early_stopping,
    device=device,
)

2022-09-27 12:29:24.270626: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-27 12:29:24.433514: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-27 12:29:24.433545: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-27 12:29:24.467287: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-27 12:29:25.388027: W tensorflow/stream_executor/platform/de

Training Results - Epoch[1] Avg loss: 2.24
Validation Results - Epoch[1] Avg loss: 2.29
Training Results - Epoch[2] Avg loss: 1.39
Validation Results - Epoch[2] Avg loss: 1.42
Training Results - Epoch[3] Avg loss: 0.73
Validation Results - Epoch[3] Avg loss: 0.74
Training Results - Epoch[4] Avg loss: 0.37
Validation Results - Epoch[4] Avg loss: 0.38
Training Results - Epoch[5] Avg loss: 0.36
Validation Results - Epoch[5] Avg loss: 0.37
Training Results - Epoch[6] Avg loss: 0.36
Validation Results - Epoch[6] Avg loss: 0.37
Training Results - Epoch[7] Avg loss: 0.36
Validation Results - Epoch[7] Avg loss: 0.37
Training Results - Epoch[8] Avg loss: 0.36
Validation Results - Epoch[8] Avg loss: 0.37
Training Results - Epoch[9] Avg loss: 0.36
Validation Results - Epoch[9] Avg loss: 0.37
Training Results - Epoch[10] Avg loss: 0.36
Validation Results - Epoch[10] Avg loss: 0.37
Training Results - Epoch[11] Avg loss: 0.36
Validation Results - Epoch[11] Avg loss: 0.37
Training Results - Epoch[12]

Validation Results - Epoch[96] Avg loss: 0.34
Training Results - Epoch[97] Avg loss: 0.33
Validation Results - Epoch[97] Avg loss: 0.34
Training Results - Epoch[98] Avg loss: 0.33
Validation Results - Epoch[98] Avg loss: 0.34
Training Results - Epoch[99] Avg loss: 0.33
Validation Results - Epoch[99] Avg loss: 0.34
Training Results - Epoch[100] Avg loss: 0.33
Validation Results - Epoch[100] Avg loss: 0.34
Training Results - Epoch[102] Avg loss: 0.33
Validation Results - Epoch[102] Avg loss: 0.34
Training Results - Epoch[103] Avg loss: 0.33
Validation Results - Epoch[103] Avg loss: 0.34
Training Results - Epoch[104] Avg loss: 0.33
Validation Results - Epoch[104] Avg loss: 0.34
Training Results - Epoch[105] Avg loss: 0.33
Validation Results - Epoch[105] Avg loss: 0.34
Training Results - Epoch[106] Avg loss: 0.33
Validation Results - Epoch[106] Avg loss: 0.34
Training Results - Epoch[108] Avg loss: 0.33
Validation Results - Epoch[108] Avg loss: 0.34
Training Results - Epoch[109] Avg loss: 

Engine run is terminating due to exception: 


KeyboardInterrupt: 

In [None]:
# Report the final train / validation scores on the log scale and exponentiated.
# NOTE(review): both scores are hard-coded and differ from the losses logged by
# the training cell (about 0.33 / 0.34); `valid_loss` returned by trainer.train
# is not used here — confirm where 0.67 comes from.
train_res, valid_res = 0.67, 0.67
print(
    f'Train final results (after log transform) = {train_res}',
    f'Train final results = {np.exp(train_res)}',
    f'Valid final results (after log transform) = {valid_res}',
    f'Valid final results = {np.exp(valid_res)}',
    sep='\n',
)

## Test 

In [None]:
# Evaluate the trained model on the held-out test loader with the same criterion.
# tester.test returns a pair: the test loss and a frame of per-entry predictions.
test_outputs = tester.test(
    model=model,
    criterion=criterion,
    dl_test=dl_test,
    device=device,
)
test_loss, df_test_preds = test_outputs
print(f'Test loss = {test_loss}')

# Results Analysis 

In [None]:
# Map targets and predictions back to the original expression scale.
# The loader output ("Log transformation step is finished in adata.X") is the
# message printed by stlearn's log1p wrapper, i.e. values were transformed with
# log(1 + x). The matching inverse is expm1; plain exp would overshoot by 1.
# NOTE(review): confirm in data_nmf that no other transform is applied.
df_test_preds['y_fixed'] = np.expm1(df_test_preds['y'])
df_test_preds['y_pred_fixed'] = np.expm1(df_test_preds['y_pred'])

# Columns used for the error below: the log-scale values, not the *_fixed ones.
y_col = 'y'
y_pred_col = 'y_pred'

# Signed error (target minus prediction), consumed by the analysis cells below.
df_test_preds['error'] = df_test_preds[y_col] - df_test_preds[y_pred_col]
df_test_preds.head()

## Errors Distribution 

In [None]:
# Presumably plots the distribution of the `error` column over all test
# entries — confirm in results_analysis.
error_distribution(df_test_preds)

In [None]:
# Same plot restricted to rows whose error is above -1e5.
# NOTE(review): errors here are on the log scale, so this threshold presumably
# keeps every row — confirm whether the cut-off is still meaningful.
mask = df_test_preds['error'].gt(-1e5)
error_distribution(df_test_preds, mask=mask)

## Spots Errors Distribution 

In [None]:
# Presumably summarises the test errors per spot — confirm in results_analysis.
spots_error_distribution(df_test_preds)

## Genes Errors Distribution 

In [None]:
# Presumably summarises the test errors per gene — confirm in results_analysis.
genes_error_distribution(df_test_preds)

## Errors Heat Map 

In [None]:
# Heat map of the test errors with symmetric colour limits.
# NOTE(review): `error` is computed on the log scale (losses around 0.3), so
# limits of +/-700 look far too wide for this trial — confirm the intended range.
heat_map_limit = 700
error_heat_map(df_test_preds, vmin=-heat_map_limit, vmax=heat_map_limit)