# 3D Protein Pose Recovery

## (0) Setup Colab

In [1]:
import sys

# True when this kernel is hosted on Google Colab. The Colab runtime has the
# `google.colab` package loaded, so the flag no longer needs to be edited by
# hand when the notebook is moved between Colab and a local machine.
running_on_colab = "google.colab" in sys.modules

In [2]:
from pathlib import Path

if running_on_colab:
    # create directories
    !mkdir --parents data/5j0n
    !mkdir training

    !wget -c https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh  #Miniconda3-py37_4.8.3-Linux-x86_64.sh
    !chmod +x Miniconda3-latest-Linux-x86_64.sh
    !bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local

    # save the model on the drive
    from google.colab import drive
    drive.mount("/content/drive")
    #!cd 'drive/My Drive'
    !mkdir --parents 'drive/My Drive/ModelsProtein'

    import sys
    # clone repo
    !rm -r protein-reconstruction
    !git clone https://github.com/JelenaBanjac/protein-reconstruction.git
    !cd protein-reconstruction
    #sys.path.append("protein-reconstruction")

    !conda env create -f protein-reconstruction/environment.yml 
    #sys.path.append("/usr/local/envs/protein_reconstruction/lib/python3.6/site-packages") 
    !/usr/local/envs/protein_reconstruction/bin/pip install tensorflow-graphics-gpu
    !/usr/local/envs/protein_reconstruction/bin/pip install tensorflow-gpu

else:
    import sys
    sys.path.append("..") 
    
    DATA_DIR        = "../data"

--2020-08-25 12:59:59--  https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.131.3, 104.16.130.3, 2606:4700::6810:8203, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.131.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93052469 (89M) [application/x-sh]
Saving to: ‘Miniconda3-latest-Linux-x86_64.sh’


2020-08-25 12:59:59 (216 MB/s) - ‘Miniconda3-latest-Linux-x86_64.sh’ saved [93052469/93052469]

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ | done
Solving environment: - done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - _libgcc_mutex==0.1=main
    - ca-certificates==2020.1.1=0
    - certifi==2020.4.5.1=py38_0
    - cffi==1.14.0=py38he30daa8_1
    - chardet==3.0.4=py38_1003
    - conda-package-handling==1.6.1=py38h7b6447c_0
    - conda==4.8.3=py38_0
    - cryptography==2

---

In [3]:
%%bash
# Run this cell inside the protein_reconstruction conda environment rather than
# in the notebook kernel: activate the environment, then start `python` with no
# script argument so that it executes the remainder of the cell from stdin.
source activate protein_reconstruction

python
# imports
import tensorflow as tf
from tensorflow.python.client import device_lib 
# report the TensorFlow version and the devices TensorFlow can see
print(tf.__version__)
print(device_lib.list_local_devices())
import sys
# make the cloned repository importable (presumably where the cryoem package lives)
sys.path.append("protein-reconstruction")
from cryoem.projections import generate_2D_projections
from cryoem.plots import plot_projection, plot_detector_pixels, plot_angles_count
from cryoem.preprocessing import preprocessing, train_val_test_split
from cryoem.distance_estimation import train_distance_estimation
from cryoem.angle_alignment import training_angle_alignment, update_quaternion
from tensorflow.keras.optimizers import Adagrad, Ftrl

from cryoem.conversions import d_q, euler2quaternion
from cryoem.plots import plot_dP_dQ
import h5py
import ipyvolume as ipv
import numpy as np
# report the version of the interpreter that is executing this cell
print (sys.version)

# constants: where the data lives and how the projection set is parameterised
DATA_DIR = "/content/drive/My Drive/ModelsProtein"
PROTEIN = "5j0n"
PROJECTIONS_NUM = 5000
INPUT_FILE = PROTEIN + ".mrc"
ANGLE_COVERAGE = [2.0, 1.0, 0.5]
ANGLE_SHIFT = [0.0, 0.0, 0.0]
# NOTE(review): the AngShift part of the name is built from ANGLE_COVERAGE, not
# ANGLE_SHIFT. Kept as-is because the file is looked up elsewhere in this
# notebook under exactly this name - confirm before changing.
OUTPUT_FILE = (
    f"{PROTEIN}_ProjectionsAngles"
    f"_ProjNber{PROJECTIONS_NUM}"
    f"_AngCoverage{','.join(str(a) for a in ANGLE_COVERAGE)}"
    f"_AngShift{','.join(str(a) for a in ANGLE_COVERAGE)}"
    ".h5"
)

# generate synthetic data - 2D projections and their corresponding orientations from a given 3D protein
generate_2D_projections(
    input_file_path=f"{DATA_DIR}/{INPUT_FILE}",
    ProjNber=PROJECTIONS_NUM,
    AngCoverage=ANGLE_COVERAGE,
    AngShift=ANGLE_SHIFT,
    angles_gen_mode="uniform_angles",
    output_file_name=f"{DATA_DIR}/{OUTPUT_FILE}",
)

# re-open the file that was just written and pull out the two datasets;
# `projections` stays a handle into the open file, the angles are copied to an array
data = h5py.File(f"{DATA_DIR}/{OUTPUT_FILE}", 'r')
angles_true = np.array(data['Angles'])
projections = data['Projections']
print(f"{projections.shape[0]} projections of images with dimension {projections.shape[1:]} pixels")
print(f"{angles_true.shape[0]} sets of {angles_true.shape[1]} ground truth rotation angles (Z-Y-Z axes) of corresponding projection images")

# optional diagnostics, disabled:
#plot_angles_count(angles_true)
#plot_detector_pixels(angles_true)

# (2) PREPROCESSING
# both perturbations are switched off for this run
NOISY_VAR = 0
TRANSLATION = 0

# optional before/after check, disabled:
#plot_projection(projections[0], angles=angles_true[0])
projections = preprocessing(
    projections,
    noise_var=NOISY_VAR,
    left_limit=-TRANSLATION,
    peak_limit=0,
    right_limit=TRANSLATION,
    channels="gray",
)
#plot_projection(projections[0], angles=angles_true[0])

# (3) DISTANCE ESTIMATION
DE_LOGS_DIR = "pipeline_logs/distance_estimation"
DE_DESCRIPTION = "25-08-2020_de200epochs_noPerturbation"

# split the projection indices; the split is tied to an indices file on the drive
train_idx, val_idx, test_idx, train_pairs_num, val_pairs_num = train_val_test_split(
    PROJECTIONS_NUM,
    test_size=0.33,
    val_size=0.25,
    train_percent=0.01,
    val_percent=0.01,
    indices_file=f"{DATA_DIR}/train_val_test_indices.npz",
)

# train the distance model on the train/validation indices
model, history = train_distance_estimation(
    X=projections,
    y=angles_true,
    train_idx=train_idx,
    val_idx=val_idx,
    epochs=200,
    batch_size=256,  #512 latest
    learning_rate=1e-3,
    limit_style="random",
    path_logs_training=f"{DATA_DIR}/{DE_LOGS_DIR}",
    training_description=DE_DESCRIPTION,
    training_steps=train_pairs_num,
    validation_steps=val_pairs_num,
    plot=True,
    gpus=None,
    file_name=f"{DATA_DIR}/{DE_LOGS_DIR}/loss.png",
)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

2020-08-25 13:06:55.244532: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2020-08-25 13:06:57.587036: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2020-08-25 13:06:57.592979: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2000150000 Hz
2020-08-25 13:06:57.593315: I tensorflow/compiler/xla/service

In [4]:
%%bash
# Run this cell inside the protein_reconstruction conda environment rather than
# in the notebook kernel: activate the environment, then start `python` with no
# script argument so that it executes the remainder of the cell from stdin.
source activate protein_reconstruction

python
# imports
import tensorflow as tf
from tensorflow.python.client import device_lib 
# report the TensorFlow version and the devices TensorFlow can see
print(tf.__version__)
print(device_lib.list_local_devices())
import sys
# make the cloned repository importable (presumably where the cryoem package lives)
sys.path.append("protein-reconstruction")
from cryoem.projections import generate_2D_projections
from cryoem.plots import plot_projection, plot_detector_pixels, plot_angles_count
from cryoem.preprocessing import preprocessing, train_val_test_split
from cryoem.distance_estimation import train_distance_estimation
from cryoem.angle_alignment import training_angle_alignment, update_quaternion
from tensorflow.keras.optimizers import Adagrad, Ftrl
from cryoem.angle_recovery import train_angle_recovery
from cryoem.conversions import quaternion2euler
from cryoem.reconstruction import reconstruct
import copy 

from cryoem.conversions import d_q, euler2quaternion
from cryoem.plots import plot_dP_dQ
import h5py
import ipyvolume as ipv
import numpy as np
# report the version of the interpreter that is executing this cell
print (sys.version)

# (1) SYNTHETIC DATA
# constants: where the data lives and how the projection set is parameterised
DATA_DIR = "/content/drive/My Drive/ModelsProtein"
PROTEIN = "5j0n"
PROJECTIONS_NUM = 5000
INPUT_FILE = PROTEIN + ".mrc"
ANGLE_COVERAGE = [2.0, 1.0, 0.5]
ANGLE_SHIFT = [0.0, 0.0, 0.0]
# NOTE(review): the AngShift part of the name is built from ANGLE_COVERAGE, not
# ANGLE_SHIFT. Kept as-is because the file is written elsewhere in this
# notebook under exactly this name - confirm before changing.
OUTPUT_FILE = (
    f"{PROTEIN}_ProjectionsAngles"
    f"_ProjNber{PROJECTIONS_NUM}"
    f"_AngCoverage{','.join(str(a) for a in ANGLE_COVERAGE)}"
    f"_AngShift{','.join(str(a) for a in ANGLE_COVERAGE)}"
    ".h5"
)

# read stored data for further processing
data = h5py.File(f"{DATA_DIR}/{OUTPUT_FILE}", 'r')
# two handles on the same dataset: `projections` is rebound by preprocessing
# below, `projections_original` keeps pointing at the stored images
projections_original = data['Projections']
projections = data['Projections']
angles_true = np.array(data['Angles'])
print(f"{projections.shape[0]} projections of images with dimension {projections.shape[1:]} pixels")
print(f"{angles_true.shape[0]} sets of {angles_true.shape[1]} ground truth rotation angles (Z-Y-Z axes) of corresponding projection images")

# optional diagnostics, disabled:
#plot_angles_count(angles_true)
#plot_detector_pixels(angles_true)

# (2) PREPROCESSING
# both perturbations are switched off for this run
NOISY_VAR = 0
TRANSLATION = 0

# optional before/after check, disabled:
#plot_projection(projections[0], angles=angles_true[0])
projections = preprocessing(
    projections,
    noise_var=NOISY_VAR,
    left_limit=-TRANSLATION,
    peak_limit=0,
    right_limit=TRANSLATION,
    channels="gray",
)
#plot_projection(projections[0], angles=angles_true[0])

# identifiers of the distance-estimation run whose stored model is loaded below
DE_LOGS_DIR = "pipeline_logs/distance_estimation"
DE_DESCRIPTION = "25-08-2020_de200epochs_noPerturbation"

# (*3) DISTANCE ESTIMATION (read stored model)
# path of the model file stored for the distance-estimation run named above
model_filename = f"{DATA_DIR}/{DE_LOGS_DIR}/training/{DE_DESCRIPTION}.h5"

# restore the saved model from that file
model = tf.keras.models.load_model(model_filename)

# print the architecture of the restored model
model.summary()

# learned distance function
def d_p(p1, p2, batch_size=512):
    """Predict the distance between paired projections with the loaded model.

    Args:
        p1: first collection of projections; cast to float32 before prediction.
        p2: second collection of projections; cast to float32 before
            prediction (assumed to pair element-wise with p1 - TODO confirm).
        batch_size: batch size forwarded to ``model.predict``. Defaults to
            512, the value that used to be hard-coded here.

    Returns:
        The output of ``model.predict([p1, p2])``. Callers in this notebook
        take ``.T[0]`` of it, so it is presumably one value per pair.
    """
    p1 = tf.cast(p1, dtype=tf.float32)
    p2 = tf.cast(p2, dtype=tf.float32)
    return model.predict([p1, p2], batch_size=batch_size)

# dP/dQ ratio plot
# Draw two random sets of projection indices (with replacement) and compare the
# learned distance between the image pairs with the distance between their
# true orientations.
batch_size = 512
# sample from however many projections were actually loaded instead of a
# hard-coded 5000, so the cell keeps working when the dataset size changes
idx1 = list(np.random.choice(len(angles_true), size=batch_size))
idx2 = list(np.random.choice(len(angles_true), size=batch_size))

q1_true = euler2quaternion([angles_true[i] for i in idx1])
q2_true = euler2quaternion([angles_true[i] for i in idx2])

p1 = [projections[i] for i in idx1]
p2 = [projections[i] for i in idx2]

# .T[0] flattens the per-pair model output into a 1-D sequence of distances
dP_values = d_p(p1, p2).T[0]
dQ_values = d_q(q1_true, q2_true).numpy()

plot_dP_dQ(dP_values, dQ_values, f"{DATA_DIR}/{DE_LOGS_DIR}/dPdQ.png")

# (4) ANGLE RECOVERY
AR_LOGS_DIR = "pipeline_logs/angle_recovery"
AR_DESCRIPTION = "25-08-2020_ar15Ksteps_noPerturbation"

# recover orientations from the preprocessed projections, using the learned
# distance d_p as the distance function
q_predicted, losses, q_all = train_angle_recovery(
    steps=15000,
    batch_size=256,
    in_data=projections,
    distance_fn=d_p,
    learning_rate=0.5,
    limit_distance=np.pi,
    file_name=f"{DATA_DIR}/{AR_LOGS_DIR}/{AR_DESCRIPTION}",
)

# Euler-angle form of the recovered quaternions
angles_predicted = quaternion2euler(q_predicted)

# (5) ANGLE ALIGNMENT
# align the recovered angles with the ground-truth angles
m, a_R, losses, collect_data, trajectory = training_angle_alignment(
    num_runs=3,
    steps=300,
    batch_size=256,
    optimizer=Ftrl(learning_rate=2., learning_rate_power=-2.),
    angles_true=angles_true,
    angles_predicted=angles_predicted,
)

# first and last entries of the returned trajectory and losses
trajectory_first = trajectory[0]
loss_first = losses[0]
trajectory_last = trajectory[-1]
loss_last = losses[-1]

# This code is fed to `python` on stdin, so a bare expression is not echoed
# the way a notebook cell's last value is; print the summary explicitly.
print((m, trajectory_first, loss_first, trajectory_last, loss_last))

# apply the alignment result (m, a_R) to the recovered quaternions
q_predicted_rotated = update_quaternion(m, a_R, q_predicted)
angles_predicted_rotated = quaternion2euler(q_predicted_rotated)

# (6) RECONSTRUCTION
REC_LOGS_DIR = "pipeline_logs/reconstruction"
REC_DESCRIPTION = "25-08-2020_de200e_ar15Ks_noPerturbation.mrc"

# reconstruct from the stored (un-preprocessed) projections and the recovered angles
# NOTE(review): this passes angles_predicted, not angles_predicted_rotated -
# confirm that reconstructing from the unaligned angles is intended.
reconstruct(
    projections_original,
    angles_predicted,
    mrc_filename=f"{DATA_DIR}/{REC_LOGS_DIR}/{REC_DESCRIPTION}",
)

2.3.0
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1106315575827237534
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 15774727724154613223
physical_device_desc: "device: XLA_CPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14613293312
locality {
  bus_id: 1
  links {
  }
}
incarnation: 7423919814862744703
physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 14629410513815937485
physical_device_desc: "device: XLA_GPU device"
]
3.6.9 |Anaconda, Inc.| (default, Jul 30 2019, 19:07:31) 
[GCC 7.3.0]
5000 projections of images with dimension (116, 116) pixels
5000 sets of 3 ground truth rotation angles (Z-Y-Z axes) of corresponding projection images
--- Preprocessing projections ---
Image rescaled: from dimension 116 to 128

2020-08-25 20:22:28.753593: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2020-08-25 20:22:30.524821: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2020-08-25 20:22:30.530957: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2000150000 Hz
2020-08-25 20:22:30.531206: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56153f02cbc0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-08-25 20:22:30.531234: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2020-08-25 20:22:30.532382: I tensorflow/stream_executor/platform/d