In [2]:
import pandas as pd 
import numpy as np 
import sys 
import os 
import importlib
import copy
import plotly.graph_objects as go 

In [3]:
import rmsd_functions as rmsd 
importlib.reload(rmsd)
# Following line assures plotly to plot inline of jupyter notebook
import plotly.offline as pyo
pyo.init_notebook_mode()

Goal of this notebook is to take already aligned pdb and turn them into a voxel cube vector. This only takes into account binary of atom presence of absence, not differentiating between atom C,N,O... etc 

In [4]:
tmaligned_df = pd.read_pickle('../AF_files/dict_tmaligned.pkl')

In [5]:
# Visualize aligned ORs 
rmsd.dict_plot_all(dict(list(tmaligned_df.items())[0:5]), mode="markers")

In [6]:
def create_voxel(coords, size, resolution, spacer = [0,0,0]):
    # Initialize the voxel
    voxel = np.zeros(size, dtype=float)
    # Compute the indices of the coordinates in the voxel
    indices = np.floor((coords - spacer) / resolution).astype(int)
    # Set the values of the voxel
    voxel[indices[:,0], indices[:,1], indices[:,2]] = 1
    return voxel

In [7]:
resolution = 0.1

# Find the maximum extent of all the proteins
max_extent = np.max([np.max(tmaligned_df[Olfr]['coord'], axis=0) - \
                     np.min(tmaligned_df[Olfr]['coord'], axis=0) \
                     for Olfr in tmaligned_df], axis=0)
min_spacer = np.min([np.min(tmaligned_df[Olfr]['coord'], axis=0) \
                     for Olfr in tmaligned_df], axis=0)

size = np.ceil((max_extent - min_spacer )/ resolution).astype(int)


voxel_list = []
Olfr_order = []
# Loop through each protein's coordinates and create a voxel
for Olfr, info in tmaligned_df.items():
    voxel = create_voxel(info['coord'], size, resolution, spacer = min_spacer)
    # Save the voxel to a file or do other processing as needed
    Olfr_order.append(Olfr)
    voxel_list.append(voxel)

In [96]:
# Visualize the first voxel in 3D space. 

fig = go.Figure()
for i in [voxel_list[i] for i in [0,5,100,1000]]:
    x, y, z = i.nonzero()
    fig.add_trace(go.Scatter3d(x = x, 
                               y = y, 
                               z = z, 
                              mode = 'markers'))
fig.update_traces( marker=dict(size=3, opacity = 0.4))
    # update_layout setting the axis visibility and background to False 
fig.show()

In [None]:
# Save dictionary files as pickles 
import pickle
f=open('../AF_files/linear-voxel_tmaligned.pkl','wb')
pickle.dump(voxel_list,f)
f.close
# voxel_list = pd.read_pickle('../AF_files/voxel_tmaligned.pkl')

Attempt to visualize voxels in a reduced dimensional space via PCA 

In [24]:
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA


In [17]:
# Flattens the list via ravel 
flat_voxel_list = []
for v in voxel_list: 
    flat_voxel_list.append( np.ravel(v))

In [28]:
from sklearn.datasets import fetch_openml


In [None]:
# Load the MNIST dataset
mnist = fetch_openml('mnist_784')

# Convert the data to float64
data = mnist.data.astype('float64')

In [29]:
mnist = fetch_openml('mnist_784')






In [31]:
data = mnist.data.astype('float64')


In [None]:
pd.DataFrame(flat_voxel_list)

In [None]:
type(data)

In [25]:
# Create an incremental PCA object
n_components = 3  # Number of principal components to keep
batch_size = 100  # Number of samples to process at a time
ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)

# Fit the incremental PCA model to the data
for i in range(0, data.shape[0], batch_size):
    ipca.partial_fit(data[i:i+batch_size])

# Transform the flattened voxel data to the reduced-dimensional space
reduced_voxel = ipca.transform(flat_voxel_list)

MemoryError: Unable to allocate 51.6 TiB for an array with shape (1170, 6065477154) and data type float64

In [None]:

# Visualize the reduced data points
plt.scatter(reduced_voxel[:, 0], reduced_voxel[:, 1], c=Olfr, cmap='tab10')
plt.colorbar()
plt.show()