# Presynapse Protein Disorder Maps

Goal: Repeat the postsynapse analysis with the presynapse data. If the presynapse does not show specific regions of enriched/depleted parameters, then that regional organization is specific to the postsynapse. We use the presynapse because we have the data, and no other available dataset contains what we need.

In [5]:
import pandas as pd
import numpy as np
import glob
from pathlib import Path
import copy
import pickle
from scipy.io import loadmat, savemat
import matplotlib.pyplot as plt
%matplotlib inline

## Data preparation

The presynapse data is not available as lego models, as it was for the postsynapse. Instead, I have a txt file for each protein with the xyz coordinates (in Ångström) of each copy. I need to bin the coordinates in 3D and then sum them up to create similar lego models. Silvio suggested 250 Ångström bins (the code below uses 250 Å in x/y and 1000 Å in z).

In [6]:
# Folder with one text file of copy coordinates per protein. Forward slashes
# are used so the relative path resolves on Windows as well as on POSIX systems;
# the value stays a string with a trailing separator.
file_path = './coordinates_average_presynapse/'
# Bare file names of all coordinate files, sorted so that the protein order
# (and therefore the order of every derived dictionary) is reproducible.
# Path.glob replaces glob.glob1, an undocumented helper deprecated in Python 3.13.
files = sorted(entry.name for entry in Path(file_path).glob('*.txt'))
# Map protein name (file name without its extension) -> coordinate array.
# ndmin=2 keeps a file with a single row two-dimensional, so it can still be
# concatenated with, and binned like, the multi-row files.
data = {Path(file).stem: np.loadtxt(file_path + file, ndmin=2) for file in files}

Since I do not know the full extent of the model, I first need to combine all proteins into one complete dataset to determine the bin edges. If I binned each protein individually, the bin edges would differ between proteins.

In [7]:
def determine_bins(data, binsize_xy, binsize_z):
    """Return common x, y and z bin edges that enclose every coordinate in *data*.

    Parameters
    ----------
    data : dict
        Maps a name to an array of coordinates with one row per point. The
        first three columns are read as x, y and z.
    binsize_xy : number
        Bin width along x and y, in the same units as the coordinates.
    binsize_z : number
        Bin width along z, in the same units as the coordinates.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_bins, y_bins, z_bins)``: float arrays of bin edges. Every edge is
        an integer multiple of the bin width, the first edge lies at or below
        the smallest coordinate and the last edge at or above the largest.
    """
    # Pool all points; cast to float so integer coordinate arrays are accepted
    # (in-place true division on an integer array raises in NumPy).
    total_data = np.concatenate(
        [np.atleast_2d(coords) for coords in data.values()], axis=0
    ).astype(float)[:, :3]
    bin_sizes = np.array([binsize_xy, binsize_xy, binsize_z], dtype=float)

    # Express the extent in whole bins: round the minimum down and the maximum up.
    first_edge = np.floor(total_data.min(axis=0) / bin_sizes)
    last_edge = np.ceil(total_data.max(axis=0) / bin_sizes)
    # Guarantee at least one bin per axis; otherwise an axis whose values all
    # sit exactly on one edge would yield a single edge and no bin at all.
    last_edge = np.maximum(last_edge, first_edge + 1)

    # Count edges in integer steps and scale afterwards. np.arange excludes its
    # stop value, hence the + 1 to include the last edge.
    x_bins, y_bins, z_bins = (
        np.arange(low, high + 1) * size
        for low, high, size in zip(first_edge, last_edge, bin_sizes)
    )
    return x_bins, y_bins, z_bins

For each protein, bin the coordinates accordingly and store the resulting map in the dictionary copy_maps.

In [8]:
# One shared set of bin edges for all proteins: 250 wide in x/y, 1000 wide in z.
x_bins, y_bins, z_bins = determine_bins(data, binsize_xy = 250, binsize_z = 1000)
bin_edges = [x_bins, y_bins, z_bins]

# Count the copies of every protein per 3D bin.
copy_maps = {}
for protein_name, coordinates in data.items():
    # histogramdd returns (counts, edges); only the counts are kept.
    copy_maps[protein_name] = np.histogramdd(coordinates, bin_edges)[0]

savemat('Presynapse_LegoModels.mat', copy_maps)

Read in the data tables

In [9]:
# The dictionary lists all presynapse proteins with their various identifiers.
# It is the left side of every merge below, so it selects which rows end up in df.
protein_list = pd.read_excel('presynapse protein dictionary.xlsx')

data_folder = Path('../')

# Table from Hanna Wildhagen with disorder scores and structure composition.
wildhagen_table = pd.read_excel(data_folder / "table_structure_elements.xlsx")
wildhagen_table = wildhagen_table.drop(
    columns = [
        'Gene_Names',
        'Protein_IDs',
        'Protein_Names',
        'Fasta_Headers',
        'Isoelectric point EMBOSS',
        'Isoelectric point DATASELECT',
    ]
)
df = protein_list.merge(
    wildhagen_table,
    how = 'left',
    left_on = 'Mouse_Maj_Protein_Ids',
    right_on = 'Maj_Protein_IDs',
).drop(columns = 'Maj_Protein_IDs')

# Table from Ciryam et al.
ciryam_table = pd.read_excel(data_folder / "Ciryam_supp_table.xlsx")
# Row 1 of the sheet carries the column names; rows 0 and 1 are then removed.
ciryam_table = (
    ciryam_table.rename(columns = ciryam_table.iloc[1])
    [['Uniprot ID', 'Zagg', 'ZaggSC']]
    .drop([0, 1])
    .reset_index(drop = True)
    .dropna()
)
# Keep only the human proteins; the C. elegans entries are not needed.
is_human = ciryam_table['Uniprot ID'].str.contains('human')
ciryam_table = ciryam_table[is_human]
df = df.merge(
    ciryam_table,
    how = 'left',
    left_on = 'Human_entry_name',
    right_on = 'Uniprot ID',
).drop(columns = 'Uniprot ID')

# Isoelectric point expressed as the difference to neutral (7), which makes
# the later interpretation of the data easier.
df = df.assign(IsoelectricPointAverageDifferenceToNeutral = df['IsoelectricPointAverage'] - 7)

# Keep the analysed columns in a fixed order and index the table by file name.
df = df[
    [
        'File_name',
        'Length',
        'Mass',
        'DisorderLong',
        'DisorderShort',
        'IsoelectricPointAverage',
        'IsoelectricPointAverageDifferenceToNeutral',
        'Coil',
        'ExtendedBetaSheet',
        'AlphaHelix',
        'StructuredRatio',
        'Zagg',
        'ZaggSC',
    ]
].set_index('File_name')

## Set up Analysis

In contrast to the postsynapse, I have all the lego models in the copy_maps dictionary this time. Again use the add_score function to deal with situations where no score is present in the dataframe.

In [10]:
def add_score(score_map, copy_map, score):
    """Add ``copy_map * score`` to *score_map* if *score* is a usable number.

    Parameters
    ----------
    score_map : numpy.ndarray
        Accumulator array; it is modified in place.
    copy_map : numpy.ndarray
        Array that is weighted by *score* and added to *score_map*.
    score : object
        The weight. Python and NumPy integers and floats are accepted; NaN and
        any other type (e.g. ``None`` or a string) leave *score_map* untouched.

    Returns
    -------
    numpy.ndarray
        The same *score_map* object that was passed in.
    """
    # NumPy integer scalars (e.g. np.int64) are not subclasses of int, so they
    # are listed explicitly; otherwise integer scores would be skipped silently.
    is_number = isinstance(score, (int, float, np.integer, np.floating))
    if is_number and not np.isnan(score):
        score_map += copy_map * score
    return score_map

In [11]:
# Scores for which a copy-number-weighted 3D map is accumulated.
score_names = [
    'Length',
    'Mass',
    'DisorderLong',
    'DisorderShort',
    'IsoelectricPointAverage',
    'IsoelectricPointAverageDifferenceToNeutral',
    'Coil',
    'ExtendedBetaSheet',
    'AlphaHelix',
    'StructuredRatio',
    'Zagg',
    'ZaggSC',
]

# Initialize the result maps; the 'actin' copy map supplies the array shape.
map_shape = copy_maps['actin'].shape
score_maps = {score_name: np.zeros(map_shape) for score_name in score_names}
copy_map_total = np.zeros(map_shape)

# Accumulate the total copy number and, per score, the copy map weighted by
# that protein's score. add_score leaves a map unchanged when the score is missing.
for protein, copy_map in copy_maps.items():
    copy_map_total += copy_map
    scores = df.loc[protein, :]
    for score_name in score_names:
        score_maps[score_name] = add_score(score_maps[score_name], copy_map, scores[score_name])

# Keep an untouched copy of the raw sums for the normalizations further down.
score_maps_orig = copy.deepcopy(score_maps)

# Add the map with total copy numbers to the results for the simple analysis and save the data.
score_maps['CopyMap'] = copy_map_total
with open('presynapse_scores.pkl', 'wb') as outfile:
    pickle.dump(score_maps, outfile)
savemat('presynapse_scores.mat', score_maps)

# Disabled alternative, kept for reference:
# #Normalize data for protein concentration per voxel, to see whether the increase in aggregation prone proteins is independent of protein concentration.
# score_maps = copy.deepcopy(score_maps_orig)
# length_map = score_maps['Length']
# length_map[length_map == 0] = 1  #to prevent division by 0
# score_maps.update((score, score_map/length_map) for score, score_map in score_maps.items())
# with open('presynapse_scores_normalized_aa.pkl', 'wb') as outfile:
#     pickle.dump(score_maps, outfile)
# savemat('presynapse_scores_normalized_aa.mat', score_maps)

# Normalize the data for protein copy number per voxel.
score_maps = copy.deepcopy(score_maps_orig)
# Empty voxels get a divisor of 1 to prevent division by 0.
# Note that this changes copy_map_total in place.
copy_map_total[copy_map_total == 0] = 1
for score_name in list(score_maps):
    score_maps[score_name] = score_maps[score_name] / copy_map_total
with open('presynapse_scores_normalized_copynum.pkl', 'wb') as outfile:
    pickle.dump(score_maps, outfile)
savemat('presynapse_scores_normalized_copynum.mat', score_maps)

## Make heatmap plots

In [16]:
from matplotlib.backends.backend_pdf import PdfPages
from mpl_toolkits.axes_grid1 import ImageGrid
# Switch interactive mode off while the figures are only written to PDF.
plt.ioff()

# Every pickle file in the working directory is turned into one PDF with the
# same base name, one page per score map.
# NOTE(review): pickle.load executes code embedded in the file; only run this
# in a directory whose .pkl files were written by this notebook.
files = glob.glob('*.pkl')
for file in files:
    with open(file, 'rb') as infile:
        score_maps = pickle.load(infile)

    with PdfPages(str(Path(file).with_suffix('.pdf'))) as pdf:
        for score, score_map in score_maps.items():
            fig = plt.figure()
            try:
                fig.suptitle(score)
                grid = ImageGrid(fig, 111, nrows_ncols = (4,3), share_all = True, cbar_location = 'right', cbar_mode = 'single')

                # Common colour limits so that all slices are scaled the same.
                vmin = np.min(score_map)
                vmax = np.max(score_map)

                # One panel per slice along the third axis. zip stops at the
                # last slice or the last panel, whichever comes first, so no
                # exception is needed as a loop bound and real plotting errors
                # are no longer swallowed.
                im = None
                for i, ax in zip(range(score_map.shape[2]), grid):
                    im = ax.imshow(score_map[:,:,i], cmap = 'inferno', vmin = vmin, vmax = vmax)

                # Draw the shared colorbar only if at least one slice was plotted.
                if im is not None:
                    ax.cax.colorbar(im)
                    ax.cax.toggle_label(True)
                pdf.savefig(fig)
            finally:
                # Close the figure even when plotting fails, so figures do not pile up.
                plt.close(fig)

plt.ion()



## Make Lego Model plots

This makes it easier to investigate later which proteins might cause regional score enrichments.

In [26]:
# Switch interactive mode off while the figures are only written to PDF.
plt.ioff()
# One PDF page per protein, showing its copy map slice by slice.
with PdfPages('Presynapse_LegoModels.pdf') as pdf:
    for protein, copy_map in copy_maps.items():
        fig = plt.figure()
        try:
            fig.suptitle(protein)
            grid = ImageGrid(fig, 111, nrows_ncols = (4,3), share_all = True, cbar_location = 'right', cbar_mode = 'single')

            # Common colour limits so that all slices are scaled the same.
            vmin = np.min(copy_map)
            vmax = np.max(copy_map)

            # One panel per slice along the third axis. zip stops at the last
            # slice or the last panel, whichever comes first, so no exception
            # is needed as a loop bound and real plotting errors are no longer
            # swallowed.
            im = None
            for i, ax in zip(range(copy_map.shape[2]), grid):
                im = ax.imshow(copy_map[:,:,i], cmap = 'inferno', vmin = vmin, vmax = vmax)

            # Draw the shared colorbar only if at least one slice was plotted.
            if im is not None:
                ax.cax.colorbar(im)
                ax.cax.toggle_label(True)
            pdf.savefig(fig)
        finally:
            # Close the figure even when plotting fails, so figures do not pile up.
            plt.close(fig)

plt.ion()