# This notebook analyses the fibril site alignments with a focus on ex vivo aSyn fibrils
- Find shared features among ex vivo aSyn fibrils
- Find shared features between ex vivo fibrils of aSyn and other amyloid proteins
- Find shared features between ex vivo and in vitro aSyn fibrils
- Align site matches

**Open3d output explained**

- *icp_fitness*, which measures the overlapping area (# of correspondences / # of points in source pocket). The higher the better.
        "how much of the source pocket is matched to the target" --> Fitness(source)
        
- *inlier_rmse*, which measures the RMSE of all inlier correspondences. The lower the better.

**target_matched_percent explained**
- *Fitness(Target)* =  # of correspondences / # of points in target pocket

**combined_fitness_score (SSmax) explained**
- Since the two matched pockets can differ in size, the fitness score (aka query coverage) is calculated twice: once relative to the source pocket's size and once relative to the target pocket's size. The maximum of the two is taken as the highest coverage obtained between the 2 pockets and is used for the downstream analysis.

Note: The fibril sites are referred to as grooves or pockets throughout the code

## 1.0 Libraries

In [1]:
from fibrilsite.site_alignment import ply_parser_hull
from fibrilsite.site_alignment_analysis import *

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


## 2.0 Functions

In [2]:
def fibril_name_str(x):
    """Return the protein label for a site name.

    The text before the first underscore of ``x`` is used as the fibril tag.
    Tags ``PHF`` and ``CTEII`` are reported as ``'Tau'``; the tags ``A53T``,
    ``Pol``, ``LF``, ``G51D``, ``MSA``, ``E46K``, ``H50Q`` and ``pY39`` are
    reported as ``'aSyn'``; any other tag is returned unchanged.
    """
    tag = x.split('_')[0]

    if tag in ('PHF', 'CTEII'):
        return 'Tau'
    if tag in ('A53T', 'Pol', 'LF', 'G51D', 'MSA', 'E46K', 'H50Q', 'pY39'):
        return 'aSyn'
    # unknown tags are passed through as their own label
    return tag

In [3]:
def add_size_metrics(df:pd.DataFrame):
    """Add size-derived columns to ``df`` (modified in place) and return it.

    Columns written, both rounded to 2 decimals:
      - ``src2target_size_ratio``      : ``size_source / size_target``
      - ``target_matched_percent_icp`` : ``icp_nb_corres / size_target``

    ``df`` must already hold ``size_source``, ``size_target`` and ``icp_nb_corres``.
    """
    target_size = df['size_target']

    df['src2target_size_ratio'] = df['size_source'].div(target_size).round(2)
    df['target_matched_percent_icp'] = df['icp_nb_corres'].div(target_size).round(2)

    return df

In [4]:
def add_pocket_matches(df:pd.DataFrame) -> pd.DataFrame:
    """Insert an order-independent pair key as the first column of ``df``.

    For every entry of ``df.pocket_pairs`` the elements at positions 0 and 2
    are sorted and stored as a 2-tuple in a new ``pocket_matches`` column, so
    that (a, b) and (b, a) produce the same key. ``df`` is modified in place
    and returned.

    Raises ``AssertionError`` ("Duplicated Matches Found") when two rows share
    the same key; otherwise prints "Unique Matches".
    """
    pair_keys = [tuple(sorted((pair[0], pair[2]))) for pair in df.pocket_pairs]

    # the key column goes to the front of the frame
    df.insert(0, 'pocket_matches', pair_keys)

    # every pair may appear only once
    stored_keys = df.pocket_matches.to_list()
    assert len(stored_keys) == len(set(stored_keys)), "Duplicated Matches Found"
    print("Unique Matches")

    return df

## 3.0 I/O

### 3.1 Output dirs

In [5]:
# Build a fresh, date-stamped output tree for this run.
# Every folder is created with exist_ok=False, so os.makedirs raises
# FileExistsError if the folder is already there (e.g. a second run on the same date).

# main output folder
main_output = os.path.join(os.path.abspath('.'), f"{datetime.date.today()}_site_alignment_analysis")
os.makedirs(main_output, exist_ok=False)

## ------------------------------------------------------- ##

# one sub-folder per comparison
allvsall_out           = os.path.join(main_output, "all_vs_all")                      # all vs all comparison
asyn2asyn_out          = os.path.join(main_output, "aSyn_to_aSyn")                    # ex vivo aSyn to aSyn fibril sites
asyn2others_out        = os.path.join(main_output, "aSyn_to_other_amyloids")          # ex vivo aSyn to other amyloid fibril sites
asyn2invitro_out       = os.path.join(main_output, "aSyn_to_invitro_aSyn")            # ex vivo to in vitro aSyn fibril sites
asyninvitro2others_out = os.path.join(main_output, "invitro_aSyn_to_other_amyloids")  # in vitro aSyn to other amyloid fibril sites
aligned_sites_output   = os.path.join(main_output, "identified_matches_alignments")   # aligned sites

## ------------------------------------------------------- ##

# create the sub-folders in the order they are listed above
for out_dir in (
    allvsall_out,
    asyn2asyn_out,
    asyn2others_out,
    asyn2invitro_out,
    asyninvitro2others_out,
    aligned_sites_output,
):
    os.makedirs(out_dir, exist_ok=False)

## ------------------------------------------------------- ##

### 3.2 Folders paths

In [None]:
# Absolute path of the folder that holds the defined fibril sites
# (resolved relative to the current working directory)
site_src = os.path.abspath(os.path.join("..", "sel_fibril_sites"))

In [None]:
# Folder with the site registration outputs (alignment npy files)
reg_results_src = os.path.abspath("./2025-04-08_registration_outputs")

# collect every npy file written under <reg_results_src>/o3d_objects_npy
npy_pattern = os.path.join(reg_results_src, "o3d_objects_npy", "*.npy")
reg_results_paths = [npy_path.strip() for npy_path in glob.iglob(npy_pattern)]

# quick sanity check: number of files and the first one found
print(len(reg_results_paths))
print(reg_results_paths[0])

1830
/work/lpdi/users/asadek/bin/fibril_pocket_analysis/to_publish/run_alignment/2025-04-08_registration_outputs/o3d_objects_npy/AB42_P76_E46K_P53_input-feats.npy


In [None]:
# Folder with the fibril pdb files
fibrils_pdb_src = os.path.abspath("../pdbs")

# collect the path of every pdb file directly inside that folder
pdb_pattern = os.path.join(fibrils_pdb_src, "*.pdb")
fibrils_paths = [pdb_path.strip() for pdb_path in glob.iglob(pdb_pattern)]

print(len(fibrils_paths))
fibrils_paths[:2]

18


['/work/lpdi/users/asadek/bin/fibril_pocket_analysis/to_publish/pdbs/6l1u_ABCDEFGHIJKLMNO.pdb',
 '/work/lpdi/users/asadek/bin/fibril_pocket_analysis/to_publish/pdbs/6cu7_ABCDEFGHIJ.pdb']

In [None]:
# parse the paths for the defined fibril sites ply files
# Only files whose *file name* contains "convex" are kept. The check is done on
# the base name rather than on the whole path so that a parent directory that
# happens to contain "convex" cannot make every .ply file match.
ply_files = [
    p for p in glob.iglob(os.path.join(site_src, "*", "*.ply"))
    if "convex" in os.path.basename(p)
]

print(len(ply_files))
ply_files[:2]

61


['/work/lpdi/users/asadek/bin/fibril_pocket_analysis/to_publish/sel_fibril_sites/2025-04-08_6xyq_MSA_IIb_P70/MSA_IIb_P70_convex_hull.ply',
 '/work/lpdi/users/asadek/bin/fibril_pocket_analysis/to_publish/sel_fibril_sites/2025-04-08_6cu7_Pol_1a_P1/Pol_1a_P1_convex_hull.ply']

In [10]:
# get the paths to the defined site (pocket) files
## the refined site version is used when available, otherwise the isolated site version is loaded

# container
src_pockets_paths = []

for folder00 in glob.iglob(os.path.join(site_src, '*')):
    # list the folder once; the keyword is looked up in the file name only,
    # so directory names along the path cannot influence the choice
    xyz_files = list(glob.iglob(os.path.join(folder00, '*.xyz')))
    refined_xyz = [p for p in xyz_files if 'refined' in os.path.basename(p)]
    isolated_xyz = [p for p in xyz_files if 'isolated' in os.path.basename(p)]

    if refined_xyz:
        src_pockets_paths.append(refined_xyz[0])
    elif isolated_xyz:
        src_pockets_paths.append(isolated_xyz[0])
    else:
        # fail with the offending folder instead of a bare IndexError
        raise FileNotFoundError(f"No refined or isolated .xyz site file found in {folder00}")

print(len(src_pockets_paths))
src_pockets_paths[:2]

61


['/work/lpdi/users/asadek/bin/fibril_pocket_analysis/to_publish/sel_fibril_sites/2025-04-08_6xyq_MSA_IIb_P70/2025-04-08_refined_pocket.xyz',
 '/work/lpdi/users/asadek/bin/fibril_pocket_analysis/to_publish/sel_fibril_sites/2025-04-08_6cu7_Pol_1a_P1/2025-04-08_refined_pocket.xyz']

In [None]:
# Map fibril sites to respective fibril pdbs
# Each site folder name is split on "_": the second token is used as the
# fibril key and the last token as the site id.
fibril_pocket_map = {}

for path01 in glob.iglob(os.path.join(site_src, '*')):
    p_name = os.path.basename(path01).split("_")
    fibril_key, site_id = p_name[1], p_name[-1]
    # start a new list the first time a fibril key is seen, then add the site
    fibril_pocket_map.setdefault(fibril_key, []).append(site_id)

# fibril keys in the order they were first encountered
done = list(fibril_pocket_map)

# one entry per provided pdb file is expected
assert len(fibril_pocket_map) == len(fibrils_paths), "Parsed fibrils and provided fibril pdb paths count mismatch"

fibril_pocket_map

{'6xyq': ['P70', 'P65', 'P63', 'P66', 'P64'],
 '6cu7': ['P1', 'P4', 'P3', 'P2'],
 '7UMQ': ['P81', 'P82', 'P80', 'P79'],
 '6ufr': ['P54', 'P55', 'P52', 'P53'],
 '6xyp': ['P62', 'P69', 'P60', 'P59', 'P61'],
 '6ssx': ['P22', 'P24', 'P21', 'P26', 'P23', 'P25'],
 '6xyo': ['P56', 'P58', 'P57'],
 '6l1u': ['P40', 'P43', 'P42', 'P41'],
 '7QVC': ['P71', 'P72'],
 '6NWQ': ['P74', 'P73'],
 '7NRV': ['P78', 'P77'],
 '7e0f': ['P46', 'P44', 'P45'],
 '8A9L': ['P67', 'P68'],
 '6pes': ['P83', 'P51', 'P50'],
 '7Q4M': ['P75', 'P76'],
 '6cu8': ['P19', 'P20'],
 '6sst': ['P32', 'P36', 'P34', 'P33', 'P35'],
 '6lrq': ['P49', 'P48', 'P47']}

## 4.0 Execution

### 4.1 Load all the information

In [None]:
# load all fibril site points with their features -- MODIFY PATH
all_sites_csv = os.path.abspath('../run_alignment/2025-04-08_sites_parsed_info/2025-04-08_all_sites_input_feats.csv')

# the first csv column is used as the index
df_all_info = pd.read_csv(all_sites_csv, index_col=0)

print(df_all_info.shape)
df_all_info.head(2)

(13667, 21)


Unnamed: 0,fibril,pocket_id,isolation,MaSIF_index,atom_type,chain,coords,point_direction,resid,resname,...,surf_charge,surf_coords,surf_hbond,surf_hphob,surf_norm_fibril_dot,surf_normals,input_si,input_charge,input_hphob,input_hbonds
0,6NWQ,CTEII_P73,refined,9478,C,A,[153.771 151.437 160.011],,324.0,SER,...,0.08785,[151.993 151.207 159.101],-0.203964,-0.8,0.100411,[-0.814522 -0.507549 -0.280977],0.113715,0.195311,-0.155556,0.030528
1,6NWQ,CTEII_P73,refined,9837,CB,A,[151.985 153.109 160.591],,324.0,SER,...,0.419086,[150.786 151.785 160.011],-0.274085,-0.8,-0.116151,[-0.784149 -0.617719 -0.0594427],-0.513678,0.249782,0.237698,0.0


In [None]:
# define the loaded site names
# The unique names are sorted: iterating a plain set of strings gives an order
# that can change between kernel sessions, which would make the row order of
# everything derived from `pockets` non-reproducible.
pockets = sorted(set(df_all_info.pocket_id))

# every site with parsed features must have exactly one xyz file
assert len(pockets) == len(src_pockets_paths), "Parsed sites xyz files and site info mismatch"

In [None]:
### load all fibril site alignment results -- MODIFY PATH
# This cell builds `df_input_all`: one row per registered site pair, carrying the
# ICP metrics, the size-based fitness scores, the input feature difference and an
# order-independent pair key. The finished table is exported to `main_output`.
df_input_all = pd.read_csv(os.path.abspath('./2025-04-08_registration_outputs/2025-04-08_all_sites_alignment_results.csv'), index_col=0)

# add fibril source for source and target pockets
# (protein label derived from the site name prefix by fibril_name_str)
df_input_all['source_pocket_fibril'] = df_input_all['source_pocket'].apply(lambda x: fibril_name_str(x))
df_input_all['target_pocket_fibril'] = df_input_all['target_pocket'].apply(lambda x: fibril_name_str(x))

# add the alternate metrics from the target pocket side
# (adds src2target_size_ratio and target_matched_percent_icp, both rounded to 2 decimals)
df_input_all = add_size_metrics(df=df_input_all)

# rename the fitness score columns
# so that the source-side and target-side fitness are named symmetrically
df_input_all.rename(columns={"icp_fitness":"icp_fitness_source", "target_matched_percent_icp":"icp_fitness_target"}, inplace=True)


# add the combined fitness score
# row by row: the larger of the source-side and target-side fitness, rounded to 2 decimals
comb_fit_sc_vessel = []
for idx00 in df_input_all.index:
    comb_fit_sc_vessel.append(round(max(df_input_all.at[idx00, "icp_fitness_source"], df_input_all.at[idx00, "icp_fitness_target"]),2))

# appended as the last column
df_input_all.insert(df_input_all.shape[1], "combined_fitness_score", comb_fit_sc_vessel)

# get rid of the ransac columns
# (only the columns listed here are kept; everything else in the csv is dropped)
df_input_all = df_input_all[[
    'source_pocket', 'target_pocket',
    'icp_rmse', 'icp_fitness_source', 'icp_fitness_target', 'combined_fitness_score',
    'icp_nb_corres', 'size_source', 'size_target', 'src2target_size_ratio',
    'source_pocket_fibril', 'target_pocket_fibril']]

# get the pocket pairs
# each entry is the list [source site, source protein, target site, target protein];
# the table is then sorted by that list and re-indexed from 0
df_input_all.insert(0, 'pocket_pairs', [[s,sf,t,tf] for s,sf,t,tf in zip(df_input_all.source_pocket, df_input_all.source_pocket_fibril, df_input_all.target_pocket, df_input_all.target_pocket_fibril)])
df_input_all.sort_values(by='pocket_pairs', inplace=True)
df_input_all.reset_index(drop=True, inplace=True)

# calculate the input feat diff
# project helper; the code below relies on it returning the frame with an
# `icp_mean_input_diff` column -- NOTE(review): its exact computation is not visible here
df_input_all = calc_input_feat_diff(df=df_input_all, df_all_info=df_all_info, input_feats_npy=reg_results_paths, output=main_output, export=False)

# add the pocket matches
# (sorted (site, site) tuple inserted as first column; asserts that no pair occurs twice)
df_input_all = add_pocket_matches(df=df_input_all)

# get the approximations
# both scores are rounded to 2 decimals; combined_fitness_score was already rounded
# when it was built above, so its line only matters if the helper altered the column
df_input_all["combined_fitness_score"] = df_input_all["combined_fitness_score"].apply(lambda x: round(x, 2))
df_input_all["icp_mean_input_diff"] = df_input_all["icp_mean_input_diff"].apply(lambda x: round(x, 2))

# export 
# written as <date>_all_input_reg_pockets.csv inside the main output folder
df_input_all.to_csv(os.path.join(main_output, str(datetime.date.today())+'_all_input_reg_pockets.csv'))

print(df_input_all.shape)
df_input_all.head(2)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unique Matches
(1830, 15)


Unnamed: 0,pocket_matches,pocket_pairs,source_pocket,target_pocket,icp_rmse,icp_fitness_source,icp_fitness_target,combined_fitness_score,icp_nb_corres,size_source,icp_mean_input_diff,size_target,src2target_size_ratio,source_pocket_fibril,target_pocket_fibril
0,"(A53T_P47, A53T_P48)","[A53T_P48, aSyn, A53T_P47, aSyn]",A53T_P48,A53T_P47,0.692606,0.401709,0.3,0.4,47,117,1.01,158,0.74,aSyn,aSyn
1,"(A53T_P48, CTEII_P73)","[A53T_P48, aSyn, CTEII_P73, Tau]",A53T_P48,CTEII_P73,0.70532,0.452991,0.36,0.45,53,117,1.26,148,0.79,aSyn,Tau


In [15]:
# Find the all vs all matches
# For every site, keep the 5 pairs it takes part in (as source or as target)
# that have the lowest ICP mean input feature difference (Fdiff)

# container
allvsall_vessel = []

for poc in tqdm(pockets):
    involves_poc = (df_input_all.source_pocket == poc) | (df_input_all.target_pocket == poc)
    df_temp = (
        df_input_all[involves_poc]
        .sort_values(by="icp_mean_input_diff", ascending=True)
        .head(5)
        .reset_index(drop=True)
    )
    # one csv per site
    df_temp.to_csv(os.path.join(allvsall_out, poc + "_top5_nghs.csv"))
    allvsall_vessel.append(df_temp)

# put the selected nghs into a single df and export it
df_allvsall_sel_ngh = pd.concat(allvsall_vessel).reset_index(drop=True)
df_allvsall_sel_ngh.to_csv(os.path.join(allvsall_out, "allvsall_top5_nghs.csv"))

100%|███████████████████████████████████████████████████████████████████████████████████| 61/61 [00:00<00:00, 427.90it/s]


### 4.2 Compare sites among ex vivo aSyn fibrils

In [16]:
# names of the sites that belong to the brain derived structures (prefix MSA or LF)
brain_derived_tags = ["MSA", "LF"]
asyn_brain_pockets = [name for name in pockets if name.split("_")[0] in brain_derived_tags]

# keep the pairs in which both the source and the target are brain derived sites,
# best combined fitness score first
both_brain_derived = (
    df_input_all.source_pocket.isin(asyn_brain_pockets)
    & df_input_all.target_pocket.isin(asyn_brain_pockets)
)
df_input_b2b = (
    df_input_all[both_brain_derived]
    .sort_values(by='combined_fitness_score', ascending=False)
    .reset_index(drop=True)
)

# export all aSyn brain pockets matches
df_input_b2b.to_csv(os.path.join(asyn2asyn_out, 'all_asyn_brain_to_asyn_brain_matches.csv'))

# size of the full table vs. the subset
print(df_input_all.shape)
print(df_input_b2b.shape)

df_input_b2b.head(2)

(1830, 15)
(105, 15)


Unnamed: 0,pocket_matches,pocket_pairs,source_pocket,target_pocket,icp_rmse,icp_fitness_source,icp_fitness_target,combined_fitness_score,icp_nb_corres,size_source,icp_mean_input_diff,size_target,src2target_size_ratio,source_pocket_fibril,target_pocket_fibril
0,"(MSA_IIa_P60, MSA_IIb_P64)","[MSA_IIa_P60, aSyn, MSA_IIb_P64, aSyn]",MSA_IIa_P60,MSA_IIb_P64,0.596326,0.888268,0.85,0.89,159,179,0.31,187,0.96,aSyn,aSyn
1,"(MSA_IIa_P62, MSA_I_P58)","[MSA_IIa_P62, aSyn, MSA_I_P58, aSyn]",MSA_IIa_P62,MSA_I_P58,0.589014,0.790419,0.87,0.87,132,167,0.27,151,1.11,aSyn,aSyn


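### Get the matches that pass the similarity threshold
- SSmax : combined_fitness_score >= 0.5
- Fdiff : icp_mean_input_diff <= 0.6

Note: for the comparison among ex vivo aSyn fibrils (the selection cell below), the SSmax cut-off is lowered to 0.4 in the code to include MSA-I P56 for alignment purposes.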

In [None]:
# similarity thresholds used to select site matches
#   ssmax_thresh : minimum combined_fitness_score (SSmax)
#   fdiff_thresh : maximum icp_mean_input_diff (Fdiff)
ssmax_thresh, fdiff_thresh = 0.5, 0.6

In [None]:
# get the matches that pass the similarity threshold
## SSmax thresh was lowered to 0.4 (instead of ssmax_thresh) to include MSA–I P56 for alignment purposes
passes_ssmax = df_input_b2b.combined_fitness_score >= 0.4
passes_fdiff = df_input_b2b.icp_mean_input_diff <= fdiff_thresh

df_b2b_sel = (
    df_input_b2b[passes_ssmax & passes_fdiff]
    .sort_values(by="combined_fitness_score", ascending=False)
    .reset_index(drop=True)
)
print(f"Identified {df_b2b_sel.shape[0]} site matched among ex vivo aSyn fibril sites")

# export
df_b2b_sel.to_csv(os.path.join(asyn2asyn_out, "sel_asyn_brain_to_asyn_brain_matches.csv"))

# align site point clouds of the selected pairs (outputs go to aligned_sites_output)
align_site_pcd(
    df=df_b2b_sel,
    ply_files=ply_files,
    input_feats_npy=reg_results_paths,
    output=aligned_sites_output,
)

print(df_b2b_sel.shape)
df_b2b_sel.head(2)

aligning sites: 100%|████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 85.40it/s]

(11, 15)





Unnamed: 0,pocket_matches,pocket_pairs,source_pocket,target_pocket,icp_rmse,icp_fitness_source,icp_fitness_target,combined_fitness_score,icp_nb_corres,size_source,icp_mean_input_diff,size_target,src2target_size_ratio,source_pocket_fibril,target_pocket_fibril
0,"(MSA_IIa_P60, MSA_IIb_P64)","[MSA_IIa_P60, aSyn, MSA_IIb_P64, aSyn]",MSA_IIa_P60,MSA_IIb_P64,0.596326,0.888268,0.85,0.89,159,179,0.31,187,0.96,aSyn,aSyn
1,"(MSA_IIa_P62, MSA_I_P58)","[MSA_IIa_P62, aSyn, MSA_I_P58, aSyn]",MSA_IIa_P62,MSA_I_P58,0.589014,0.790419,0.87,0.87,132,167,0.27,151,1.11,aSyn,aSyn
2,"(MSA_IIa_P60, MSA_I_P57)","[MSA_I_P57, aSyn, MSA_IIa_P60, aSyn]",MSA_I_P57,MSA_IIa_P60,0.671094,0.731959,0.79,0.79,142,194,0.45,179,1.08,aSyn,aSyn
3,"(MSA_IIa_P59, MSA_IIb_P63)","[MSA_IIb_P63, aSyn, MSA_IIa_P59, aSyn]",MSA_IIb_P63,MSA_IIa_P59,0.662825,0.783726,0.74,0.78,366,467,0.34,497,0.94,aSyn,aSyn
4,"(MSA_IIb_P64, MSA_I_P57)","[MSA_I_P57, aSyn, MSA_IIb_P64, aSyn]",MSA_I_P57,MSA_IIb_P64,0.674407,0.731959,0.76,0.76,142,194,0.46,187,1.04,aSyn,aSyn
5,"(MSA_IIa_P69, MSA_IIb_P70)","[MSA_IIb_P70, aSyn, MSA_IIa_P69, aSyn]",MSA_IIb_P70,MSA_IIa_P69,0.715868,0.623596,0.66,0.66,111,178,0.46,169,1.05,aSyn,aSyn
6,"(MSA_IIa_P61, MSA_IIb_P65)","[MSA_IIb_P65, aSyn, MSA_IIa_P61, aSyn]",MSA_IIb_P65,MSA_IIa_P61,0.65593,0.60479,0.58,0.6,202,334,0.52,350,0.95,aSyn,aSyn
7,"(MSA_IIa_P59, MSA_IIb_P65)","[MSA_IIb_P65, aSyn, MSA_IIa_P59, aSyn]",MSA_IIb_P65,MSA_IIa_P59,0.635025,0.586826,0.39,0.59,196,334,0.51,497,0.67,aSyn,aSyn
8,"(MSA_IIb_P63, MSA_IIb_P65)","[MSA_IIb_P65, aSyn, MSA_IIb_P63, aSyn]",MSA_IIb_P65,MSA_IIb_P63,0.642808,0.553892,0.4,0.55,185,334,0.56,467,0.72,aSyn,aSyn
9,"(MSA_IIa_P59, MSA_I_P56)","[MSA_I_P56, aSyn, MSA_IIa_P59, aSyn]",MSA_I_P56,MSA_IIa_P59,0.64662,0.423423,0.28,0.42,141,333,0.4,497,0.67,aSyn,aSyn


### 4.3 Compare sites between ex vivo fibrils of aSyn and other amyloid proteins

In [18]:
# site names left after excluding the prefixes A53T, Pol, G51D, E46K, H50Q and pY39
excluded_tags = ['A53T', 'Pol', 'G51D', 'E46K', 'H50Q', 'pY39']
asyn_brain_and_other_amyloids_pockets = [name for name in pockets if name.split("_")[0] not in excluded_tags]

# keep the pairs whose source and target are both in that list,
# best combined fitness score first
both_selected = (
    df_input_all.source_pocket.isin(asyn_brain_and_other_amyloids_pockets)
    & df_input_all.target_pocket.isin(asyn_brain_and_other_amyloids_pockets)
)
df_input_b2others = (
    df_input_all[both_selected]
    .sort_values(by='combined_fitness_score', ascending=False)
    .reset_index(drop=True)
)

# export all matches of this subset
df_input_b2others.to_csv(os.path.join(asyn2others_out, 'all_asyn_brain_to_other_amyloids_matches.csv'))

# size of the full table vs. the subset
print(df_input_all.shape)
print(df_input_b2others.shape)

df_input_b2others.head(2)

(1830, 15)
(351, 15)


Unnamed: 0,pocket_matches,pocket_pairs,source_pocket,target_pocket,icp_rmse,icp_fitness_source,icp_fitness_target,combined_fitness_score,icp_nb_corres,size_source,icp_mean_input_diff,size_target,src2target_size_ratio,source_pocket_fibril,target_pocket_fibril
0,"(MSA_IIa_P60, MSA_IIb_P64)","[MSA_IIa_P60, aSyn, MSA_IIb_P64, aSyn]",MSA_IIa_P60,MSA_IIb_P64,0.596326,0.888268,0.85,0.89,159,179,0.31,187,0.96,aSyn,aSyn
1,"(MSA_IIa_P62, MSA_I_P58)","[MSA_IIa_P62, aSyn, MSA_I_P58, aSyn]",MSA_IIa_P62,MSA_I_P58,0.589014,0.790419,0.87,0.87,132,167,0.27,151,1.11,aSyn,aSyn


In [None]:
# get the matches that pass the similarity threshold
passes_ssmax = df_input_b2others.combined_fitness_score >= ssmax_thresh
passes_fdiff = df_input_b2others.icp_mean_input_diff <= fdiff_thresh

df_b2o_sel = (
    df_input_b2others[passes_ssmax & passes_fdiff]
    .sort_values(by="combined_fitness_score", ascending=False)
    .reset_index(drop=True)
)

# flag the pairs whose source and target carry the same protein label and drop them,
# so that only cross-protein pairs remain
# NOTE(review): this keeps any cross-protein pair, including pairs in which neither
# side is aSyn -- confirm that this is intended for the "aSyn to other amyloids" output
df_b2o_sel["fibril_src_target_match"] = df_b2o_sel["source_pocket_fibril"] == df_b2o_sel["target_pocket_fibril"]
df_b2o_sel = df_b2o_sel[~df_b2o_sel.fibril_src_target_match]
print(f"Identified {df_b2o_sel.shape[0]} site matched among sites from ex vivo fibrils from aSyn and other amyloids")

# export
df_b2o_sel.to_csv(os.path.join(asyn2others_out, 'sel_asyn_brain_to_other_amyloids_matches.csv'))

df_b2o_sel.head(2)

Unnamed: 0,pocket_matches,pocket_pairs,source_pocket,target_pocket,icp_rmse,icp_fitness_source,icp_fitness_target,combined_fitness_score,icp_nb_corres,size_source,icp_mean_input_diff,size_target,src2target_size_ratio,source_pocket_fibril,target_pocket_fibril,fibril_src_target_match


### 4.4 Compare sites between ex vivo and in vitro fibrils of aSyn

In [20]:
# site names whose prefix is one of the listed aSyn tags
asyn_tags = ['MSA', 'LF', 'A53T', 'Pol', 'G51D', 'E46K', 'H50Q', 'pY39']
asyn_brain_and_other_invitro_pockets = [name for name in pockets if name.split("_")[0] in asyn_tags]

# keep the pairs whose source and target are both in that list,
# best combined fitness score first
both_selected = (
    df_input_all.source_pocket.isin(asyn_brain_and_other_invitro_pockets)
    & df_input_all.target_pocket.isin(asyn_brain_and_other_invitro_pockets)
)
df_input_b2invitro = (
    df_input_all[both_selected]
    .sort_values(by='combined_fitness_score', ascending=False)
    .reset_index(drop=True)
)

# export all matches of this subset
df_input_b2invitro.to_csv(os.path.join(asyn2invitro_out, 'all_asyn_brain_to_invitro_matches.csv'))

# size of the full table vs. the subset
print(df_input_all.shape)
print(df_input_b2invitro.shape)

df_input_b2invitro.head(2)

(1830, 15)
(1176, 15)


Unnamed: 0,pocket_matches,pocket_pairs,source_pocket,target_pocket,icp_rmse,icp_fitness_source,icp_fitness_target,combined_fitness_score,icp_nb_corres,size_source,icp_mean_input_diff,size_target,src2target_size_ratio,source_pocket_fibril,target_pocket_fibril
0,"(Pol_2a_P23, Pol_2b_P34)","[Pol_2b_P34, aSyn, Pol_2a_P23, aSyn]",Pol_2b_P34,Pol_2a_P23,0.548625,0.94152,0.84,0.94,161,171,0.29,192,0.89,aSyn,aSyn
1,"(Pol_2a_P22, Pol_2b_P35)","[Pol_2a_P22, aSyn, Pol_2b_P35, aSyn]",Pol_2a_P22,Pol_2b_P35,0.611618,0.918919,0.74,0.92,102,111,0.43,137,0.81,aSyn,aSyn


In [None]:
# get the matches that pass the similarity threshold
passes_ssmax = df_input_b2invitro.combined_fitness_score >= ssmax_thresh
passes_fdiff = df_input_b2invitro.icp_mean_input_diff <= fdiff_thresh
df_b2invitro_sel = (
    df_input_b2invitro[passes_ssmax & passes_fdiff]
    .sort_values(by="combined_fitness_score", ascending=False)
    .reset_index(drop=True)
)

# ensure that the matches are with the ex vivo aSyn structures:
# at least one side of the pair has to be a brain derived site
src_is_brain = df_b2invitro_sel.source_pocket.isin(asyn_brain_pockets)
tgt_is_brain = df_b2invitro_sel.target_pocket.isin(asyn_brain_pockets)
df_b2invitro_sel = df_b2invitro_sel[src_is_brain | tgt_is_brain].reset_index(drop=True)

# make sure that the matches are not among ex vivo aSyn fibrils:
# pairs in which both sides are brain derived are collected and then removed by their pair key
src_is_brain = df_b2invitro_sel.source_pocket.isin(asyn_brain_pockets)
tgt_is_brain = df_b2invitro_sel.target_pocket.isin(asyn_brain_pockets)
df_b2invitro_sel_rej = df_b2invitro_sel[src_is_brain & tgt_is_brain].reset_index(drop=True)

rejected_keys = df_b2invitro_sel_rej.pocket_matches.to_list()
df_b2invitro_sel = df_b2invitro_sel[~df_b2invitro_sel.pocket_matches.isin(rejected_keys)].reset_index(drop=True)
print(f"Identified {df_b2invitro_sel.shape[0]} site matched among sites from ex vivo and in vitro fibrils of aSyn")

# export
df_b2invitro_sel.to_csv(os.path.join(asyn2invitro_out, 'sel_asyn_brain_to_invitro_matches.csv'))

# align site point clouds of the selected pairs (outputs go to aligned_sites_output)
align_site_pcd(
    df=df_b2invitro_sel,
    ply_files=ply_files,
    input_feats_npy=reg_results_paths,
    output=aligned_sites_output,
)

df_b2invitro_sel.head(2)

aligning sites: 100%|██████████████████████████████████████████████████████████████████████| 3/3 [00:32<00:00, 10.83s/it]


Unnamed: 0,pocket_matches,pocket_pairs,source_pocket,target_pocket,icp_rmse,icp_fitness_source,icp_fitness_target,combined_fitness_score,icp_nb_corres,size_source,icp_mean_input_diff,size_target,src2target_size_ratio,source_pocket_fibril,target_pocket_fibril
0,"(H50Q_P50, MSA_IIb_P66)","[MSA_IIb_P66, aSyn, H50Q_P50, aSyn]",MSA_IIb_P66,H50Q_P50,0.637851,0.684564,0.46,0.68,102,149,0.6,221,0.67,aSyn,aSyn
1,"(MSA_IIb_P70, Pol_1a_P4)","[MSA_IIb_P70, aSyn, Pol_1a_P4, aSyn]",MSA_IIb_P70,Pol_1a_P4,0.694011,0.601124,0.54,0.6,107,178,0.55,198,0.9,aSyn,aSyn
2,"(H50Q_P83, MSA_IIb_P66)","[H50Q_P83, aSyn, MSA_IIb_P66, aSyn]",H50Q_P83,MSA_IIb_P66,0.647769,0.420513,0.55,0.55,82,195,0.5,149,1.31,aSyn,aSyn


### 4.5 Compare sites between in vitro fibrils of aSyn and ex vivo fibrils of other amyloids

In [22]:
# site names left after excluding the brain derived aSyn prefixes (MSA, LF)
brain_derived_tags = ['MSA', 'LF']
asyn_invitro_and_other_exvivo_pockets = [name for name in pockets if name.split("_")[0] not in brain_derived_tags]

# keep the pairs whose source and target are both in that list,
# best combined fitness score first
both_selected = (
    df_input_all.source_pocket.isin(asyn_invitro_and_other_exvivo_pockets)
    & df_input_all.target_pocket.isin(asyn_invitro_and_other_exvivo_pockets)
)
df_input_invitro2others = (
    df_input_all[both_selected]
    .sort_values(by='combined_fitness_score', ascending=False)
    .reset_index(drop=True)
)

# flag the pairs whose source and target carry the same protein label and drop them,
# so that only cross-protein pairs remain
# NOTE(review): pairs in which neither side is aSyn also survive this filter -- confirm intended
df_input_invitro2others["fibril_src_target_match"] = (
    df_input_invitro2others["source_pocket_fibril"] == df_input_invitro2others["target_pocket_fibril"]
)
df_input_invitro2others = df_input_invitro2others[~df_input_invitro2others.fibril_src_target_match].reset_index(drop=True)

# export
df_input_invitro2others.to_csv(os.path.join(asyninvitro2others_out, 'all_asyn_invitro_to_other_amyloids_matches.csv'))

df_input_invitro2others.head(2)

Unnamed: 0,pocket_matches,pocket_pairs,source_pocket,target_pocket,icp_rmse,icp_fitness_source,icp_fitness_target,combined_fitness_score,icp_nb_corres,size_source,icp_mean_input_diff,size_target,src2target_size_ratio,source_pocket_fibril,target_pocket_fibril,fibril_src_target_match
0,"(E46K_P54, Prp_P81)","[Prp_P81, Prp, E46K_P54, aSyn]",Prp_P81,E46K_P54,0.703847,0.60355,0.86,0.86,102,169,0.99,119,1.42,Prp,aSyn,False
1,"(Pol_2b_P35, Prp_P81)","[Prp_P81, Prp, Pol_2b_P35, aSyn]",Prp_P81,Pol_2b_P35,0.659941,0.64497,0.8,0.8,109,169,1.06,137,1.23,Prp,aSyn,False


In [None]:
# get the matches that pass the similarity threshold
passes_ssmax = df_input_invitro2others.combined_fitness_score >= ssmax_thresh
passes_fdiff = df_input_invitro2others.icp_mean_input_diff <= fdiff_thresh

df_invitro2others_sel = (
    df_input_invitro2others[passes_ssmax & passes_fdiff]
    .sort_values(by="combined_fitness_score", ascending=False)
    .reset_index(drop=True)
)
print(f"Identified {df_invitro2others_sel.shape[0]} site matched between sites from in vitro fibrils of aSyn and ex vivo fibrils of other amyloid proteins")

# export
df_invitro2others_sel.to_csv(os.path.join(asyninvitro2others_out, 'sel_asyn_invitro_to_other_amyloids_matches.csv'))

df_invitro2others_sel.head(2)

Unnamed: 0,pocket_matches,pocket_pairs,source_pocket,target_pocket,icp_rmse,icp_fitness_source,icp_fitness_target,combined_fitness_score,icp_nb_corres,size_source,icp_mean_input_diff,size_target,src2target_size_ratio,source_pocket_fibril,target_pocket_fibril,fibril_src_target_match


## 5.0 Align Fibrils based on shared features for matched sites

In [None]:
# align the source fibrils of the matched sites
# inputs: the site files (xyz + ply), the fibril pdb files with the fibril -> sites map,
# and the registration npy files; align_out is the folder holding the aligned sites
align_site_fibrils(
    src_pockets_paths=src_pockets_paths,
    ply_files=ply_files,
    fibrils_paths=fibrils_paths,
    fibril_pocket_map=fibril_pocket_map,
    reg_results_paths=reg_results_paths,
    align_out=aligned_sites_output,
)

aligning fibrils:   0%|                                                                           | 0/14 [00:00<?, ?it/s]@> 195 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 195 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 149 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 149 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 8300 atoms and 1 coordinate set(s) were parsed in 0.07s.
@> 10140 atoms and 1 coordinate set(s) were parsed in 0.16s.
aligning fibrils:   7%|████▊                                                              | 1/14 [00:32<07:04, 32.63s/it]@> 179 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 179 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 187 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 187 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 10140 atoms and 1 coordinate set(s) were parsed in 0.09s.
@> 10140 atoms and 1 coordinate set(s) were parsed in 0.08s.
aligning fibrils:  14%|█████████▌          