Hao Wu  September 30 2020 <br />
This code is intended to generate data visualizations

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pathlib
from tqdm.notebook import tqdm
import pandas as pd
import itertools
import statistics
import csv
import scipy.stats as st
import scipy.ndimage


## 1.1 Read & Check Distance Matrices

Values in each contact map matrix:
0 -- no contact
1 -- contact
2 -- missing information (ignore it)

In [2]:
# File names in the two data folders; the text before the first "_" of each
# file name is used as the code identifying a contact map / prediction.
maps = os.listdir("contact_maps/")
pred = os.listdir("contact_predictions/")
maps_code = [file_name.partition("_")[0] for file_name in maps]
pred_code = [file_name.partition("_")[0] for file_name in pred]

unique_maps_code = set(maps_code)
unique_pred_code = set(pred_code)

# Sanity check 1: no code appears twice within either folder
# (a set drops duplicates, so equal lengths mean every code is unique).
print(len(unique_maps_code) == len(maps_code))
print(len(unique_pred_code) == len(pred_code))

# Sanity check 2: both folders cover exactly the same set of codes,
# i.e. every real contact map has a prediction and vice versa.
print(unique_maps_code == unique_pred_code)

# Number of distinct codes (contact maps) being analyzed.
num_of_code = len(unique_maps_code)
print(num_of_code)


True
True
True
11181


## 1.2 Generate Matrices of Diff (+/-) & Sqr_Diff

1. For each pair of contact map & prediction, calculate the difference (prediction - contact map) and the squared difference between corresponding entries of the two matrices, giving a diff matrix and a sqr_diff matrix.
2. Collect all data points in the upper triangular part of each diff/sqr_diff matrix and store them in a list.
3. Group the diff/sqr_diff matrices by size.
* Upper triangular part: since each contact map is symmetric about the diagonal (y = x), it is enough to look at either the upper triangular part of the matrix (y >= x) or the lower triangular part (x >= y).

In [3]:
diff = {} # key: contact code, value: (sqr_diff matrix, size, diff matrix)

# for visualization
# key: size, value: [count, [contact codes], sqr_diff, fp_diff, fn_diff]
# Inside the loop the three matrices are element-wise running sums over all
# maps of that size; after the loop they are divided by count (element-wise mean).
diff_by_size = {}
contact_pred = {} # key: contact code, value: (prediction matrix, size)

# for getting idea of distribution of sqr_diff/diff
diff_ele = []
diff_ele_pos = []
diff_ele_neg = []
sqr_diff_ele = []

for c in tqdm(maps_code):
    temp_map = np.load("contact_maps/"+c+"_contact_map.npy")
    temp_pred = np.load("contact_predictions/"+c+"_contact_pred.npy")
    size = temp_map.shape[0]

    # prediction minus contact map: >0 where the prediction is above the map,
    # <0 where it is below
    temp_diff = np.subtract(temp_pred, temp_map)
    temp_sqr_diff = np.power(temp_diff, 2)

    # where the contact_map has 2 (which means missing info), mark the entry
    # with a sentinel: 999 in the diff matrix, -1 in the sqr_diff matrix
    missing = temp_map == 2
    clear_temp_diff = np.where(missing, 999, temp_diff)
    clear_temp_sqr_diff = np.where(missing, -1, temp_sqr_diff)

    contact_pred[c] = (temp_pred, size)
    diff[c] = (clear_temp_sqr_diff, size, clear_temp_diff)

    iu = np.triu_indices(size) # indices in upper triangle matrix

    # Collect the upper-triangle values, excluding the missing-info sentinels.
    # Boolean masks do the filtering inside NumPy instead of calling a Python
    # lambda once per element; the collected values are the same.
    upper_diff = clear_temp_diff[iu]
    upper_diff = upper_diff[upper_diff != 999]
    diff_ele.extend(upper_diff)
    diff_ele_pos.extend(upper_diff[upper_diff > 0])
    diff_ele_neg.extend(upper_diff[upper_diff < 0])

    upper_sqr_diff = clear_temp_sqr_diff[iu]
    sqr_diff_ele.extend(upper_sqr_diff[upper_sqr_diff != -1])

    # for visualization use: missing info is replaced with 0 so it adds nothing
    # to the per-size sums (it still counts in the divisor of the mean below)
    clear_temp_diff_zero = np.where(missing, 0, temp_diff)
    clear_temp_diff_fp = np.where(clear_temp_diff_zero>0, clear_temp_diff_zero, 0) # positive part only
    clear_temp_diff_fn = np.where(clear_temp_diff_zero<0, clear_temp_diff_zero, 0) # negative part only
    clear_temp_sqr_diff_zero = np.where(missing, 0, temp_sqr_diff)

    if size not in diff_by_size:
        # first map of this size: start the count, code list and running sums
        diff_by_size[size] = [1, [c], clear_temp_sqr_diff_zero, clear_temp_diff_fp, clear_temp_diff_fn]
    else:
        diff_by_size[size][0] += 1
        diff_by_size[size][1].append(c)
        diff_by_size[size][2] += clear_temp_sqr_diff_zero
        diff_by_size[size][3] += clear_temp_diff_fp
        diff_by_size[size][4] += clear_temp_diff_fn


# sorting diff_by_size by size, and turning each running sum into a mean
# by dividing it by the number of maps of that size
diff_by_size = {
    key: [value[0], value[1], value[2]/value[0], value[3]/value[0], value[4]/value[0]]
    for key, value in sorted(diff_by_size.items(), key=lambda item: int(item[0]))
}

HBox(children=(FloatProgress(value=0.0, max=11181.0), HTML(value='')))




In [4]:
# (size, number of maps) for every matrix size shared by more than one map;
# entry[0] is the per-size count stored in diff_by_size.
temp = [
    (matrix_size, entry[0])
    for matrix_size, entry in diff_by_size.items()
    if entry[0] > 1
]
print(temp)

[(50, 15), (51, 17), (52, 14), (53, 17), (54, 17), (55, 14), (56, 15), (57, 19), (58, 18), (59, 23), (60, 17), (61, 22), (62, 27), (63, 24), (64, 24), (65, 24), (66, 31), (67, 31), (68, 25), (69, 28), (70, 22), (71, 21), (72, 20), (73, 21), (74, 29), (75, 28), (76, 24), (77, 32), (78, 28), (79, 28), (80, 35), (81, 47), (82, 29), (83, 22), (84, 33), (85, 42), (86, 34), (87, 32), (88, 29), (89, 38), (90, 50), (91, 32), (92, 42), (93, 36), (94, 39), (95, 39), (96, 39), (97, 41), (98, 46), (99, 35), (100, 36), (101, 35), (102, 43), (103, 37), (104, 43), (105, 48), (106, 54), (107, 32), (108, 43), (109, 56), (110, 41), (111, 42), (112, 41), (113, 41), (114, 51), (115, 61), (116, 36), (117, 36), (118, 55), (119, 47), (120, 45), (121, 57), (122, 54), (123, 50), (124, 53), (125, 34), (126, 45), (127, 45), (128, 35), (129, 39), (130, 43), (131, 54), (132, 45), (133, 45), (134, 49), (135, 47), (136, 41), (137, 52), (138, 52), (139, 44), (140, 40), (141, 43), (142, 54), (143, 38), (144, 48), (145

## 1.3 Distribution of Sqr_diff/Diff Values

1. Calculate quantiles of Sqr_diff / Positive Diff / Negative Diff
2. Visualize with histogram

In [5]:
# NOTE: this whole cell is wrapped in a triple-quoted string, so none of the
# code below is executed; it is kept only as a disabled snippet.
# If re-enabled, it would print the count, min, 25%/50%/75%/97.72%/99.87%
# quantiles and max of sqr_diff_ele, diff_ele_pos and diff_ele_neg, and draw a
# 30-bin histogram of each.
'''
#print (len(diff_ele)) # 355941541 | 369656525

# cutoff: arbitrarily design, try with different numbers (should be larger than 0.5 in absolute diff)
elements = [sqr_diff_ele, diff_ele_pos, diff_ele_neg]
elements_list = ["Sqared Difference Values", "Positive Difference Values", "Negative Difference Values"]
# takes about 2-3 min to compute
for i in range(3):
    
    print ("========== "+elements_list[i]+" ==========\n")
    print ("Number of values: "+str(len(elements[i])))
    min_cutoff = min(elements[i])
    cutoff_75 = np.quantile(elements[i],0.75) # 
    cutoff_50 = np.quantile(elements[i],0.50) #
    cutoff_25 = np.quantile(elements[i],0.25) # 
    cutoff_9772 = np.quantile(elements[i],0.9772) # 
    cutoff_9987 = np.quantile(elements[i],0.9987) # 
    max_cutoff = max(elements[i]) # 

    print ("min: "+str(min_cutoff))
    print ("25%: "+str(cutoff_25))
    print ("50%: "+str(cutoff_50))
    print ("75%: "+str(cutoff_75))
    print ("97.72%: "+str(cutoff_9772))
    print ("99.87%: "+str(cutoff_9987))
    print ("max: "+str(max_cutoff))
    
    # histogram
    plt.hist(elements[i], bins = 30)
    plt.xlabel(elements_list[i])
    plt.ylabel('Frequency')
    plt.title('Histogram of '+elements_list[i])
    plt.show()

#cutoff = 7.068872962487293e-06
'''





## 1.4 Visualize Accumulative Sqr_diff / Positive Diff / Negative Diff in Matrices of Each Size

The maps below are intended to help us find patterns in the false predictions.
Diagrams of Positive Diff may reveal features of false-positive predictions, and diagrams of Negative Diff may reveal features of false-negative predictions.

In [6]:

# takes about 5-6 hours
# takes about 5-6 hours
# Column titles. Note: the matrices plotted from diff_by_size[s][2..4] are the
# per-size sums already divided by the number of maps, i.e. element-wise means.
titles = ["Accumulative Sqr Diff", "Accumulative Positive Diff", "Accumulative Negative Diff"]
# One cutoff triple per thresholded row, ordered like the columns
# (sqr diff, positive diff, negative diff). The sqr-diff cutoffs are the
# squares of the diff cutoffs: 0.7**2 = 0.49 and 0.8**2 = 0.64.
cutoff = [[0.49, 0.7, -0.7], [0.64, 0.8, -0.8]]

# Make sure the output folder exists before the long loop starts, so that
# savefig cannot fail on a missing directory.
os.makedirs("viz", exist_ok=True)

for s in tqdm(list(diff_by_size.keys())):
    # 3x3 grid: row 0 = no cutoff, rows 1 and 2 = cutoff[0] and cutoff[1];
    # columns = sqr diff, positive diff, negative diff.
    fig, axs = plt.subplots(3, 3, figsize=(15, 15))
    for i in range(3):
        for j in range(3):
            matrix = diff_by_size[s][j+2]
            if i == 0:
                temp_matrix = matrix
                label = titles[j]+" (No Cutoff)"
            else:
                threshold = cutoff[i-1][j]
                # sqr diff and positive diff keep entries at or above the
                # cutoff; negative diff keeps entries at or below it.
                # Everything else is shown as 0.
                if j != 2:
                    keep = matrix >= threshold
                else:
                    keep = matrix <= threshold
                temp_matrix = np.where(keep, matrix, 0)
                label = titles[j]+" (Cutoff: "+str(threshold)+")"

            im = axs[i][j].matshow(temp_matrix, interpolation="nearest")
            fig.colorbar(im, ax=axs[i][j], shrink=0.2)
            axs[i][j].set_xlabel(label, size=12)

    fig.suptitle("Size: "+str(s)+", Num: "+str(diff_by_size[s][0]))
    plt.tight_layout()
    plt.savefig("viz/"+str(s)+".png")
    plt.close(fig) # free the figure so hundreds of figures do not pile up in memory
        


HBox(children=(FloatProgress(value=0.0, max=546.0), HTML(value='')))


