In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import statistics

## Analysis for GM12878

In [77]:
# Directory containing the data files
directory = './GM12878'

In [78]:
# helper to load a single file into a contact matrix
def load_contact_matrix(filepath):
    data = pd.read_csv(filepath, sep='\t', header=None)
    data.columns = ['chr1', 'pos1', 'chr2', 'pos2', 'interaction']
    data = data[data['chr1'] == data['chr2']]
    max_pos = max(data['pos1'].max(), data['pos2'].max())
    min_pos = min(data['pos1'].min(), data['pos2'].min())
    bins = np.linspace(min_pos, max_pos, num=201)
    data['pos1_bin'] = pd.cut(data['pos1'], bins, labels=False, include_lowest=True)
    data['pos2_bin'] = pd.cut(data['pos2'], bins, labels=False, include_lowest=True)
    pivot_table = data.groupby(['pos1_bin', 'pos2_bin'])['interaction'].sum().unstack(fill_value=0)
    return pivot_table.to_numpy()

# Function to normalize a contact matrix
def z_score_normalize(matrix):
    mean_val = np.mean(matrix)
    std_val = np.std(matrix)
    normalized_matrix = (matrix - mean_val) / std_val
    return normalized_matrix

# Function to load all the matrices in a directory
def load_matrices(directory, contact_matrices, filepaths):
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            filepath = os.path.join(directory, filename)
            normalized_matrix = z_score_normalize(load_contact_matrix(filepath))
            contact_matrices.append(normalized_matrix)
            filepaths.append(filepath)
            
# Function to extend matrix to the target dimension
def extend_matrix(matrix, target_dim):
    extended_matrix = np.zeros(target_dim)
    rows, cols = matrix.shape
    extended_matrix[:rows, :cols] = matrix
    return extended_matrix

# Function to calculate the Frobenius norm between two matrices
def frobenius_norm(matrix1, matrix2):
    return np.linalg.norm(matrix1 - matrix2, 'fro')

def plot_matrices(num_matrices, matrices_per_figure, contact_matrices):
    for start in range(0, num_matrices, matrices_per_figure):
        end = min(start + matrices_per_figure, num_matrices)
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
    axes = axes.flatten()
    
    for i, ax in enumerate(axes):
        if start + i < end:
            matrix = contact_matrices[start + i]
            im = ax.imshow(matrix, cmap='coolwarm', interpolation='none')
            ax.set_title(f'Matrix {start + i + 1}')
            ax.set_xlabel('Position along Chromosome')
            ax.set_ylabel('Position along Chromosome')
            fig.colorbar(im, ax=ax, label='Interaction Frequency')
        else:
            fig.delaxes(ax)
    
    plt.tight_layout()
    plt.show()
    

def plot_distance_matrix(num_matrices, contact_matrices):
    # Initialize the distance matrix
    distance_matrix = np.zeros((num_matrices, num_matrices))

    # Compute the distances and store in the distance matrix
    for i in range(num_matrices):
        for j in range(i, num_matrices):
            distance = frobenius_norm(contact_matrices[i], contact_matrices[j])
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance

    # Print the distance matrix
    print("Distance Matrix:")
    print(distance_matrix)

def find_average(contact_matrices):
    average_list = []
    for matrix in contact_matrices:
        sum = 0
        for i in range(matrix.shape[0]):
            for j in range(matrix.shape[1]):
                sum += matrix[i][j]
        avg = sum / (matrix.shape[0] * matrix.shape[1])
        average_list.append(avg)
    return average_list

In [79]:
# Load all contact matrices into a list
contact_matrices = []
filepaths = []

In [80]:
load_matrices(directory, contact_matrices, filepaths)


num_matrices = len(contact_matrices)

# Target dimension
target_dim = (200, 200)

# Extend all matrices
contact_matrices = [extend_matrix(matrix, target_dim) for matrix in contact_matrices]

print(len(contact_matrices))

24


In [81]:
# Calculate the Frobenius norm (distance) between the matrices
distance = frobenius_norm(contact_matrices[3], contact_matrices[0])

# Print the distance
print("Frobenius Norm (Distance) between the matrices:", distance)


Frobenius Norm (Distance) between the matrices: 128.99796875135635


In [None]:
# Plot the matrices in batches of 6 per figure
num_matrices = len(contact_matrices)
matrices_per_figure = 6

plot_matrices(num_matrices, matrices_per_figure, contact_matrices)

In [83]:
plot_distance_matrix(num_matrices, contact_matrices)

Distance Matrix:
[[  0.          80.57671192 136.3526918  128.99796875  82.33918215
  136.28604488  86.98202622 138.62782231 155.16512479 141.64125278
  137.08130842 139.56606583 128.59476673 139.78131077 127.45660365
  137.01414868 137.39492653 130.3074066  133.03552038 133.08184179
  134.02665933 130.82957997 133.20190591 138.98711219]
 [ 80.57671192   0.         122.18639918 114.14824047  63.58232115
  119.7971038   68.69611741 125.67667006 145.72232498 127.93396422
  124.30111176 132.28893071 115.92612336 125.24802192 112.1458947
  120.57360963 120.78225431 113.4208751  118.82355798 118.63913772
  123.15616794 117.28761243 119.96244357 125.57163166]
 [136.3526918  122.18639918   0.         129.63985309 136.65587215
  125.01440313 131.75433919 125.05660711 109.56589976 122.51858022
  124.46247625 138.99328254  72.06382045 125.49197811  68.47442523
  123.56524645 124.11369124  72.24429608 121.3557025   84.18745403
  122.62025839 119.83569425 124.22819349  94.50111947]
 [128.99796875 

In [85]:
average = find_average(contact_matrices)
average_value = statistics.mean(average)
print("Average:", average_value)

Average: -5.655275177549971e-15


## Analysis for HAP1

In [87]:
directory1 = './HAP1'

# Load all contact matrices into a list
contact_matrices1 = []
filepaths1 = []

# Load in the contact matrices from txt files
load_matrices(directory1, contact_matrices1, filepaths1)


num_matrices = len(contact_matrices1)

# Target dimension
target_dim = (200, 200)

# Extend all matrices
contact_matrices1 = [extend_matrix(matrix, target_dim) for matrix in contact_matrices1]

plot_distance_matrix(num_matrices, contact_matrices1)

average1 = find_average(contact_matrices1)
average_value1 = statistics.mean(average1)
print("Average:", average_value1)

Distance Matrix:
[[  0.         107.09182995 120.05037596 ... 157.77096983 110.66971164
  110.32289041]
 [107.09182995   0.          81.77029257 ... 136.23395828  59.5813876
   65.72478858]
 [120.05037596  81.77029257   0.         ... 146.19224692  76.7347748
   80.93586997]
 ...
 [157.77096983 136.23395828 146.19224692 ...   0.         134.99341351
  134.5799318 ]
 [110.66971164  59.5813876   76.7347748  ... 134.99341351   0.
   60.09309729]
 [110.32289041  65.72478858  80.93586997 ... 134.5799318   60.09309729
    0.        ]]
Average: 6.800302451850623e-17
