# Spectral Co-Clustering

In [None]:
# Print the Python version
from platform import python_version
print(python_version())

#Import packages
import time
import os
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import concensus_clustering

from matplotlib import pyplot as plt
from sklearn.metrics import silhouette_score, pairwise_distances, jaccard_score, consensus_score
from sklearn.datasets import samples_generator as sg
from sklearn.datasets import make_biclusters, make_checkerboard
from sklearn.cluster.bicluster import SpectralCoclustering, SpectralBiclustering
from sklearn.cluster import SpectralClustering

#Import local scripts
import metrics
import concensus_clustering

sns.set()
%matplotlib inline

In [None]:
def checker_coords(model, num_clusters: tuple):
    """Compute bounding boxes for bi-clusters with a checkerboard structure.

    The boxes describe where each bi-cluster lands once the rows and columns
    of the data matrix have been grouped by cluster, with the clusters laid
    out in index order.

    Args:
        model: A trained bi-clustering model exposing the boolean membership
            arrays ``rows_`` and ``columns_``.
        num_clusters: The number of detected row and column clusters, as
            ``(n_row_clusters, n_col_clusters)``.

    Returns:
        (pd.DataFrame): One row per bi-cluster (row clusters vary slowest)
            with float columns ``y1``, ``y2`` (first row, one-past-last row)
            and ``x1``, ``x2`` (first column, one-past-last column).

    """
    n_row_clusters, n_col_clusters = num_clusters[0], num_clusters[1]
    tot_num_clusters = n_row_clusters * n_col_clusters

    # A checkerboard model such as scikit-learn's SpectralBiclustering stores
    # one indicator per *bi-cluster*: the indicator of each row cluster is
    # repeated n_col_clusters times in a row. Indexing rows_ directly with the
    # row-cluster number would then re-read row cluster 0, so step through
    # rows_ with a stride. Models that store one indicator per row cluster
    # keep the plain indexing.
    per_bicluster_layout = (
        n_col_clusters > 1 and len(model.rows_) == tot_num_clusters
    )
    row_stride = n_col_clusters if per_bicluster_layout else 1

    boxes = []
    row_start = 0
    for row_num in range(n_row_clusters):
        n_rows = np.sum(model.rows_[row_num * row_stride])

        col_start = 0
        for col_num in range(n_col_clusters):
            # The first n_col_clusters entries of columns_ describe the
            # column clusters in order in both layouts.
            n_cols = np.sum(model.columns_[col_num])
            boxes.append(
                (row_start, row_start + n_rows, col_start, col_start + n_cols)
            )
            col_start += n_cols
        row_start += n_rows

    # dtype=float keeps the float64 columns callers got from the original
    # zero-initialised frame.
    return pd.DataFrame(boxes, columns=['y1', 'y2', 'x1', 'x2'], dtype=float)

In [None]:
def bic_coords(model, num_clusters):
    """Compute bounding boxes for bi-clusters with a block-diagonal structure.

    Bi-clusters are stacked along the diagonal in the order they appear in
    ``model.rows_``: each box starts where the previous one ended, both
    vertically and horizontally.

    Args:
        model: A trained bi-clustering model exposing the boolean membership
            arrays ``rows_`` and ``columns_``.
        num_clusters: The number of detected bi-clusters; sets the number of
            rows in the returned frame.

    Returns:
        (pd.DataFrame): The coordinates of each bi-cluster, with columns
            ``y1``, ``y2`` (row extent) and ``x1``, ``x2`` (column extent).

    """
    corner_names = ('y1', 'y2', 'x1', 'x2')
    coords = pd.DataFrame(
        np.zeros((num_clusters, len(corner_names))),
        columns=corner_names
    )

    row_offset = 0
    col_offset = 0
    for idx, row_mask in enumerate(model.rows_):
        # Membership masks are boolean, so their sum is the cluster size.
        row_end = row_offset + np.sum(row_mask)
        col_end = col_offset + np.sum(model.columns_[idx])

        box = (row_offset, row_end, col_offset, col_end)
        for position, value in enumerate(box):
            coords.iloc[idx, position] = value

        # The next bi-cluster starts where this one stops.
        row_offset, col_offset = row_end, col_end

    return coords

In [None]:
def to_disk(fname, biclusters, threshold=1):
    """Write bi-cluster members to a plain-text file.

    Each bi-cluster is written as a header line ``clusternum_<k>`` (numbered
    from 1) followed by one member per line. A bi-cluster with fewer than
    ``threshold`` members gets a single empty line instead of its members.

    Args:
        fname: Path of the file to create or overwrite.
        biclusters: Iterable of sized collections holding the members of
            each bi-cluster.
        threshold: Minimum number of members a bi-cluster needs for its
            members to be written.

    Returns:
        None.

    """
    with open(fname, 'w') as outfile:
        for cluster_idx, members in enumerate(biclusters, start=1):
            outfile.write('clusternum_{0}\n'.format(cluster_idx))

            if len(members) >= threshold:
                outfile.writelines(
                    '{}\n'.format(member) for member in members
                )
            else:
                # Too small to report: leave a blank placeholder line.
                outfile.write('\n')

    return None

# Read data

In [None]:
# Load the correlation-coefficient matrix from a comma-separated file in the
# current working directory.
# NOTE(review): no index_col is given, so every column of the file (including
# any identifier column) ends up as a data column that is later passed to
# model.fit -- confirm against the file layout.
data = pd.read_csv("emQTL correlation coefficients.csv", sep=',')

# Spectral co-clustering

In [None]:
# Numbers of co-clusters to fit and score: every integer from 2 to 20 inclusive.
cluster_range = range(2, 21)

In [None]:
# Empty result containers. `metrics` is filled per number of clusters by the
# fitting loop; `row_scores` and `col_scores` start out empty as well.
# NOTE: binding the name `metrics` here shadows the local `metrics` module
# imported at the top of the file, which is unreachable under that name
# from this point on.
metrics = dict()
row_scores = dict()
col_scores = dict()

In [None]:
# Fit one spectral co-clustering model per candidate number of clusters, export
# its metrics and cluster files, and save a heatmap of the reordered data with
# the bi-cluster bounding boxes drawn on top.
for n_clusters in cluster_range:
    
    # Tag the output files with this (time + number of clusters).
    # Format: day-month-year_hour(12h)-minute-second_AM/PM.
    nowtime = datetime.datetime.now().strftime("%d-%m-%Y_%I-%M-%S_%p")

    run_tag = "%s_%dcl" % (nowtime, n_clusters)

    # Fixed random_state so repeated runs use the same seed.
    model = SpectralCoclustering(n_clusters=n_clusters, random_state=0)
    model.fit(data)

    # Create cluster objects and write results to disk (file).
    bic = concensus_clustering.Biclusters(model.rows_, model.columns_, data)
    # probes/genes are not used below other than being deleted at the end of
    # the iteration.
    probes, genes = bic.labels
    
    # Keep the metrics for this cluster count and export them to CSV.
    # presumably external_metrics is a DataFrame-like object (it is written
    # with .to_csv) -- verify against concensus_clustering.Biclusters.
    metrics[n_clusters] = bic.external_metrics
    metrics[n_clusters].to_csv("metrics_%i.csv" % (n_clusters, ))
    print("metrics_%i.csv" % (n_clusters, ))

    # Export the bi-clusters; output goes to the current directory and is
    # tagged with run_tag (exact file names are decided by to_multiple_files).
    bic.to_multiple_files(path=".", tag=run_tag)

    # Reorder rows and columns so that members of the same cluster label are
    # contiguous.
    fit_data = data.iloc[np.argsort(model.row_labels_), np.argsort(model.column_labels_)]
    
    # Get cluster coordinates.
    coords = bic_coords(model, num_clusters=n_clusters)

    plt.figure(figsize=(8, 8))
    # Colour scale is pinned to [0, 1].
    sns.heatmap(fit_data, cmap=sns.color_palette("Blues"),vmin=0,vmax=1)  # alternative colormaps: RdBu_r, RdBu
    for num in coords.index:
        # Trace the closed rectangle through the four corners of the box and
        # back to the starting corner.
        plt.plot(
            (coords.loc[num, ['x1', 'x2', 'x2', 'x1', 'x1']]),
            (coords.loc[num, ['y1', 'y1', 'y2', 'y2', 'y1']]),
            linewidth=2, c='darkred')
    plt.axis('off')
    plt.savefig("%s_clustered_bboxes.png" % (run_tag,), dpi=1500)
    # NOTE(review): the figure is not closed inside the loop, so each
    # iteration leaves one more figure open -- consider closing it after
    # saving if memory becomes a problem.
    
    # Drop the per-iteration objects before the next fit. `nowtime` and `num`
    # are not deleted and keep their last values.
    del model, bic, probes, genes, coords, fit_data, run_tag