In [None]:
%matplotlib inline

import sys
sys.path.append('..')

# For data loading
from scipy.io import loadmat
# For co-clustering
from coclust.CoclustMod import CoclustMod
from coclust.CoclustSpecMod import CoclustSpecMod
from coclust.CoclustInfo import CoclustInfo

# For evaluations
from coclust.evaluation.io import(load_doc_term_data)
from coclust.evaluation.partitionEvaluation import(best_modularity_partition)
from coclust.evaluation.ioNotebook import(input_with_default_int, input_with_default_str)
from coclust.utils.evaluation import (print_NMI_and_ARI,
                                      plot_convergence,
                                      accuracy,
                                      plot_delta_kl)
# For visualization
from coclust.visualization.coClusteringMeasures import(plot_max_modularities, 
                                                       plot_intermediate_modularities)
from coclust.visualization.termClusters import(plot_cluster_top_terms, 
                                               get_term_graph, 
                                               plot_cluster_sizes)

In [None]:
# ** Get best co-clustering with the CoclustMod approach **
# [Co-clustering by direct maximization of graph modularity]

# Path of the co-occurrence file to analyse.
# Other example inputs: "../datasets/classic3.mat", "../datasets/classic3.csv"
default_file_path = "../datasets/classic3_coclustFormat.mat"
file_path = input_with_default_str('Co-occurence file path', default_file_path)

# Load the co-occurrence data; later cells read its 'doc_term_matrix',
# 'term_labels' and 'doc_labels' entries.
doc_term_data = load_doc_term_data(file_path)

In [None]:
# ** Compute the final coClust modularity on a range of number of clusters **

# -- Range of cluster numbers to explore (both bounds are included)
min_cluster_nbr = input_with_default_int('Minimum number of clusters', 2)
max_cluster_nbr = input_with_default_int('Maximum number of clusters', 9)
if min_cluster_nbr > max_cluster_nbr:
    # An inverted range would be empty and leave nothing to evaluate,
    # so fail here with an explicit message instead of further downstream.
    raise ValueError("Minimum number of clusters (%s) must not exceed the maximum (%s)"
                     % (min_cluster_nbr, max_cluster_nbr))
range_n_clusters = range(min_cluster_nbr, max_cluster_nbr + 1)

# -- Number of random initialization per cluster number
n_rand_init = input_with_default_int('Number of random initialization per cluster number', 1)

# -- Best CoclustMod model and the modularities plotted in the next cell.
# NOTE(review): presumably one max modularity per entry of range_n_clusters -
# confirm against best_modularity_partition.
best_coclustMod_model, all_max_modularities = best_modularity_partition(
    doc_term_data['doc_term_matrix'], range_n_clusters, n_rand_init)

In [None]:
# ** Plot all final modularities **
# Inputs come from the previous cell: the modularities returned by
# best_modularity_partition and the range of cluster numbers that was explored.
plot_max_modularities(all_max_modularities, range_n_clusters)

In [None]:
# ** Plot intermediate modularities for the best CoClust final modularity **
# best_coclustMod_model is the model returned by best_modularity_partition above.
plot_intermediate_modularities(best_coclustMod_model)

In [None]:
# ** Plot top terms for each cluster and the cluster sizes **
# (results of the best CoclustMod model)
n_terms = input_with_default_int('Number of top terms', 10)

# --> top terms
plot_cluster_top_terms(
    doc_term_data['doc_term_matrix'],
    doc_term_data['term_labels'],
    n_terms,
    best_coclustMod_model)

# --> cluster sizes
plot_cluster_sizes(best_coclustMod_model)

In [None]:
# ** Plot the term graph for a given cluster of terms **
# The three prompts select the term cluster to draw and how large the graph is.
n_cluster = input_with_default_int('Number of cluster', 1)
n_top_terms = input_with_default_int('Number of top terms', 25)
n_neighbors = input_with_default_int('Number of neighbors', 10)

# Build the graph for the best CoclustMod model; a later cell injects it into
# the browser as window.graph for the d3 rendering.
graph = get_term_graph(
    X=doc_term_data['doc_term_matrix'],
    model=best_coclustMod_model,
    terms=doc_term_data['term_labels'],
    n_cluster=n_cluster,
    n_top_terms=n_top_terms,
    n_neighbors=n_neighbors)

In [None]:
%%javascript
require.config({
  paths: {
      d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min'
  }
});

In [None]:
from IPython.display import Javascript

# Client-side script that publishes the term graph as the global `window.graph`,
# which the d3 cell below reads.
# NOTE(review): `graph` is pasted in via str.format, so its text form must be a
# valid JavaScript expression - confirm what get_term_graph returns.
graph_script_template = """
           window.graph={};
           """
Javascript(graph_script_template.format(graph))

In [None]:
%%javascript
require(['d3'], function(d3){
  //a weird idempotency thing
  $("#chart1").remove();
  //create canvas
  element.append("<div id='chart1'></div>");
  $("#chart1").width("1160px");
  $("#chart1").height("800px");        
  var margin = {top: 20, right: 20, bottom: 30, left: 40};
  var width = 1280 - margin.left - margin.right;
  var height = 800 - margin.top - margin.bottom;
  var svg = d3.select("#chart1").append("svg")
    .style("position", "relative")
    .style("max-width", "960px")
    .attr("width", width + "px")
    .attr("height", (height + 50) + "px")
    .call(d3.behavior.zoom().on("zoom", redraw))
    .append("g")
    .attr("transform", "translate(" + margin.left + "," + margin.top + ")");
    
    
  function redraw() {
      svg.attr("transform",
               "translate(" + d3.event.translate + ")"
               + " scale(" + d3.event.scale + ")");
  }   

  var color = d3.scale.category20();

  var force = d3.layout.force()
    .charge(-500)
    //.linkDistance(5)
    .linkDistance(function(d) { return (1 - d.value); })
    .size([width, height]);

  var graph = window.graph;
    
  force
      .nodes(graph.nodes)
      .links(graph.links)
      .start();

  var link = svg.selectAll(".link")
      .data(graph.links)
      .enter().append("line")
      .attr("class", "link")
      .style("stroke", "#999;")
      .style("stroke-opacity", .6)
      .style("stroke-width", function(d) { return Math.sqrt(d.value); })
  
      .style("stroke", "blue");

  var node = svg.selectAll(".node")
      .data(graph.nodes)
      .enter().append("g")
      .attr("class", "node")
      .call(force.drag);
    
  node.append("circle")
      .attr("class", "node_circle")
      .attr("r", 8)
      .style("fill", function(d) { return color(d.group); });

  node.append("text")
      .attr("class", "node_text")
      .attr("dx", 12)
      .attr("dy", ".35em")
      .text(function(d) { return d.name });

  node.append("title")
      .text(function(d) { return d.name; });

  var node_text = svg.selectAll(".node_text");
  var node_circle = svg.selectAll(".node_circle");
    
  force.on("tick", function() {
    link.attr("x1", function(d) { return d.source.x; })
        .attr("y1", function(d) { return d.source.y; })
        .attr("x2", function(d) { return d.target.x; })
        .attr("y2", function(d) { return d.target.y; });

    node_circle.attr("cx", function(d) { return d.x; })
        .attr("cy", function(d) { return d.y; });
      
    node_text.attr("x", function(d) { return d.x; })
        .attr("y", function(d) { return d.y; });
  });
});

In [None]:
# ** Compute spectral co-clustering with the CoclustSpecMod approach **

# Reuse the number of clusters of the best CoclustMod model.
n_clusters = best_coclustMod_model.n_clusters

# Fit the spectral model on the same document-term matrix (fixed random_state).
coclust_specMod_model = CoclustSpecMod(
    n_clusters=n_clusters,
    random_state=0)
coclust_specMod_model.fit(doc_term_data['doc_term_matrix'])

In [None]:
# ** Plot top terms for each cluster and the cluster sizes **
# (results of the CoclustSpecMod model)
n_terms = input_with_default_int('Number of top terms', 10)

# --> top terms
plot_cluster_top_terms(
    doc_term_data['doc_term_matrix'],
    doc_term_data['term_labels'],
    n_terms,
    coclust_specMod_model)

# --> cluster sizes
plot_cluster_sizes(coclust_specMod_model)

In [None]:
# ** Compute co-clustering with the CoclustInfo approach **

# Same number of row and column clusters as the best CoclustMod model.
n_clusters = best_coclustMod_model.n_clusters
n_rand_init = input_with_default_int('Number of random initialization', 1)

# Fit the model on the document-term matrix (fixed random_state).
coclust_info_model = CoclustInfo(
    n_row_clusters=n_clusters,
    n_col_clusters=n_clusters,
    n_init=n_rand_init,
    random_state=0)
coclust_info_model.fit(doc_term_data['doc_term_matrix'])

In [None]:
## ** Plot top terms for each cluster and the cluster sizes **

#n_terms = input_with_default_int('Number of top terms', 10)
#
## --> top terms
#plot_cluster_top_terms(doc_term_data['doc_term_matrix'], doc_term_data['term_labels'], n_terms, coclust_info_model)
## --> cluster sizes
#plot_cluster_sizes(coclust_info_model)

In [None]:
# ** Compare clustering quality measure for coClustMod, coClustSpecMod and info **

true_row_labels = doc_term_data['doc_labels']
# Every model is scored with the cluster count of the best CoclustMod model.
n_clusters = best_coclustMod_model.n_clusters

# One entry per fitted model: (report title, model, print its criterion?).
# Only the CoclustInfo criterion is printed; the other two criterion prints
# were already disabled in the per-model sections this loop replaces.
models_to_evaluate = [
    ("(Best) Modularity co-clustering", best_coclustMod_model, False),
    ("Spectral Modularity co-clustering", coclust_specMod_model, False),
    ("Info co-clustering", coclust_info_model, True),
]

# Print the same report for each model: header, optional criterion,
# NMI/ARI, then accuracy of the predicted row labels against the true ones.
for model_title, model, show_criterion in models_to_evaluate:
    print(10*'=')
    print(model_title)
    print(5*'-')
    if show_criterion:
        print("CRITERION: %s" % model.criterion)
    predicted_row_labels = model.row_labels_
    print_NMI_and_ARI(true_row_labels, predicted_row_labels)
    print_accuracy = accuracy(doc_term_data['doc_term_matrix'], n_clusters, true_row_labels, predicted_row_labels)
    print("ACCURACY: %s" % print_accuracy)