<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 35px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
  Nuclear Incidents
  </div> 

  
<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 25px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Similarity - Text-level
  </div> 


  <div style=" float:left; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  Jean-baptiste AUJOGUE
  </div> 
  
  <div style=" float:right; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  Jan 2023
  </div> 

<a id="TOC"></a>



[Bottom](#bottom)



In [None]:
# Reload imported modules automatically before running each cell, so that edits
# to the project's own modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os
import warnings

# data
import numpy as np
import pandas as pd
import scipy

# text
import nltk
# Downloads the 'punkt' resource when this cell runs; per the inline note it is only needed once per machine.
# NOTE(review): presumably required by nltk.word_tokenize, which is passed as tokenizer in section 1.1 - confirm.
nltk.download('punkt') # run once

# graph
import networkx as nx
from d3graph import d3graph
from pyvis import network as nt

# Silences every warning for the rest of the session (including deprecation notices from the libraries above).
warnings.filterwarnings("ignore")
# Record the interpreter version in the notebook output.
print('python version :', sys.version)

**Path to data directory**

In [None]:
# The repository root is the parent of the current working directory,
# so the notebook is expected to be launched from a direct sub-folder of the repo.
path_to_repo = os.path.split(os.getcwd())[0]

# Input data and generated plots both live under <repo>/data.
path_to_data, path_to_plot = (
    os.path.join(path_to_repo, 'data', leaf) for leaf in ('processed', 'plots')
)

In [None]:
# Make the project's local packages (under <repo>/src) importable.
# The membership test keeps this cell idempotent: re-running it does not
# stack duplicate entries at the front of sys.path.
path_to_src = os.path.join(path_to_repo, 'src')
if path_to_src not in sys.path:
    sys.path.insert(0, path_to_src)

In [None]:
from tmtools.tfidf import compute_gensim_tfidf_similarity_matrix
from tmtools.similarity import (
    build_nt_graph,
    filter_similarity_matrix,
    get_most_similar_indices,
    get_similarity_heatmap,
)

# 1. Compute similarity matrix

[Table of Contents](#TOC)

In [None]:
# Read the source texts from the processed-data folder into a DataFrame.
corpus_file = os.path.join(path_to_data, 'source_texts.xlsx')
df_corpus = pd.read_excel(corpus_file)

In [None]:
# Quick look at the first three rows to check the columns that were loaded.
df_corpus.head(3)

In [None]:
# Texts of the corpus as a plain Python list, in the same order as the rows of df_corpus.
corpus = df_corpus['text'].tolist()
# Number of texts in the corpus.
len(corpus)

## 1.1 Tf-idf similarity matrix

[Table of Contents](#TOC)

In [None]:
# The tf-idf similarity matrix is computed once and cached on disk.
# Delete the .npz file to force a recomputation (e.g. after the corpus changed).
sim_matrix_path = os.path.join(path_to_plot, 'd3graph', "sim_matrix_tfidf_texts.npz")

if os.path.exists(sim_matrix_path):
    # A previous run already produced the matrix: reuse it instead of recomputing.
    sim_matrix = scipy.sparse.load_npz(sim_matrix_path)
else:
    sim_matrix = compute_gensim_tfidf_similarity_matrix(corpus, tokenizer = nltk.word_tokenize, threshold = 0.25)

    # save_npz does not create missing folders, so make sure the target folder exists first.
    os.makedirs(os.path.dirname(sim_matrix_path), exist_ok = True)
    scipy.sparse.save_npz(sim_matrix_path, sim_matrix)

# 2. Global representation of similarities

[Table of Contents](#TOC)

Using d3graph is the recommended way to go

In [None]:
# Reload the matrix written by section 1.1, so this section can run without recomputing it.
sim_matrix_file = os.path.join(path_to_plot, 'd3graph', "sim_matrix_tfidf_texts.npz")
sim_matrix = scipy.sparse.load_npz(sim_matrix_file)

### Using d3graph



In [None]:
# d3graph is fed a dense adjacency matrix whose similarity scores are multiplied by 100.
# NOTE(review): the scaling presumably puts edge weights on a 0-100 scale for the d3graph slider - confirm.
scaled_adjacency = (sim_matrix * 100).toarray()

d3_graph = d3graph()
d3_graph.graph(scaled_adjacency)

In [None]:
# Write the interactive graph to an HTML file in the plots folder and display it.
d3_html_path = os.path.join(path_to_plot, 'd3graph', 'sim_matrix_tfidf_texts.html')
d3_graph.show(figsize = (10000, 10000), filepath = d3_html_path, showfig = True)









[d3graph] INFO> Slider range is set to [24, 100]
[d3graph] INFO> Write to path: [C:\Users\jbaujogue\Desktop\Tessella\Internal - Nuclear Incidents v2\docs\d3graph\sim_matrix_tfidf_texts.html]
[d3graph] INFO> File already exists and will be overwritten: [C:\Users\jbaujogue\Desktop\Tessella\Internal - Nuclear Incidents v2\docs\d3graph\sim_matrix_tfidf_texts.html]


<networkx.classes.digraph.DiGraph at 0x1a790e896a0>

### Using pyvis

Using pyvis is not recommended

In [None]:
# nt_graph = build_nt_graph(sim_matrix)

In [None]:
# nt_graph.show(os.path.join(path_to_plot, 'ntgraph', 'text_tfidf_similarity.html'))

# 3. Local exploration of similarities

[Table of Contents](#TOC)

In [None]:
# Index of the text explored in the rest of section 3.
# It is used to index `corpus` directly and is shown as idx + 1 in the printed / plotted labels.
# NOTE(review): presumably a 0-based row position in df_corpus - confirm against the tmtools helpers.
idx = 42

## 3.1 Get most similar

[Table of Contents](#TOC)

In [None]:
# Retrieve the entries most similar to text `idx` (n = 2), as (index, similarity) pairs.
topk_sims = get_most_similar_indices(sim_matrix, n = 2, idx = idx)

# Print each neighbour with its index shifted by one, its score and its full text.
for neighbour_idx, score in topk_sims:
    neighbour_text = corpus[neighbour_idx]
    print(neighbour_idx + 1, score, neighbour_text)

## 3.2 Get local similarity heatmap

[Table of Contents](#TOC)

In [None]:
# Extract a sub-matrix of the global similarity matrix around text `idx`,
# together with the ids labelling its two axes (x_ids, y_ids).
# NOTE(review): presumably keeps the texts sharing the 'location' value of text `idx` - confirm in tmtools.similarity.
small_sim_matrix, x_ids, y_ids = filter_similarity_matrix(
    df_corpus, 
    sim_matrix, 
    key_column = 'location', 
    idx = idx,
)
# Size of the extracted block.
small_sim_matrix.shape

In [None]:
# Heatmap of the local similarity block, with text `idx` passed as `incident_idx`.
# Axis labels and incident_idx are all shifted by one so they show the same numbering.
dense_block = small_sim_matrix.toarray()
hmap = get_similarity_heatmap(
    dense_block,
    x_labels = [x_id + 1 for x_id in x_ids],
    y_labels = [y_id + 1 for y_id in y_ids],
    incident_idx = idx + 1,
)

In [None]:
# Same heatmap as the previous cell, but without passing `incident_idx`.
# Axis labels are shifted by one, as in the previous cell.
dense_block = small_sim_matrix.toarray()
shifted_x_labels = [x_id + 1 for x_id in x_ids]
shifted_y_labels = [y_id + 1 for y_id in y_ids]
hmap = get_similarity_heatmap(dense_block, x_labels = shifted_x_labels, y_labels = shifted_y_labels)

## 3.3 Get local subgraph

[Table of Contents](#TOC)

In [None]:
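# NOTE: nx.from_scipy_sparse_matrix was removed in networkx 3.0; use nx.from_scipy_sparse_array on newer versions.
# nx_graph = nx.from_scipy_sparse_matrix(sim_matrix, edge_attribute = 'score')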

In [None]:
# nx_subgraph = nx.ego_graph(nx_graph, 158, radius = 2)

In [None]:
# TODO

<a id="bottom"></a>

[Table of Contents](#TOC)