<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 35px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
  Nuclear Incidents
  </div> 

  
<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 25px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Topic modeling - Span-level
  </div> 


  <div style=" float:left; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  Jean-baptiste AUJOGUE
  </div> 
  
  <div style=" float:right; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  Jan 2023
  </div> 

<a id="TOC"></a>

In [None]:
# Enable the autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import warnings
import os
import copy

# data 
import numpy as np
import pandas as pd

# nlp
import spacy
from spacy.tokens import Span

# viz
import matplotlib.pyplot as plt
from spacy import displacy
# `display` and `HTML` belong to the public `IPython.display` API; importing
# them from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML

# Silence every warning raised from here on to keep cell outputs readable.
warnings.filterwarnings("ignore")
print('python version :', sys.version)

**Path to data directory**

In [None]:
# The repository root is taken to be the parent of the current working
# directory; the processed data sits under <repo>/data/processed.
path_to_repo = os.path.split(os.getcwd())[0]
path_to_data = os.path.join(
    path_to_repo,
    'data',
    'processed',
)

In [None]:
# Display the resolved repository root as a sanity check.
path_to_repo

In [None]:
# Put <repo>/src at the front of the import search path so the project-local
# packages can be imported.
path_to_src = os.path.join(path_to_repo, 'src')
# Drop any earlier copy first: re-running this cell then leaves exactly one
# entry, still in first position, instead of stacking duplicates.
sys.path[:] = [entry for entry in sys.path if entry != path_to_src]
sys.path.insert(0, path_to_src)

In [None]:
from tmtools.tfidf import compute_stripped_ngrams_fr, compute_sklearn_tfidf_matrix
from tmtools.topic import compute_topic_modeling, plot_topic_words, get_spans_html

<a id="classification"></a>

# 1. TF-IDF feature matrix

[Table of Contents](#TOC)

In [None]:
# Read the span table from the processed-data folder.
spans_file = os.path.join(path_to_data, 'source_spans.xlsx')
df_corpus = pd.read_excel(spans_file)

In [None]:
# Preview the first five rows of the span table.
df_corpus.head(5)

In [None]:
# Extract the `text` column as a plain Python list and show how many entries it holds.
corpus = df_corpus['text'].tolist()
len(corpus)

#### Compute stripped ngrams

In [None]:
# Keyword settings forwarded to the project helper that builds `vocab` from the corpus.
# NOTE(review): min_df is 1 here, whereas the TF-IDF matrix cell further down
# uses min_df = 3 — confirm the difference is intentional.
ngram_params = dict(
    sublinear_tf = True,
    use_idf = True,
    ngram_range = (1, 3),
    min_df = 1,
    strip_accents = None,
    lowercase = True,
)
vocab = compute_stripped_ngrams_fr(corpus, **ngram_params)

In [None]:
# Size of the object returned by compute_stripped_ngrams_fr.
len(vocab)

In [None]:
# Keyword settings forwarded to the project TF-IDF helper, which receives the
# corpus together with the previously computed `vocab`.
tfidf_params = dict(
    sublinear_tf = True,
    use_idf = True,
    ngram_range = (1, 3),
    min_df = 3,
    strip_accents = None,
    lowercase = True,
)
tfidf_matrix, tfidf_ngrams = compute_sklearn_tfidf_matrix(corpus, vocab, **tfidf_params)
# Show the dimensions of the resulting matrix.
tfidf_matrix.shape

# 2. Topic modeling

[Table of Contents](#TOC)

In [None]:
# Number of topics: passed as `n_components` to compute_topic_modeling, used to
# slice the text/topic table, and used to size the topic -> color mapping.
n_topics = 250

#### Topic modeling using LSA

In [None]:
# Run the project topic-modeling helper with the LSA method on the TF-IDF matrix.
# It returns four objects, unpacked here one per line.
(
    df_text_topic_LSA,
    df_topic_feature_LSA,
    df_topic_importance_LSA,
    df_feature_importance_LSA,
) = compute_topic_modeling(
    tfidf_matrix,
    tfidf_ngrams,
    method = 'LSA',
    n_components = n_topics,
    topic_name = 'Span topic',
)

In [None]:
# For every row, keep the label of the column holding the largest value among
# the first `n_topics` columns.
topic_LSA = (
    df_text_topic_LSA
    .iloc[:, :n_topics]
    .idxmax(axis = 'columns')
)

In [None]:
# Store the dominant topic label next to each span.
# NOTE(review): pandas aligns this assignment on the index — assumes
# `topic_LSA` shares the index of `df_corpus`; TODO confirm.
df_corpus['topic_LSA'] = topic_LSA

In [None]:
# List the texts of all spans whose dominant topic is 'Span topic 5'.
df_corpus.loc[df_corpus['topic_LSA'] == 'Span topic 5', 'text'].tolist()

#### Interpretation of LSA topics

In [None]:
# LSA: plot the words associated with the topics using the project helper.
# NOTE(review): n_topics = 100 is passed here while the model was fitted with
# n_topics = 250 — presumably only a subset of topics is plotted; TODO confirm.
plot_topic_words(df_topic_feature_LSA, n_topics = 100, n_top_words = 15)

In [None]:
# Plot the topic importance table returned by compute_topic_modeling.
df_topic_importance_LSA.plot(figsize = (15, 5))
# Render the figure explicitly, as the other plotting cell does, so the cell
# does not end with the Axes repr as its text output.
plt.show()

In [None]:
# Horizontal bar chart of the number of spans per dominant topic; the y axis
# is inverted so that the most frequent topic appears at the top.
ax = topic_LSA.value_counts().plot.barh(figsize = (10, 15))
ax.invert_yaxis()
plt.show()

#### Export result

In [None]:
# Preview the first three rows of the table about to be exported.
df_corpus.head(3)

In [None]:
# Write the span table to the processed-data folder, without the index column.
topics_file = os.path.join(path_to_data, 'source_spans_topics.xlsx')
df_corpus.to_excel(topics_file, index = False)

# 3. NER-like visualization of spans

[Table of Contents](#TOC)

In [None]:
# 400 MB transformer-based model.
# The 'ner' pipeline component is excluded at load time.
nlp = spacy.load('fr_dep_news_trf', exclude = ['ner'])

In [None]:
# Read the sentence table and the span/topic table from the processed-data folder.
df_sents, df_spans = (
    pd.read_excel(os.path.join(path_to_data, file_name))
    for file_name in ('source_sentences.xlsx', 'source_spans_topics.xlsx')
)

In [None]:
# Map every label 'Span topic 1' ... 'Span topic <n_topics>' to the same color.
# NOTE(review): assumes the topic labels are numbered from 1 — TODO confirm
# against compute_topic_modeling.
topic2color = dict.fromkeys(
    map('Span topic {}'.format, range(1, n_topics + 1)),
    '#84bee8',
)

In [None]:
# Keep only the sentences and spans whose `Doc_id` matches the chosen document.
doc_id = 2
df_sents_doc = df_sents.loc[df_sents['Doc_id'] == doc_id]
df_spans_doc = df_spans.loc[df_spans['Doc_id'] == doc_id]

In [None]:
# Build the markup for the selected document with the project helper, then
# render it as the cell's rich output.
html = get_spans_html(df_sents_doc, df_spans_doc, nlp, topic2color)
HTML(html)

<a id="bottom"></a>

[Table of Contents](#TOC)