<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 35px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
  Nuclear Incidents
  </div> 

  
<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 25px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Data Preparation
  </div> 


  <div style=" float:left; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  Jean-baptiste AUJOGUE
  </div> 
  
  <div style=" float:right; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  Jan 2023
  </div> 

<a id="TOC"></a>

***
<div style="font-weight: normal; 
      font-size: 25px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Table of Contents
  </div> 

1. [Corpus Import](#corpus)
2. [Metadata Extraction](#metadata)
3. [Text Segmentation](#segmentation)



[Bottom](#bottom)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import warnings
import unicodedata
import re
import copy
from unidecode import unidecode
from ast import literal_eval # transforms back stings into list of words

# for data 
import numpy as np
import pandas as pd

# for nlp
from nltk.tokenize.punkt import PunktSentenceTokenizer
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

# for visualization
from IPython.core.display import display, HTML
from tqdm import tnrange


# Silence every warning category from this point on. The filter is installed
# after the imports above have already run, so warnings raised while importing
# are still shown.
warnings.filterwarnings("ignore")
# Record the interpreter version the notebook is executed with.
print('python version :', sys.version)

python version : 3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]


  from IPython.core.display import display, HTML


In [3]:
# Project folders are resolved from the parent of the current working
# directory (presumably the notebook is launched from a sub-folder of the repo).
path_to_repo = os.path.split(os.getcwd())[0]
path_to_data, path_to_save = (
    os.path.join(path_to_repo, folder) for folder in ('data', 'saves')
)

In [None]:
# Put the repository's `src` folder first on the module search path
# (presumably where the local `tmtools` package imported next lives).
src_dir = os.path.join(path_to_repo, 'src')
sys.path.insert(0, src_dir)

In [None]:
from tmtools.tfidf import compute_gensim_tfidf_similarity_matrix
from tmtools.span import get_maximal_spans

In [20]:
# load relevant nlp models:

# English transformer-based pipeline (about 450 MB). Its entities are used
# further down to pick out DATE and GPE mentions.
# NOTE(review): the report text shown in this notebook is French while this is
# an English model — presumably chosen for its DATE / GPE labels; confirm.
nlp_en = spacy.load('en_core_web_trf')

# Merge multi-word entities into single tokens.
# add_pipe is the last expression of the cell, so its return value is displayed.
nlp_en.add_pipe("merge_entities")

<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [34]:
# French transformer-based pipeline (about 400 MB), loaded with the 'ner'
# component disabled. It parses each sentence in the span extraction step.
nlp = spacy.load('fr_dep_news_trf', disable = ['ner'])

<a id="corpus"></a>

# 1. Corpus Import

In [5]:
# Read the raw incident reports and keep only the free-text column.
incidents_file = os.path.join(path_to_data, 'data_nuclear_incidents.xlsx')
df_incidents = pd.read_excel(incidents_file).loc[:, ['text']]
df_incidents.shape

(1527, 1)

In [6]:
df_incidents.head(2)

Unnamed: 0,text
0,Non-respect d’une règle de maîtrise de la crit...
1,Non-respect des spécifications techniques d’ex...


<a id="metadata"></a>

# 2. Metadata Extraction

[Table of Contents](#TOC)



### Find titles

In [8]:
# The first line of each report is its title; everything after the first line
# break stays as the body text. Both pieces are cut from the untouched column
# before it is overwritten.
raw_reports = df_incidents['text']
df_incidents['Doc_id'] = df_incidents.index.to_list()
df_incidents['title']  = raw_reports.map(lambda report: report.partition('\n')[0])
df_incidents['text']   = raw_reports.map(lambda report: report.partition('\n')[2])

df_incidents = df_incidents.loc[:, ['Doc_id', 'text', 'title']]

In [9]:
df_incidents.head(2)

Unnamed: 0,Doc_id,text,title
0,0,La société Framatome a déclaré le 20 avril 202...,Non-respect d’une règle de maîtrise de la crit...
1,1,"Le 7 avril 2021, l’exploitant de la centrale n...",Non-respect des spécifications techniques d’ex...


### Find dates and locations

In [10]:
# an example: run the English pipeline on the report labelled 1 and highlight
# the entities it finds
text = df_incidents.loc[1, 'text']
doc = nlp_en(text)

spacy.displacy.render(doc, style = "ent", jupyter = True)

In [None]:
# Run the English pipeline over every report (title and body joined by a line
# break) and keep the entities found in each one, in row order.
titles = df_incidents['title'].tolist()
texts  = df_incidents['text'].tolist()

common_entities = []
for i in tnrange(len(texts)):
    title, text = titles[i], texts[i]
    ents = nlp_en('\n'.join((title, text))).ents
    common_entities.append(ents)

In [None]:
# For each report keep the text of its first DATE entity (None when it has
# none), replacing the run-together '2021à' by '2021'.
first_dates = [
    next((ent.text for ent in doc_ents if ent.label_ == 'DATE'), None)
    for doc_ents in common_entities
]
dates = [
    mention.replace('2021à', '2021') if mention is not None else None
    for mention in first_dates
]

In [None]:
# `dates` was built from the reports in row order, one entry per report, so
# assigning the list lines the values up with the rows by position.
df_incidents['date'] = dates

In [None]:
# Entities labelled GPE, kept as entity objects and grouped per report.
locations = []
for doc_ents in common_entities:
    locations.append([ent for ent in doc_ents if ent.label_ == 'GPE'])

In [None]:
# Build the vocabulary of place names used to tag each report.
# Start from the text of every GPE entity found above...
unique_locations = {loc.text for locs in locations for loc in locs}
# ...add one name by hand (presumably a site the recogniser missed)...
unique_locations.add('Saint-Laurent-des-Eaux')
# ...and drop the entries listed for exclusion. The result has to be stored:
# a bare `a - b` expression builds a new set and throws it away, which left
# these three entries in the vocabulary.
unique_locations -= {'Areva NC', 'Aucun', 'Saint'}
unique_locations

In [None]:
# Tag each report with the candidate place name that occurs most often in its
# body text (plain substring counts), or 'Inconnu' when none occurs.
#
# Candidates are walked in a fixed order, longest name first and then
# alphabetical. Iterating the set directly gives an order that can change from
# one interpreter session to the next, so ties used to resolve differently
# between runs. With this order the longest name wins a tie.
location_candidates = sorted(unique_locations, key = lambda name: (-len(name), name))


def _most_mentioned_location(body):
    """Return the candidate occurring most often in `body`, or 'Inconnu' if none does."""
    # max() keeps the first candidate that reaches the highest count;
    # `default` covers an empty candidate list.
    top_name = max(location_candidates, key = body.count, default = None)
    if top_name is None or body.count(top_name) == 0:
        return 'Inconnu'
    return top_name


infered_locations = [_most_mentioned_location(body) for body in df_incidents.text]

In [None]:
# Number of reports for which no candidate location was found
infered_locations.count('Inconnu')

In [None]:
# Preview only: the full list has one entry per report and would flood the
# cell output.
infered_locations[:10]

In [None]:
# `infered_locations` was computed by iterating over df_incidents.text, so it
# holds one entry per row in row order and is assigned by position.
df_incidents['location'] = infered_locations

In [None]:
# Final column order: identifier and metadata first, body text last.
column_order = ['Doc_id', 'title', 'date', 'location', 'text']
df_incidents = df_incidents.loc[:, column_order]

### Export result

In [8]:
# Write the table (one row per report) to 'source_texts.xlsx' in the data
# folder, without the index column.
df_incidents.to_excel(os.path.join(path_to_data, 'source_texts.xlsx'), index = False)

<a id="segmentation"></a>

# 3. Text Segmentation

[Table of Contents](#TOC)

In [9]:
# Reload the table exported at the end of the metadata extraction section
# (presumably so this section can run without repeating the entity extraction).
df_incidents = pd.read_excel(os.path.join(path_to_data, 'source_texts.xlsx'))

In [10]:
df_incidents.head(3)

Unnamed: 0,Doc_id,title,date,location,text
0,0,Non-respect d’une règle de maîtrise de la crit...,le 20 avril 2021,Romans-sur-Isère,La société Framatome a déclaré le 20 avril 202...
1,1,Non-respect des spécifications techniques d’ex...,Le 7 avril 2021,Belleville-sur-Loire,"Le 7 avril 2021, l’exploitant de la centrale n..."
2,2,Non-respect d’une consigne de maîtrise de la c...,le 13 avril 2021,Romans-sur-Isère,La société Framatome a déclaré le 13 avril 202...


## 3.1 Segmentation into paragraphs

In [11]:
def _split_paragraphs(text):
    """Split a report body into its non-empty paragraphs.

    Paragraphs are the pieces found between line breaks. Pieces that are empty
    or hold only whitespace are dropped, so a body that starts or ends with a
    line break no longer produces blank paragraph rows. A value that is not a
    string (for instance the NaN read back from an empty Excel cell) yields no
    paragraph instead of raising.
    """
    if not isinstance(text, str):
        return []
    return [piece for piece in text.split('\n') if piece.strip()]


# One tuple per paragraph; Para_id numbers the kept paragraphs of a report from 0.
paragraphs = [
    (row.Doc_id, para_id, row.title, row.date, row.location, para)
    for row in df_incidents.itertuples(index = False)
    for para_id, para in enumerate(_split_paragraphs(row.text))
]

In [12]:
# Turn the paragraph tuples into a table, one row per paragraph.
paragraph_columns = ['Doc_id', 'Para_id', 'title', 'date', 'location', 'text']
df_paragraphs = pd.DataFrame(data = paragraphs, columns = paragraph_columns)
df_paragraphs.shape

(10210, 6)

In [13]:
# Write the paragraph table to 'source_paragraphs.xlsx' in the data folder,
# without the index column.
df_paragraphs.to_excel(os.path.join(path_to_data, 'source_paragraphs.xlsx'), index = False)

## 3.2 Segmentation into sentences

In [14]:
# Punkt sentence tokenizer created without arguments, i.e. with no training
# text supplied here.
# NOTE(review): with default parameters it presumably knows no French
# abbreviations — confirm sentence boundaries are acceptable on this corpus.
tokenizer = PunktSentenceTokenizer()

In [15]:
# One tuple per sentence: each paragraph is cut by the tokenizer, and every
# sentence carries its paragraph's identifiers and metadata plus its own rank.
sentences = [
    (row.Doc_id, row.Para_id, sent_id, row.title, row.date, row.location, sent)
    for row in df_paragraphs.itertuples(index = False)
    for sent_id, sent in enumerate(tokenizer.tokenize(row.text))
]

In [16]:
# Turn the sentence tuples into a table, one row per sentence.
sentence_columns = ['Doc_id', 'Para_id', 'Sent_id', 'title', 'date', 'location', 'text']
df_sentences = pd.DataFrame(data = sentences, columns = sentence_columns)
df_sentences.shape

(18861, 7)

In [17]:
# Write the sentence table to 'source_sentences.xlsx' in the data folder,
# without the index column.
df_sentences.to_excel(os.path.join(path_to_data, 'source_sentences.xlsx'), index = False)

## 3.3 Span extraction

[Table of Contents](#TOC)

In [31]:
def get_span_data(s):
    """Collect the fields of a span that are stored in the spans table.

    `s` is presumably a spaCy Span; only the attributes read below are needed.

    Returns a 5-tuple: (text, lemma_, root.lemma_, start, end).
    """
    surface, lemma = s.text, s.lemma_
    head_lemma = s.root.lemma_
    bounds = (s.start, s.end)
    return (surface, lemma, head_lemma) + bounds

In [35]:
# One list per span: each sentence is parsed with the French pipeline, its
# maximal spans are extracted, and every span carries the sentence's
# identifiers and metadata, its rank in the sentence, and its own fields.
spans = [
    [row.Doc_id, row.Para_id, row.Sent_id, span_id, row.title, row.date, row.location, *get_span_data(span)]
    for row in df_sentences.itertuples(index = False)
    for span_id, span in enumerate(get_maximal_spans(nlp(row.text)))
]

In [36]:
# Turn the span records into a table, one row per span.
span_columns = [
    'Doc_id', 'Para_id', 'Sent_id', 'Span_id',
    'title', 'date', 'location',
    'text', 'lemma', 'root', 'start', 'end',
]
df_spans = pd.DataFrame(data = spans, columns = span_columns)
df_spans.shape

(59864, 12)

In [37]:
# Write the span table to 'source_spans.xlsx' in the data folder, without the
# index column.
df_spans.to_excel(os.path.join(path_to_data, 'source_spans.xlsx'), index = False)

<a id="bottom"></a>

[Table of Contents](#TOC)