# For each letter, extract the Nominal groups and put them in the database

## Getting the letters:

Run this query on the Henri Poincare Correspondence SPARQL endpoint [http://tomcat.henripoincare.fr/fuseki/#/dataset/hp_corpus/query](http://tomcat.henripoincare.fr/fuseki/#/dataset/hp_corpus/query), and save the result to a file `00_queryResults_transcriptions.csv` inside a `dataframes` folder next to this notebook

In [1]:
# transcriptions with their language

"""
PREFIX ahpo: <http://e-hp.ahp-numerique.fr/ahpo#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX o:     <http://omeka.org/s/vocabs/o#>
PREFIX o-cnt: <http://www.w3.org/2011/content#>
 SELECT DISTINCT ?letter ?media ?language ?transcription WHERE { 
  #?lettre ahpo:sentBy <http://henripoincare.fr/api/items/843>.
  #?lettre ahpo:writingDate ?date .
  #?lettre ahpo:transcription ?trans.
  ?letter a ahpo:Letter.
  ?letter o:media ?media . 
  ?media o-cnt:chars ?transcription.
  OPTIONAL{ ?letter ahpo:language ?language}

} ORDERBY ?language

""";

## Importing the tools

In [2]:
from AHPNLP.tokenization.tokenization import Tokenizer

# I will use the french tokenizer for all the letter, because that's what the 
# recommendation system uses  at the end of the day
nominal_groups_french_tokenizer = Tokenizer(language = "french", nominal_groups= True)

In [3]:
import pandas as pd
import os
#notebooks/05/create_tokens_triples.ipynb
#notebooks/05/dataframes/00_queryResults_transcriptions.csv
df = pd.read_csv(os.path.join("dataframes","00_queryResults_transcriptions.csv"))

In [4]:
df.columns

Index(['letter', 'media', 'language', 'transcription'], dtype='object')

In [5]:
len(df)

1944

In [6]:
df = df.dropna(subset=['transcription'])
len(df)

1942

Clean the transcriptions

In [7]:
from AHPNLP.utilities.utilities import get_clean_transcription
df['clean_transcription'] = df['transcription'].apply(get_clean_transcription)

Compute the nominal groups

In [8]:
%%time
# preprocessing
df['normalized_nominal_groups'] = df['clean_transcription'].apply(
    lambda st: set(
        nominal_groups_french_tokenizer.tokenize_for_nominal_groups(
           nominal_groups_french_tokenizer.spacy_pipeline(st),
           normalized_form=True
        )

    )
) 

CPU times: user 3min 20s, sys: 698 ms, total: 3min 21s
Wall time: 3min 26s


In [11]:
df['normalized_nominal_groups']

0       {charles brisse, henri poincaré, albert badour...
1       {succès fou, timeo davaos, mlle clémence et de...
2       {frère belleville, école central de art et man...
3       {gouvernement jules dufaure, mahon et son chef...
4       {rinck attribu, unité monétaire, élie rinck, f...
                              ...                        
1939    {välja mellan, icke torde kunna, till kongl, h...
1940    {afhandling med, physique mathématique, hans m...
1941    {newton insåg först att centrifulgalkraften ko...
1942    {sferiska elektronerna, vidsträckt användning,...
1943    {henri poincaré, fredholm att, måtte tilldelas...
Name: normalized_nominal_groups, Length: 1942, dtype: object

In [12]:
nominal_groups_french_tokenizer.tokenize_for_nominal_groups(
           nominal_groups_french_tokenizer.spacy_pipeline("fonctions fuchsiennes"),
           normalized_form=True
        )

['fonction fuchsienne']

### Note:

"fuchsienne" is not recognized by the french tokenizer, thus, we have to manually normalize it by applying these replacements in sequence.

1. "fuchsiennes" $\to$ "fuchsien"
2. "fuchsiens" $\to$ "fuchsien"
3. "fuchsienne" $\to$ "fuchsien"
4. "fuchsien" $\to$ "fuchsienne"

In [11]:
#This is the definition used in the package.
"""
def normalise_fuchsien(st):
    return st.replace(
                "fuchsiennes", "fuchsien"
            ).replace(
                "fuchsiens", "fuchsien"
            ).replace(
                "fuchsienne", "fuchsien"
            ).replace(
                "fuchsien", "fuchsienne"
            ).replace(
                "groupe fuchsienne", "groupe fuchsien" #only case when it is in masculine
            )
"""

## Quick test

In [14]:
fuchsien_nominal_groups = []

for nominal_groups in df['normalized_nominal_groups']:
    for nominal_group in nominal_groups:
        if "fuchs" in nominal_group:
            fuchsien_nominal_groups.append(nominal_group)

print(len(fuchsien_nominal_groups))
fuchsien_nominal_groups

56


['fonction zetafuchsienne',
 'ihre definition der fuchsienne',
 'fonction fuchsienne',
 'gekannten funktionen fallen also nicht unter ihre definition der fuchsienne',
 'herren fuchs',
 'groupe fuchsien',
 'fonction fuchsienne',
 'groupe fuchsien',
 'fonction zétafuchsienne',
 'fonction fuchsienne',
 'fonction fuchsienne',
 'fonction fuchsienne',
 'groupe fuchsien',
 'idée succincte sur le fonction fuchsienne',
 'fonction fuchsienne',
 'fonction fuchsienne',
 'fonction fuchsienne',
 'fonction fuchsienne',
 'fuch et le dénomination de fonction fuchsienne',
 'fonction fuchsienne',
 'fuchsienne noch',
 'werde ich weder von fuchsienne noch kleinéenne gebrauch',
 'fonction fuchsienne',
 'fuchs in',
 'série relatif à fonction zétafuchsienne',
 'fonction fuchsienne',
 'fonction zétafuchsienne',
 'edmond fuchs',
 'fonction fuchsienne',
 'fonction fuchsienne',
 'fonction fuchsienne',
 'fonction fuchsienne',
 'fonction fuchsienne',
 'fonction fuchsienne',
 'fonction thêtafuchsienne',
 'fonction f

## Triple template

Triples look like this:
```
<http://henripoincare.fr/api/items/12531> ahpo:test "test"@en .
```

In [13]:
turtle_file_name = "normalized_nominal_groups_2.ttl"

property_name = "ahpo:hasNominalGroupInContent2"
triple_format = '<{letter_id}> '+property_name+' "{value}" .'
triple_format

'<{letter_id}> ahpo:hasNominalGroupInContent2 "{value}" .'

In [14]:
# preface:
preface = """@prefix :             <http://data.kasabi.com/dataset/italy/schema/> .
@prefix ahpo:         <http://e-hp.ahp-numerique.fr/ahpo#> .
@prefix ahpot:        <http://henripoincare.fr/ahpot> .

"""


In [15]:
triples = []

for _, row in df.iterrows():
    nominal_groups = row['normalized_nominal_groups']
    letter = row['letter']
    for token in nominal_groups:
        if "\\" not in token:  
            # we have to remove a special character that breaks the syntax
            triple = triple_format.format(
                letter_id = letter, 
                value = token
                )
            triples.append(triple)

In [16]:
len(triples)

23098

In [17]:
triples

['<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "albert badoureau" .',
 '<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "euphrasie launois" .',
 '<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "octave barré" .',
 '<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "heure chacun" .',
 '<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "capitaine issu" .',
 '<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "eugène brisse" .',
 '<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "capitaine mahieu" .',
 '<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "grand-mère maternel" .',
 '<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "charles eugène pinat" .',
 '<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "jean baptiste henry millot" .',
 '<http://henri

In [18]:
turtle_file_content = preface + "\n".join(triples)
print(turtle_file_content)

@prefix :             <http://data.kasabi.com/dataset/italy/schema/> .
@prefix ahpo:         <http://e-hp.ahp-numerique.fr/ahpo#> .
@prefix ahpot:        <http://henripoincare.fr/ahpot> .

<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "albert badoureau" .
<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "euphrasie launois" .
<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "octave barré" .
<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "heure chacun" .
<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "capitaine issu" .
<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "eugène brisse" .
<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "capitaine mahieu" .
<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInContent2 "grand-mère maternel" .
<http://henripoincare.fr/api/items/12531> ahpo:hasNominalGroupInCont

In [19]:
with open(turtle_file_name,'w') as file:
    file.write(turtle_file_content)