# Experimenting with Gallica logs

## 1. Importing logs

In [1]:
import os

# for now I manually extracted a single file to test
# Read through a context manager so the handle is closed as soon as the lines are loaded
# (the previous bare open() left the file open for the lifetime of the kernel).
with open('res296.log', 'r', encoding="utf8") as file:
    lines = file.read().splitlines()


## 2. Exploring logs

In [2]:
import pandas as pd
# Convert the raw log lines into a pandas DataFrame: one row per log line,
# held in a single column labelled 0 (see the head() output below).
lines_df = pd.DataFrame(lines)


In [3]:
lines_df.head()

Unnamed: 0,0
0,##320022e99796ca35dab7e63d48fd5e7##null##null#...
1,##e7fdec50f50253f6796d61b5382155f8##null##null...
2,##320022e99796ca35dab7e63d48fd5e7##null##null#...
3,##e7fdec50f50253f6796d61b5382155f8##null##null...
4,##320022e99796ca35dab7e63d48fd5e7##null##null#...


In [4]:
# we need to split each line into relevant metadata according to this example from Nouvellet et al.

![caption](log_example.png)

In [5]:
# First split: following the example, fields are separated by '##', which yields
# ip, country, city, and then date/request/protocol/code/size/referrer together in column 4.
# Every line starts with '##', so column 0 is an empty prefix.
# n=4 caps the split at five columns, so a literal '##' occurring inside the request,
# referrer or user agent stays in column 4 instead of spilling into extra columns.
# NOTE: this cell rebinds lines_df, so it must be run exactly once after the cell that builds it.
lines_df = lines_df[0].str.split('##', n=4, expand=True)

In [6]:
lines_df.head()

Unnamed: 0,0,1,2,3,4
0,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:10:58:15 +0100] ""GET /ark:/12..."
1,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:10:58:41 +0100] ""GET /ark:/12..."
2,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:01:15 +0100] ""GET /ark:/12..."
3,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:11:01:41 +0100] ""GET /ark:/12..."
4,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:04:15 +0100] ""GET /ark:/12..."


### 2.1 Parsing HTTP requests

In [7]:
# Trying python email library
from email.parser import BytesParser

# Pick one sample row (label 1200) and take its raw request part (column 4) as text.
request_text = str(lines_df.loc[1200, 4])


In [8]:
# Split once at the first '-'. In this log format the text starts with '- - [',
# so request_line comes out empty and headers_alone holds the rest (see output below).
request_line, headers_alone = request_text.split('-', 1)

In [9]:
request_line,headers_alone

('',
 ' - [03/Mar/2017:18:04:59 +0100] "GET /ark:/12148/bpt6k9657410k.thumbnail HTTP/1.1" 200 3861 "http://gallica.bnf.fr/services/engine/search/sru?operation=searchRetrieve&version=1.2&query=%28gallica%20all%20%22Le%20changement%20de%20nom%20a%20%C3%A9t%C3%A9%20prescrit%20par%20d%C3%A9cret%20du%2023%20octobre%201894.%22%29&suggest=0" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" "JSESSIONID=AD5536402E77AAE1014B65581D047634; xtvrn=$18798$" 154554')

In [10]:
import re

In [11]:
# Regex capturing the text enclosed in square brackets (the timestamp in these log lines).
# Written as a raw string so `\[` and `\]` reach the regex engine untouched instead of
# being parsed as (invalid) Python string escapes, which triggers a warning.
p = re.compile(r'\[([^]]*)\]')

In [12]:
print(p.findall(request_text))

['03/Mar/2017:18:04:59 +0100']


In [13]:
# Regex capturing the text between a double quote and the next closing square bracket.
# Bound to p2 only: the previous chained assignment (`p2 = p = ...`) also overwrote `p`,
# silently changing what the bracket-capturing cell above prints when re-run.
# NOTE(review): in the sample log line the ']' comes before the first '"', so this pattern
# probably matches nothing there — presumably "between quotes" was intended; confirm.
p2 = re.compile(r'"([^]]*)\]')

In [14]:
request_text.split(']')[1].split("\"")

[' ',
 'GET /ark:/12148/bpt6k9657410k.thumbnail HTTP/1.1',
 ' 200 3861 ',
 'http://gallica.bnf.fr/services/engine/search/sru?operation=searchRetrieve&version=1.2&query=%28gallica%20all%20%22Le%20changement%20de%20nom%20a%20%C3%A9t%C3%A9%20prescrit%20par%20d%C3%A9cret%20du%2023%20octobre%201894.%22%29&suggest=0',
 ' ',
 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
 ' ',
 'JSESSIONID=AD5536402E77AAE1014B65581D047634; xtvrn=$18798$',
 ' 154554']

#### Experimenting with extracting data

In [15]:
# This is just experimental; to be replaced with lazy-eval regex when working on bigger data.

In [16]:
# Same sample row as before: raw request part (column 4) of the row labelled 1200.
request_text = str(lines_df.loc[1200, 4])

In [17]:
request_text.split("\"")

['- - [03/Mar/2017:18:04:59 +0100] ',
 'GET /ark:/12148/bpt6k9657410k.thumbnail HTTP/1.1',
 ' 200 3861 ',
 'http://gallica.bnf.fr/services/engine/search/sru?operation=searchRetrieve&version=1.2&query=%28gallica%20all%20%22Le%20changement%20de%20nom%20a%20%C3%A9t%C3%A9%20prescrit%20par%20d%C3%A9cret%20du%2023%20octobre%201894.%22%29&suggest=0',
 ' ',
 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
 ' ',
 'JSESSIONID=AD5536402E77AAE1014B65581D047634; xtvrn=$18798$',
 ' 154554']

In [18]:
# Timestamp: keep what precedes the first ']' and take the piece after the '['.
date = request_text.partition("]")[0].split("[")[1]
date

'03/Mar/2017:18:04:59 +0100'

In [19]:
# Method and path: first two space-separated tokens of the first quoted segment.
request = ' '.join(request_text.split('"', 2)[1].split(' ', 2)[:2])
request

'GET /ark:/12148/bpt6k9657410k.thumbnail'

In [20]:
# Protocol: third space-separated token of the first quoted segment.
protocole = request_text.split('"', 2)[1].split(' ', 3)[2]
protocole

'HTTP/1.1'

In [21]:
# Status code: the segment after the quoted request starts with a space,
# so the token at index 1 is the code.
code = request_text.split('"', 3)[2].split(' ', 2)[1]
code

'200'

In [22]:
# Response size: token at index 2 of the segment that follows the quoted request.
taille = request_text.split('"', 3)[2].split(' ', 3)[2]
taille

'3861'

In [23]:
# Referrer: content of the second quoted segment.
referant = request_text.split('"', 4)[3]
referant

'http://gallica.bnf.fr/services/engine/search/sru?operation=searchRetrieve&version=1.2&query=%28gallica%20all%20%22Le%20changement%20de%20nom%20a%20%C3%A9t%C3%A9%20prescrit%20par%20d%C3%A9cret%20du%2023%20octobre%201894.%22%29&suggest=0'

In [24]:
# Ark name: 4th '/'-separated piece of the request path when the request mentions 'ark:/',
# otherwise the '-' placeholder.
ark = request.split(' ')[1].split('/')[3] if 'ark:/' in request else '-'
ark

'bpt6k9657410k.thumbnail'

In [25]:
import shutil
import requests
import xmltodict
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET

In [26]:

def OAI(id):
    """Fetch the OAI record of a Gallica document and return it as a dict.

    The record is requested from the Gallica ``OAIRecord`` service, parsed with
    BeautifulSoup, written prettified to ``oai.xml`` in the working directory
    (overwriting any previous file) and converted with ``xmltodict``.
    The URL and the parsed document are printed along the way.

    Parameters
    ----------
    id : str
        Ark name appended to the ``ark=`` query parameter.
        (The name shadows the ``id`` builtin; it is kept so existing callers still work.)

    Returns
    -------
    dict-like
        Result of ``xmltodict.parse`` on the prettified response.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status.
    requests.Timeout
        If the service does not answer within the timeout.
    """
    OAI_BASEURL = 'https://gallica.bnf.fr/services/OAIRecord?ark='

    url = "".join([OAI_BASEURL, id])
    print(url)

    # requests applies no timeout by default; without one a stalled server hangs the kernel.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error page as if it were a record.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "lxml-xml")
    print(soup)

    pretty_xml = soup.prettify()
    # Keep the on-disk copy; the context manager closes the handle even if the write fails.
    with open('oai.xml', 'wb') as xml_file:
        xml_file.write(pretty_xml.encode('UTF-8'))

    # Parse the in-memory text directly rather than re-reading the file just written.
    return xmltodict.parse(pretty_xml)


In [27]:
doc = OAI(ark)

https://gallica.bnf.fr/services/OAIRecord?ark=bpt6k9657410k.thumbnail
<?xml version="1.0" encoding="utf-8"?>
<results ResultsGenerationSearchTime="0:00:00.046" countResults="1" resultType="CVOAIRecordSearchService" searchTime="">
<visibility_rights>all</visibility_rights>
<notice>
<record>
<header>
<identifier>oai:bnf.fr:gallica/ark:/12148/bpt6k9657410k</identifier>
<datestamp>2019-06-22</datestamp>
<setSpec>gallica:theme:3:34</setSpec>
<setSpec>gallica:typedoc:periodiques:fascicules</setSpec>
</header>
<metadata>
<oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:identifier>https://gallica.bnf.fr/ark:/12148/bpt6k9657410k</dc:identifier>
<dc:date>1887</dc:date>
<dc:description>  1887</dc:description>
<dc:description>1887 (TB,A10)-1897.</dc:description>
<dc

In [28]:
# setSpec entries from the record header (a theme set and a typedoc set in the sample output).
# NOTE(review): this rebinds the name `type`, hiding the `type()` builtin for every later
# cell — consider a name such as `set_specs` once downstream cells are updated.
type = doc.get('results').get('notice').get('record').get('header').get('setSpec')
type

['gallica:theme:3:34', 'gallica:typedoc:periodiques:fascicules']

In [29]:
# Metadata of the record as parsed by xmltodict; the fields sit under the 'oai_dc:dc' key
# (see output below). Any missing intermediate level makes the chained .get() fail on None.
metadata = doc.get('results').get('notice').get('record').get('metadata')
metadata

OrderedDict([('oai_dc:dc',
              OrderedDict([('@xmlns:dc', 'http://purl.org/dc/elements/1.1/'),
                           ('@xmlns:oai_dc',
                            'http://www.openarchives.org/OAI/2.0/oai_dc/'),
                           ('@xmlns:xsi',
                            'http://www.w3.org/2001/XMLSchema-instance'),
                           ('@xsi:schemaLocation',
                            'http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd'),
                           ('dc:identifier',
                            'https://gallica.bnf.fr/ark:/12148/bpt6k9657410k'),
                           ('dc:date', '1887'),
                           ('dc:description',
                            ['1887', '1887 (TB,A10)-1897.', None]),
                           ('dc:title',
                            'Jurisprudence générale du royaume en matière civile, commerciale et criminelle : ou Journal des audiences de la Cour de cassatio

## Extracting data

In [30]:
# Empty frame that collects the extracted fields, one column per cell below.
temp = pd.DataFrame()

In [31]:
# Query timestamp: text between the first '[' and the first ']' of the raw request part.
temp['date'] = lines_df[4].map(lambda raw: raw.split("]")[0].split("[")[1])

In [32]:
# Request: method and path, i.e. the first two tokens of the first quoted segment.
temp['request'] = lines_df[4].map(lambda raw: ' '.join(raw.split('"')[1].split(' ')[:2]))

In [33]:
# extracting protocol, still need to look into it further
# do i really need to extract protocol at this point?
# commenting this for now until further research

# temp['protocol'] = lines_df.apply(lambda x: x[4].split("\"")[1].split(' ')[2], axis=1)

In [34]:
# extracting code
# try catch to avoid index errors, need to look into this further, what happens exactly there?
def trycode(value, default):
    """Return the HTTP status code found in a parsed log row.

    Parameters
    ----------
    value : mapping or pandas.Series
        Row whose item ``4`` holds the raw request part of the log line.
    default : str
        Value returned when the status code cannot be located.

    Returns
    -------
    str
        The status-code token, or ``default`` when the line is too short.
    """
    try:
        # Piece 2 of the quote-split is the text right after the quoted request
        # (e.g. ' 200 3861 '); it starts with a space, so token 0 is empty and
        # token 1 is the status code.
        return value[4].split("\"")[2].split(' ')[1]
    except IndexError:
        # Fewer than two quotes in the line, or nothing after the quoted request.
        return default



# Status code for every row; '-' marks rows where it could not be extracted.
temp['code'] = lines_df.apply(trycode, axis=1, args=('-',))

In [35]:
# checking unique codes
temp.code.unique()

array(['503', '404', '200', '302', '501', '405', '304', '500', '400',
       '206', '-'], dtype=object)

In [36]:
# extracting length
# try catch to avoid index errors, need to look into this further, what happens exactly there?
def trylength(value, default):
    """Return the response-size field found in a parsed log row.

    Parameters
    ----------
    value : mapping or pandas.Series
        Row whose item ``4`` holds the raw request part of the log line.
    default : str
        Value returned when the size field cannot be located.

    Returns
    -------
    str
        The size token, or ``default`` when the line is too short.
    """
    try:
        # Piece 2 of the quote-split is the text right after the quoted request
        # (e.g. ' 200 3861 '); token 0 is empty, token 1 the status code and
        # token 2 the size.
        return value[4].split("\"")[2].split(' ')[2]
    except IndexError:
        # Fewer than two quotes in the line, or no size token after the status code.
        return default

# Response size for every row; '-' marks rows where it could not be extracted.
temp['length'] = lines_df.apply(trylength, axis=1, args=('-',))

In [37]:
# extracting referant
# Referrer: content of the second quoted segment of the raw request part.
temp['referant'] = lines_df[4].map(lambda raw: raw.split('"')[3])

In [38]:
# extracting ark name

#function to check if the request contains ark
def extract_ark(request):
    """Return the ark name contained in a request string, or '-' if there is none.

    Parameters
    ----------
    request : str
        Request in the form '<METHOD> <path>', e.g. 'GET /ark:/12148/bpt6k70211m'.

    Returns
    -------
    str
        The 4th '/'-separated piece of the path (the part after '/ark:/<number>/',
        including any suffix such as '.thumbnail') when the request mentions 'ark:/';
        otherwise '-'. Requests too short to contain that piece also give '-'
        instead of raising IndexError.
    """
    if 'ark:/' not in request:
        return '-'
    try:
        return request.split(' ')[1].split('/')[3]
    except IndexError:
        # No path token, or the path stops before the ark name.
        return '-'

# Ark name for every extracted request ('-' when the request has none).
temp['ark'] = temp['request'].map(extract_ark)
    

In [39]:
# Give the positional columns produced by the '##' split readable names;
# columns 0 (empty prefix) and 4 (raw request part) keep their numeric labels.
lines_df = lines_df.rename(columns={1:"IPAdress",2:"Country",3:"City"})

In [40]:
# Put the split log columns and the extracted fields side by side; rows are aligned on the index.
final_df = pd.concat([lines_df, temp],axis=1)

In [41]:
final_df

Unnamed: 0,0,IPAdress,Country,City,4,date,request,code,length,referant,ark
0,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:10:58:15 +0100] ""GET /ark:/12...",03/Mar/2017:10:58:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m
1,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:10:58:41 +0100] ""GET /ark:/12...",03/Mar/2017:10:58:41 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m
2,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:01:15 +0100] ""GET /ark:/12...",03/Mar/2017:11:01:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m
3,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:11:01:41 +0100] ""GET /ark:/12...",03/Mar/2017:11:01:41 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m
4,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:04:15 +0100] ""GET /ark:/12...",03/Mar/2017:11:04:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m
...,...,...,...,...,...,...,...,...,...,...,...
170099,,62d6ebcb4c44ac6bc6b36479640341a9,United States,Mountain View,"- - [04/Mar/2017:05:20:40 +0100] ""GET /assets/...",04/Mar/2017:05:20:40 +0100,GET /assets/static/javascripts/vendor/jquery.m...,200,1261,http://gallica.bnf.fr/ark:/12148/btv1b10542978v,-
170100,,e7fdec50f50253f6796d61b5382155f8,,,"- - [04/Mar/2017:05:20:41 +0100] ""GET /ark:/12...",04/Mar/2017:05:20:41 +0100,GET /ark:/12148/bpt6k70211m,200,32736,-,bpt6k70211m
170101,,62d6ebcb4c44ac6bc6b36479640341a9,United States,Mountain View,"- - [04/Mar/2017:05:20:42 +0100] ""GET /ark:/12...",04/Mar/2017:05:20:42 +0100,GET /ark:/12148/bpt6k209278x.thumbnail,200,2576,-,bpt6k209278x.thumbnail
170102,,62d6ebcb4c44ac6bc6b36479640341a9,United States,Mountain View,"- - [04/Mar/2017:05:20:47 +0100] ""GET /ark:/12...",04/Mar/2017:05:20:47 +0100,GET /ark:/12148/btv1b8439376v,302,20,-,btv1b8439376v


### Subsection 2: XML parsing testing