# Experimenting with Gallica logs

## 1. Importing logs

In [100]:
import os
import numpy as np

# for now I manually extracted a single file to test
file  = open('res296.log','r',encoding="utf8")
lines = file.read().splitlines()


## 2. Exploring logs

In [101]:
import pandas as pd
# convert lines from logs into pandas dataframe
lines_df = pd.DataFrame(lines)


In [102]:
lines_df.head()

Unnamed: 0,0
0,##320022e99796ca35dab7e63d48fd5e7##null##null#...
1,##e7fdec50f50253f6796d61b5382155f8##null##null...
2,##320022e99796ca35dab7e63d48fd5e7##null##null#...
3,##e7fdec50f50253f6796d61b5382155f8##null##null...
4,##320022e99796ca35dab7e63d48fd5e7##null##null#...


In [103]:
# we need to split each line into relevant metadata according to this example from Nouvellet et al.

![caption](log_example.png)

In [104]:
# first split, according to the example we split by ## to get ip, pays, ville and then date/requete/procole/code/taille/référant together
lines_df=lines_df[0].str.split('##', expand=True)

In [105]:
lines_df.head()

Unnamed: 0,0,1,2,3,4
0,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:10:58:15 +0100] ""GET /ark:/12..."
1,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:10:58:41 +0100] ""GET /ark:/12..."
2,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:01:15 +0100] ""GET /ark:/12..."
3,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:11:01:41 +0100] ""GET /ark:/12..."
4,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:04:15 +0100] ""GET /ark:/12..."


### 2.1 Parsing HTTP requests

In [106]:
# Trying python email library
from email.parser import BytesParser

# testing on one example
request_text = str(lines_df.loc[1200][4])


In [107]:
request_line, headers_alone = request_text.split('-', 1)

In [108]:
request_line,headers_alone

('',
 ' - [03/Mar/2017:18:04:59 +0100] "GET /ark:/12148/bpt6k9657410k.thumbnail HTTP/1.1" 200 3861 "http://gallica.bnf.fr/services/engine/search/sru?operation=searchRetrieve&version=1.2&query=%28gallica%20all%20%22Le%20changement%20de%20nom%20a%20%C3%A9t%C3%A9%20prescrit%20par%20d%C3%A9cret%20du%2023%20octobre%201894.%22%29&suggest=0" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" "JSESSIONID=AD5536402E77AAE1014B65581D047634; xtvrn=$18798$" 154554')

In [109]:
import re

In [110]:
p = re.compile('\[([^]]*)\]')
# regex to capture between brackets

In [111]:
print(p.findall(request_text))

['03/Mar/2017:18:04:59 +0100']


In [112]:
p2 = p = re.compile('\"([^]]*)\]')
# regex to capture between brackets

In [113]:
request_text.split(']')[1].split("\"")

[' ',
 'GET /ark:/12148/bpt6k9657410k.thumbnail HTTP/1.1',
 ' 200 3861 ',
 'http://gallica.bnf.fr/services/engine/search/sru?operation=searchRetrieve&version=1.2&query=%28gallica%20all%20%22Le%20changement%20de%20nom%20a%20%C3%A9t%C3%A9%20prescrit%20par%20d%C3%A9cret%20du%2023%20octobre%201894.%22%29&suggest=0',
 ' ',
 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
 ' ',
 'JSESSIONID=AD5536402E77AAE1014B65581D047634; xtvrn=$18798$',
 ' 154554']

#### Experimenting with extracting data

In [114]:
# This is just experimental, to change with lazy eval regex when working on bigger data.

In [115]:
request_text = str(lines_df.loc[1200][4])

In [116]:
request_text.split("\"")

['- - [03/Mar/2017:18:04:59 +0100] ',
 'GET /ark:/12148/bpt6k9657410k.thumbnail HTTP/1.1',
 ' 200 3861 ',
 'http://gallica.bnf.fr/services/engine/search/sru?operation=searchRetrieve&version=1.2&query=%28gallica%20all%20%22Le%20changement%20de%20nom%20a%20%C3%A9t%C3%A9%20prescrit%20par%20d%C3%A9cret%20du%2023%20octobre%201894.%22%29&suggest=0',
 ' ',
 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
 ' ',
 'JSESSIONID=AD5536402E77AAE1014B65581D047634; xtvrn=$18798$',
 ' 154554']

In [117]:
# get the date 
date = request_text.split("]")[0].split("[")[1]
date

'03/Mar/2017:18:04:59 +0100'

In [118]:
request = ' '.join(request_text.split("\"")[1].split(' ')[:2])
request

'GET /ark:/12148/bpt6k9657410k.thumbnail'

In [119]:
protocole = request_text.split("\"")[1].split(' ')[2]
protocole

'HTTP/1.1'

In [120]:
code =  request_text.split("\"")[2].split(' ')[1]
code

'200'

In [121]:
taille = request_text.split("\"")[2].split(' ')[2]
taille

'3861'

In [122]:
referant = request_text.split("\"")[3]
referant

'http://gallica.bnf.fr/services/engine/search/sru?operation=searchRetrieve&version=1.2&query=%28gallica%20all%20%22Le%20changement%20de%20nom%20a%20%C3%A9t%C3%A9%20prescrit%20par%20d%C3%A9cret%20du%2023%20octobre%201894.%22%29&suggest=0'

In [123]:
ark ='-'
if('ark:/' in request):
    ark = request.split(' ')[1].split('/')[3]
ark

'bpt6k9657410k.thumbnail'

In [124]:
import shutil
import requests
import xmltodict
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET

In [125]:

def OAI(id):

    OAI_BASEURL = 'https://gallica.bnf.fr/services/OAIRecord?ark='

    url = "".join([OAI_BASEURL, id])
    print(url)

    s = requests.get(url, stream=True)
    soup = BeautifulSoup(s.content,"lxml-xml")
    print(soup)
    file = open('oai.xml', 'wb')
    file.write(soup.prettify().encode('UTF-8'))
    file.close()
    with open('oai.xml',encoding='UTF-8') as xml:
        doc = xmltodict.parse(xml.read())
        return doc


In [126]:
doc = OAI(ark)

https://gallica.bnf.fr/services/OAIRecord?ark=bpt6k9657410k.thumbnail
<?xml version="1.0" encoding="utf-8"?>
<results ResultsGenerationSearchTime="0:00:00.029" countResults="1" resultType="CVOAIRecordSearchService" searchTime="">
<visibility_rights>all</visibility_rights>
<notice>
<record>
<header>
<identifier>oai:bnf.fr:gallica/ark:/12148/bpt6k9657410k</identifier>
<datestamp>2019-06-22</datestamp>
<setSpec>gallica:theme:3:34</setSpec>
<setSpec>gallica:typedoc:periodiques:fascicules</setSpec>
</header>
<metadata>
<oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:identifier>https://gallica.bnf.fr/ark:/12148/bpt6k9657410k</dc:identifier>
<dc:date>1887</dc:date>
<dc:description>  1887</dc:description>
<dc:description>1887 (TB,A10)-1897.</dc:description>
<dc

In [127]:
type = doc.get('results').get('notice').get('record').get('header').get('setSpec')
type

['gallica:theme:3:34', 'gallica:typedoc:periodiques:fascicules']

In [128]:
metadata = doc.get('results').get('notice').get('record').get('metadata')
metadata

OrderedDict([('oai_dc:dc',
              OrderedDict([('@xmlns:dc', 'http://purl.org/dc/elements/1.1/'),
                           ('@xmlns:oai_dc',
                            'http://www.openarchives.org/OAI/2.0/oai_dc/'),
                           ('@xmlns:xsi',
                            'http://www.w3.org/2001/XMLSchema-instance'),
                           ('@xsi:schemaLocation',
                            'http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd'),
                           ('dc:identifier',
                            'https://gallica.bnf.fr/ark:/12148/bpt6k9657410k'),
                           ('dc:date', '1887'),
                           ('dc:description',
                            ['1887', '1887 (TB,A10)-1897.', None]),
                           ('dc:title',
                            'Jurisprudence générale du royaume en matière civile, commerciale et criminelle : ou Journal des audiences de la Cour de cassatio

## Extracting data

In [129]:
temp = pd.DataFrame()

In [130]:
# extracting the date of query
temp['date']=lines_df.apply(lambda x: x[4].split("]")[0].split("[")[1] ,axis = 1)

In [131]:
# extracting request
temp['request'] = lines_df.apply(lambda x: ' '.join(x[4].split("\"")[1].split(' ')[:2]),axis=1)

In [132]:
# extracting protocol, still need to look into it further
# do i really need to extract protocol at this point?
# commenting this for now until further research

# temp['protocol'] = lines_df.apply(lambda x: x[4].split("\"")[1].split(' ')[2], axis=1)

In [133]:
# extracting code
# try catch to avoid index errors, need to look into this further, what happens exactly there?
def trycode(value, default):
    try:
        return value[4].split("\"")[2].split(' ')[1]
    except (IndexError):
        return '-'
    return default



temp['code'] = lines_df.apply(lambda x: trycode(x,'-'), axis=1)

In [134]:
# checking unique codes
temp.code.unique()

array(['503', '404', '200', '302', '501', '405', '304', '500', '400',
       '206', '-'], dtype=object)

In [135]:
# extracting length
# try catch to avoid index errors, need to look into this further, what happens exactly there?
def trylength(value, default):
    try:
        return value[4].split("\"")[2].split(' ')[2] 
    except (IndexError):
        return '-'
    return default

temp['length'] = lines_df.apply(lambda x:  trylength(x,'-') , axis=1)

In [136]:
# extracting referant
temp['referant'] = lines_df.apply(lambda x: x[4].split("\"")[3], axis=1)

In [137]:
# extracting ark name

#function to check if the request contains ark
def extract_ark(request):
    ark ='-'
    if('ark:/' in request):
        ark = request.split(' ')[1].split('/')[3]
    return ark

temp['ark'] = temp.apply(lambda x: extract_ark(x['request']), axis=1)
    

In [138]:
lines_df = lines_df.rename(columns={1:"IPAdress",2:"Country",3:"City"})

In [139]:
final_df = pd.concat([lines_df, temp],axis=1)

In [140]:
final_df

Unnamed: 0,0,IPAdress,Country,City,4,date,request,code,length,referant,ark
0,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:10:58:15 +0100] ""GET /ark:/12...",03/Mar/2017:10:58:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m
1,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:10:58:41 +0100] ""GET /ark:/12...",03/Mar/2017:10:58:41 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m
2,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:01:15 +0100] ""GET /ark:/12...",03/Mar/2017:11:01:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m
3,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:11:01:41 +0100] ""GET /ark:/12...",03/Mar/2017:11:01:41 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m
4,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:04:15 +0100] ""GET /ark:/12...",03/Mar/2017:11:04:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m
...,...,...,...,...,...,...,...,...,...,...,...
170099,,62d6ebcb4c44ac6bc6b36479640341a9,United States,Mountain View,"- - [04/Mar/2017:05:20:40 +0100] ""GET /assets/...",04/Mar/2017:05:20:40 +0100,GET /assets/static/javascripts/vendor/jquery.m...,200,1261,http://gallica.bnf.fr/ark:/12148/btv1b10542978v,-
170100,,e7fdec50f50253f6796d61b5382155f8,,,"- - [04/Mar/2017:05:20:41 +0100] ""GET /ark:/12...",04/Mar/2017:05:20:41 +0100,GET /ark:/12148/bpt6k70211m,200,32736,-,bpt6k70211m
170101,,62d6ebcb4c44ac6bc6b36479640341a9,United States,Mountain View,"- - [04/Mar/2017:05:20:42 +0100] ""GET /ark:/12...",04/Mar/2017:05:20:42 +0100,GET /ark:/12148/bpt6k209278x.thumbnail,200,2576,-,bpt6k209278x.thumbnail
170102,,62d6ebcb4c44ac6bc6b36479640341a9,United States,Mountain View,"- - [04/Mar/2017:05:20:47 +0100] ""GET /ark:/12...",04/Mar/2017:05:20:47 +0100,GET /ark:/12148/btv1b8439376v,302,20,-,btv1b8439376v


In [141]:
# To create sessions we would need clear timestamps, we parse the date column to make it easier to leverage.

In [148]:
# first split, according to the example we split by / to get day, month and the last column would still need to be parsed again
temp_df = final_df["date"].str.split('/', expand=True).rename(columns={0:"Day",1:"Month"})

In [149]:
# parse second column to get Year/Hour/Minute
temp_df_2 = temp_df[2].str.split(':',expand=True).drop(columns=[3]).rename(columns={0: "Year", 1: "Hour",2: "Minute"})

In [155]:
# regroup both columns to get one final dataframe containing the date
temp_df=temp_df.drop(columns=[2])
date_df = pd.concat([temp_df,temp_df_2],axis=1)
date_df.head()

Unnamed: 0,Day,Month,Year,Hour,Minute
0,3,Mar,2017,10,58
1,3,Mar,2017,10,58
2,3,Mar,2017,11,1
3,3,Mar,2017,11,1
4,3,Mar,2017,11,4


In [156]:
#regroup  date dataframe with our main dataframe
final_df = pd.concat([final_df,date_df],axis=1).drop(columns=["date"])
final_df.head()

Unnamed: 0,0,IPAdress,Country,City,4,request,code,length,referant,ark,Day,Month,Year,Hour,Minute
0,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:10:58:15 +0100] ""GET /ark:/12...",GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m,3,Mar,2017,10,58
1,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:10:58:41 +0100] ""GET /ark:/12...",GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m,3,Mar,2017,10,58
2,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:01:15 +0100] ""GET /ark:/12...",GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m,3,Mar,2017,11,1
3,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:11:01:41 +0100] ""GET /ark:/12...",GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m,3,Mar,2017,11,1
4,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:04:15 +0100] ""GET /ark:/12...",GET /ark:/12148/bpt6k70211m,503,-,-,bpt6k70211m,3,Mar,2017,11,4


In [176]:
# TODO 
# PROBLEM WITH CAPTURING ARK
#
final_df[final_df.IPAdress=="105781f3101367c473a91d52b6d4fd67"]

Unnamed: 0,0,IPAdress,Country,City,4,request,code,length,referant,ark,Day,Month,Year,Hour,Minute
12352,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:27:36 +0100] ""GET /iiif/ar...","GET /iiif/ark:/12148/bpt6k54673247/f1/0,0,3819...",200,19377,http://gallica.bnf.fr/ark:/12148/bpt6k54673247...,12148,3,Mar,2017,18,27
12637,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:27:51 +0100] ""GET /iiif/ar...","GET /iiif/ark:/12148/bpt6k54673247/f1/3584,358...",200,281,http://gallica.bnf.fr/ark:/12148/bpt6k54673247...,12148,3,Mar,2017,18,27
12871,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:28:12 +0100] ""GET /service...",GET /services/ajax/extract/ark:/12148/bpt6k623...,200,645,http://gallica.bnf.fr/services/engine/search/s...,extract,3,Mar,2017,18,28
14586,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:32:02 +0100] ""GET /iiif/ar...","GET /iiif/ark:/12148/bpt6k224257z/f1/5120,3072...",200,412,http://gallica.bnf.fr/ark:/12148/bpt6k224257z/...,12148,3,Mar,2017,18,32
14635,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:32:12 +0100] ""GET /iiif/ar...","GET /iiif/ark:/12148/bpt6k224257z/f1/5120,0,22...",200,940,http://gallica.bnf.fr/ark:/12148/bpt6k224257z/...,12148,3,Mar,2017,18,32
14693,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:32:20 +0100] ""GET /iiif/ar...","GET /iiif/ark:/12148/bpt6k224257z/f1/4864,1024...",200,20090,http://gallica.bnf.fr/ark:/12148/bpt6k224257z/...,12148,3,Mar,2017,18,32
14733,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:32:29 +0100] ""GET /iiif/ar...","GET /iiif/ark:/12148/bpt6k224257z/f1/4608,3584...",200,28825,http://gallica.bnf.fr/ark:/12148/bpt6k224257z/...,12148,3,Mar,2017,18,32
14881,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:32:39 +0100] ""GET /iiif/ar...","GET /iiif/ark:/12148/bpt6k224257z/f1/2816,4096...",200,22317,http://gallica.bnf.fr/ark:/12148/bpt6k224257z/...,12148,3,Mar,2017,18,32
14886,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:32:47 +0100] ""GET /iiif/ar...","GET /iiif/ark:/12148/bpt6k224257z/f1/2048,4608...",200,24264,http://gallica.bnf.fr/ark:/12148/bpt6k224257z/...,12148,3,Mar,2017,18,32
14893,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:32:56 +0100] ""GET /iiif/ar...","GET /iiif/ark:/12148/bpt6k224257z/f1/2048,2048...",200,20773,http://gallica.bnf.fr/ark:/12148/bpt6k224257z/...,12148,3,Mar,2017,18,32


### Creating sessions

#### 1. Identifying sessions

In [167]:
# Session: séquences de requêtes
# Regrouper même adresse IP => session se termine intervalle supérieur à 60 minutes entre deux requêtes. 
sessions_df = final_df.groupby('IPAdress').agg({'ark':list,'Day':list,'Month':list,'Year':list,'Hour':list,'Minute':list})
sessions_df.head()

Unnamed: 0_level_0,ark,Day,Month,Year,Hour,Minute
IPAdress,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
103e44bc19d6aac58db9a149c73e505b,[-],[03],[Mar],[2017],[18],[12]
105781f3101367c473a91d52b6d4fd67,"[12148, 12148, extract, 12148, 12148, 12148, 1...","[03, 03, 03, 03, 03, 03, 03, 03, 03, 03, 03, 0...","[Mar, Mar, Mar, Mar, Mar, Mar, Mar, Mar, Mar, ...","[2017, 2017, 2017, 2017, 2017, 2017, 2017, 201...","[18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[27, 27, 28, 32, 32, 32, 32, 32, 32, 32, 33, 3..."
10907c8edc0b2702015e04f49a8204a2,"[-, -, -, -, bpt6k6308044k.lowres, bpt6k759364...","[03, 03, 03, 03, 03, 03, 03, 03, 03, 03, 03, 0...","[Mar, Mar, Mar, Mar, Mar, Mar, Mar, Mar, Mar, ...","[2017, 2017, 2017, 2017, 2017, 2017, 2017, 201...","[18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[05, 05, 05, 05, 05, 05, 05, 05, 05, 05, 05, 0..."
10915f6650d7b3ab000aafb953615c4e,"[bpt6k33258628.thumbnail, bpt6k3321225p.thumbn...","[03, 03, 03]","[Mar, Mar, Mar]","[2017, 2017, 2017]","[19, 19, 19]","[40, 41, 41]"
10dfc529d2b8f1a7ae6f94229848fbf,"[bpt6k4453214, -, -, -, -, -, -, -, -, -, -]","[03, 03, 03, 03, 03, 03, 03, 03, 03, 03, 03]","[Mar, Mar, Mar, Mar, Mar, Mar, Mar, Mar, Mar, ...","[2017, 2017, 2017, 2017, 2017, 2017, 2017, 201...","[18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18]","[38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39]"


In [168]:
sessions_df.iloc[2]

ark       [-, -, -, -, bpt6k6308044k.lowres, bpt6k759364...
Day       [03, 03, 03, 03, 03, 03, 03, 03, 03, 03, 03, 0...
Month     [Mar, Mar, Mar, Mar, Mar, Mar, Mar, Mar, Mar, ...
Year      [2017, 2017, 2017, 2017, 2017, 2017, 2017, 201...
Hour      [18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...
Minute    [05, 05, 05, 05, 05, 05, 05, 05, 05, 05, 05, 0...
Name: 10907c8edc0b2702015e04f49a8204a2, dtype: object

In [179]:
from datetime import datetime

def days_between(d1, d2):
    d1 = datetime.strptime(d1, "%d/%m/%Y:%H:%M:%S")
    d2 = datetime.strptime(d2, "%d/%m/%Y:%H:%M:%S")
    return abs((d2 - d1))

In [181]:
time_1= "03/04/2017:10:58:15 +0100"
time_2 = "04/04/2017:12:58:15 +0100"

days_between(time_1,time_2)

ValueError: unconverted data remains:  +0100