# Experimenting with Gallica logs

## 1. Importing logs

In [76]:
import os
import numpy as np

# for now I manually extracted a single file to test
file  = open('res296.log','r',encoding="utf8")
lines = file.read().splitlines()


## 2. Exploring logs

In [77]:
import pandas as pd
# convert lines from logs into pandas dataframe
lines_df = pd.DataFrame(lines)


In [78]:
lines_df.head()

Unnamed: 0,0
0,##320022e99796ca35dab7e63d48fd5e7##null##null#...
1,##e7fdec50f50253f6796d61b5382155f8##null##null...
2,##320022e99796ca35dab7e63d48fd5e7##null##null#...
3,##e7fdec50f50253f6796d61b5382155f8##null##null...
4,##320022e99796ca35dab7e63d48fd5e7##null##null#...


In [79]:
# we need to split each line into relevant metadata according to this example from Nouvellet et al.

![caption](log_example.png)

In [80]:
# first split, according to the example we split by ## to get ip, pays, ville and then date/requete/procole/code/taille/référant together
lines_df=lines_df[0].str.split('##', expand=True)

In [81]:
lines_df.head()

Unnamed: 0,0,1,2,3,4
0,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:10:58:15 +0100] ""GET /ark:/12..."
1,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:10:58:41 +0100] ""GET /ark:/12..."
2,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:01:15 +0100] ""GET /ark:/12..."
3,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:11:01:41 +0100] ""GET /ark:/12..."
4,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:04:15 +0100] ""GET /ark:/12..."


### 2.1 Parsing HTTP requests

In [82]:
# Trying python email library
from email.parser import BytesParser

# testing on one example
request_text = str(lines_df.loc[1200][4])


In [83]:
request_line, headers_alone = request_text.split('-', 1)

In [84]:
request_line,headers_alone

('',
 ' - [03/Mar/2017:18:04:59 +0100] "GET /ark:/12148/bpt6k9657410k.thumbnail HTTP/1.1" 200 3861 "http://gallica.bnf.fr/services/engine/search/sru?operation=searchRetrieve&version=1.2&query=%28gallica%20all%20%22Le%20changement%20de%20nom%20a%20%C3%A9t%C3%A9%20prescrit%20par%20d%C3%A9cret%20du%2023%20octobre%201894.%22%29&suggest=0" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" "JSESSIONID=AD5536402E77AAE1014B65581D047634; xtvrn=$18798$" 154554')

In [85]:
import re

In [86]:
p = re.compile('\[([^]]*)\]')
# regex to capture between brackets

In [87]:
print(p.findall(request_text))

['03/Mar/2017:18:04:59 +0100']


In [88]:
p2 = p = re.compile('\"([^]]*)\]')
# regex to capture between brackets

In [89]:
request_text.split(']')[1].split("\"")

[' ',
 'GET /ark:/12148/bpt6k9657410k.thumbnail HTTP/1.1',
 ' 200 3861 ',
 'http://gallica.bnf.fr/services/engine/search/sru?operation=searchRetrieve&version=1.2&query=%28gallica%20all%20%22Le%20changement%20de%20nom%20a%20%C3%A9t%C3%A9%20prescrit%20par%20d%C3%A9cret%20du%2023%20octobre%201894.%22%29&suggest=0',
 ' ',
 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
 ' ',
 'JSESSIONID=AD5536402E77AAE1014B65581D047634; xtvrn=$18798$',
 ' 154554']

#### Experimenting with extracting data

In [90]:
# This is just experimental, to change with lazy eval regex when working on bigger data.

In [91]:
request_text = str(lines_df.loc[1200][4])

In [92]:
request_text.split("\"")

['- - [03/Mar/2017:18:04:59 +0100] ',
 'GET /ark:/12148/bpt6k9657410k.thumbnail HTTP/1.1',
 ' 200 3861 ',
 'http://gallica.bnf.fr/services/engine/search/sru?operation=searchRetrieve&version=1.2&query=%28gallica%20all%20%22Le%20changement%20de%20nom%20a%20%C3%A9t%C3%A9%20prescrit%20par%20d%C3%A9cret%20du%2023%20octobre%201894.%22%29&suggest=0',
 ' ',
 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
 ' ',
 'JSESSIONID=AD5536402E77AAE1014B65581D047634; xtvrn=$18798$',
 ' 154554']

In [93]:
# get the date 
date = request_text.split("]")[0].split("[")[1]
date

'03/Mar/2017:18:04:59 +0100'

In [94]:
request = ' '.join(request_text.split("\"")[1].split(' ')[:2])
request

'GET /ark:/12148/bpt6k9657410k.thumbnail'

In [95]:
protocole = request_text.split("\"")[1].split(' ')[2]
protocole

'HTTP/1.1'

In [96]:
code =  request_text.split("\"")[2].split(' ')[1]
code

'200'

In [97]:
taille = request_text.split("\"")[2].split(' ')[2]
taille

'3861'

In [98]:
referant = request_text.split("\"")[3]
referant

'http://gallica.bnf.fr/services/engine/search/sru?operation=searchRetrieve&version=1.2&query=%28gallica%20all%20%22Le%20changement%20de%20nom%20a%20%C3%A9t%C3%A9%20prescrit%20par%20d%C3%A9cret%20du%2023%20octobre%201894.%22%29&suggest=0'

In [99]:
request

'GET /ark:/12148/bpt6k9657410k.thumbnail'

In [100]:
# capture everything between 12148 and / or between 12148 and . using regex
ark = '-'
ark = re.findall('(?<=12148/).+?(?=/)|(?<=12148/).+?(?=\.)', request)
ark




['bpt6k9657410k']

In [101]:
ark ='-'
if('ark:/' in request):
    ark = request.split(' ')[1].split('/')[3]
ark

'bpt6k9657410k.thumbnail'

In [102]:
import shutil
import requests
import xmltodict
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET

In [157]:

def OAI(id):

    OAI_BASEURL = 'https://gallica.bnf.fr/services/OAIRecord?ark='

    url = "".join([OAI_BASEURL, id])
    #print(url)

    s = requests.get(url, stream=True)
    soup = BeautifulSoup(s.content,"lxml-xml")
    #print(soup)
    file = open('oai.xml', 'wb')
    file.write(soup.prettify().encode('UTF-8'))
    file.close()
    with open('oai.xml',encoding='UTF-8') as xml:
        doc = xmltodict.parse(xml.read())
        return doc


In [104]:
doc = OAI(ark)

https://gallica.bnf.fr/services/OAIRecord?ark=bpt6k9657410k.thumbnail
<?xml version="1.0" encoding="utf-8"?>
<results ResultsGenerationSearchTime="0:00:00.003" countResults="1" resultType="CVOAIRecordSearchService" searchTime="">
<visibility_rights>all</visibility_rights>
<notice>
<record>
<header>
<identifier>oai:bnf.fr:gallica/ark:/12148/bpt6k9657410k</identifier>
<datestamp>2019-06-22</datestamp>
<setSpec>gallica:theme:3:34</setSpec>
<setSpec>gallica:typedoc:periodiques:fascicules</setSpec>
</header>
<metadata>
<oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:identifier>https://gallica.bnf.fr/ark:/12148/bpt6k9657410k</dc:identifier>
<dc:date>1887</dc:date>
<dc:description>  1887</dc:description>
<dc:description>1887 (TB,A10)-1897.</dc:description>
<dc

In [105]:
type = doc.get('results').get('notice').get('record').get('header').get('setSpec')
type

['gallica:theme:3:34', 'gallica:typedoc:periodiques:fascicules']

In [106]:
metadata = doc.get('results').get('notice').get('record').get('metadata')
metadata

OrderedDict([('oai_dc:dc',
              OrderedDict([('@xmlns:dc', 'http://purl.org/dc/elements/1.1/'),
                           ('@xmlns:oai_dc',
                            'http://www.openarchives.org/OAI/2.0/oai_dc/'),
                           ('@xmlns:xsi',
                            'http://www.w3.org/2001/XMLSchema-instance'),
                           ('@xsi:schemaLocation',
                            'http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd'),
                           ('dc:identifier',
                            'https://gallica.bnf.fr/ark:/12148/bpt6k9657410k'),
                           ('dc:date', '1887'),
                           ('dc:description',
                            ['1887', '1887 (TB,A10)-1897.', None]),
                           ('dc:title',
                            'Jurisprudence générale du royaume en matière civile, commerciale et criminelle : ou Journal des audiences de la Cour de cassatio

In [143]:
doc.get('results').get('notice').get('record').get('metadata').get('oai_dc:dc').get('dc:title')

'Jurisprudence générale du royaume en matière civile, commerciale et criminelle : ou Journal des audiences de la Cour de cassation et des Cours royales / par M. Dalloz,... et par M. Tournemine,...'

## Extracting data

In [107]:
temp = pd.DataFrame()

In [108]:
# extracting the date of query
temp['date']=lines_df.apply(lambda x: x[4].split("]")[0].split("[")[1] ,axis = 1)

In [109]:
# extracting request
temp['request'] = lines_df.apply(lambda x: ' '.join(x[4].split("\"")[1].split(' ')[:2]),axis=1)

In [110]:
# extracting protocol, still need to look into it further
# do i really need to extract protocol at this point?
# commenting this for now until further research

# temp['protocol'] = lines_df.apply(lambda x: x[4].split("\"")[1].split(' ')[2], axis=1)

In [111]:
# extracting code
# try catch to avoid index errors, need to look into this further, what happens exactly there?
def trycode(value, default):
    try:
        return value[4].split("\"")[2].split(' ')[1]
    except (IndexError):
        return '-'
    return default



temp['code'] = lines_df.apply(lambda x: trycode(x,'-'), axis=1)

In [112]:
# checking unique codes
temp.code.unique()

array(['503', '404', '200', '302', '501', '405', '304', '500', '400',
       '206', '-'], dtype=object)

In [113]:
# extracting length
# try catch to avoid index errors, need to look into this further, what happens exactly there?
def trylength(value, default):
    try:
        return value[4].split("\"")[2].split(' ')[2] 
    except (IndexError):
        return '-'
    return default

temp['length'] = lines_df.apply(lambda x:  trylength(x,'-') , axis=1)

In [114]:
# extracting referant
temp['referant'] = lines_df.apply(lambda x: x[4].split("\"")[3], axis=1)

In [115]:
# extracting ark name

#function to check if the request contains ark
def extract_ark(request):
    # capture everything between 12148 and / or between 12148 and . using regex
    ark = '-'
    ark = re.findall('(?<=12148/).+?(?=/)|(?<=12148/).+?(?=\.)', request)
    return ark


temp['ark'] = temp.apply(lambda x: extract_ark(x['request']), axis=1)
    

In [116]:
lines_df = lines_df.rename(columns={1:"IPAdress",2:"Country",3:"City"})

In [117]:
final_df = pd.concat([lines_df, temp],axis=1)

In [118]:
final_df.head()

Unnamed: 0,0,IPAdress,Country,City,4,date,request,code,length,referant,ark
0,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:10:58:15 +0100] ""GET /ark:/12...",03/Mar/2017:10:58:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[]
1,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:10:58:41 +0100] ""GET /ark:/12...",03/Mar/2017:10:58:41 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[]
2,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:01:15 +0100] ""GET /ark:/12...",03/Mar/2017:11:01:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[]
3,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:11:01:41 +0100] ""GET /ark:/12...",03/Mar/2017:11:01:41 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[]
4,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:04:15 +0100] ""GET /ark:/12...",03/Mar/2017:11:04:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[]


In [119]:
# TODO 
# PROBLEM WITH CAPTURING ARK
#
final_df[final_df.IPAdress=="105781f3101367c473a91d52b6d4fd67"].head()

Unnamed: 0,0,IPAdress,Country,City,4,date,request,code,length,referant,ark
12352,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:27:36 +0100] ""GET /iiif/ar...",03/Mar/2017:18:27:36 +0100,"GET /iiif/ark:/12148/bpt6k54673247/f1/0,0,3819...",200,19377,http://gallica.bnf.fr/ark:/12148/bpt6k54673247...,[bpt6k54673247]
12637,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:27:51 +0100] ""GET /iiif/ar...",03/Mar/2017:18:27:51 +0100,"GET /iiif/ark:/12148/bpt6k54673247/f1/3584,358...",200,281,http://gallica.bnf.fr/ark:/12148/bpt6k54673247...,[bpt6k54673247]
12871,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:28:12 +0100] ""GET /service...",03/Mar/2017:18:28:12 +0100,GET /services/ajax/extract/ark:/12148/bpt6k623...,200,645,http://gallica.bnf.fr/services/engine/search/s...,[bpt6k6239727c]
14586,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:32:02 +0100] ""GET /iiif/ar...",03/Mar/2017:18:32:02 +0100,"GET /iiif/ark:/12148/bpt6k224257z/f1/5120,3072...",200,412,http://gallica.bnf.fr/ark:/12148/bpt6k224257z/...,[bpt6k224257z]
14635,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:32:12 +0100] ""GET /iiif/ar...",03/Mar/2017:18:32:12 +0100,"GET /iiif/ark:/12148/bpt6k224257z/f1/5120,0,22...",200,940,http://gallica.bnf.fr/ark:/12148/bpt6k224257z/...,[bpt6k224257z]


### Creating sessions

#### 1. Identifying sessions - experimenting with no time series

In [120]:
# Session: séquences de requêtes
# Regrouper même adresse IP => session se termine intervalle supérieur à 60 minutes entre deux requêtes. 
sessions_df = final_df.groupby('IPAdress').agg({'ark':list,'date':list})
sessions_df.head()

Unnamed: 0_level_0,ark,date
IPAdress,Unnamed: 1_level_1,Unnamed: 2_level_1
103e44bc19d6aac58db9a149c73e505b,[[]],[03/Mar/2017:18:12:04 +0100]
105781f3101367c473a91d52b6d4fd67,"[[bpt6k54673247], [bpt6k54673247], [bpt6k62397...","[03/Mar/2017:18:27:36 +0100, 03/Mar/2017:18:27..."
10907c8edc0b2702015e04f49a8204a2,"[[], [], [], [], [bpt6k6308044k], [bpt6k759364...","[03/Mar/2017:18:05:21 +0100, 03/Mar/2017:18:05..."
10915f6650d7b3ab000aafb953615c4e,"[[bpt6k33258628], [bpt6k3321225p], [bpt6k62553...","[03/Mar/2017:19:40:11 +0100, 03/Mar/2017:19:41..."
10dfc529d2b8f1a7ae6f94229848fbf,"[[bpt6k4453214], [], [], [], [], [], [], [], [...","[03/Mar/2017:18:38:47 +0100, 03/Mar/2017:18:39..."


In [121]:
sessions_df.iloc[2]

ark     [[], [], [], [], [bpt6k6308044k], [bpt6k759364...
date    [03/Mar/2017:18:05:21 +0100, 03/Mar/2017:18:05...
Name: 10907c8edc0b2702015e04f49a8204a2, dtype: object

In [122]:
from datetime import datetime

def minutes_between(d1, d2):
    d1 = datetime.strptime(d1, "%d/%b/%Y:%H:%M:%S")
    d2 = datetime.strptime(d2, "%d/%b/%Y:%H:%M:%S")
    return abs(((d2 - d1)).total_seconds() // 60.0)

In [123]:
time_1= "03/Mar/2017:10:58:15 +0100"
time_2 = "03/Mar/2017:11:40:30 +0100"

minutes_between(time_1[:-6],time_2[:-6])

42.0

In [124]:
time_beginning = "01/Jan/0001:01:01:01 +0100"
time_end = "01/Jan/3000:01:01:01 +0100"
sessions_df['date_1'] = sessions_df.apply(lambda x: [time_beginning]+x['date'], axis = 1)
sessions_df['date_2'] = sessions_df.apply(lambda x: x['date']+[time_end],axis=1)

In [125]:
sessions_df

Unnamed: 0_level_0,ark,date,date_1,date_2
IPAdress,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
103e44bc19d6aac58db9a149c73e505b,[[]],[03/Mar/2017:18:12:04 +0100],"[01/Jan/0001:01:01:01 +0100, 03/Mar/2017:18:12...","[03/Mar/2017:18:12:04 +0100, 01/Jan/3000:01:01..."
105781f3101367c473a91d52b6d4fd67,"[[bpt6k54673247], [bpt6k54673247], [bpt6k62397...","[03/Mar/2017:18:27:36 +0100, 03/Mar/2017:18:27...","[01/Jan/0001:01:01:01 +0100, 03/Mar/2017:18:27...","[03/Mar/2017:18:27:36 +0100, 03/Mar/2017:18:27..."
10907c8edc0b2702015e04f49a8204a2,"[[], [], [], [], [bpt6k6308044k], [bpt6k759364...","[03/Mar/2017:18:05:21 +0100, 03/Mar/2017:18:05...","[01/Jan/0001:01:01:01 +0100, 03/Mar/2017:18:05...","[03/Mar/2017:18:05:21 +0100, 03/Mar/2017:18:05..."
10915f6650d7b3ab000aafb953615c4e,"[[bpt6k33258628], [bpt6k3321225p], [bpt6k62553...","[03/Mar/2017:19:40:11 +0100, 03/Mar/2017:19:41...","[01/Jan/0001:01:01:01 +0100, 03/Mar/2017:19:40...","[03/Mar/2017:19:40:11 +0100, 03/Mar/2017:19:41..."
10dfc529d2b8f1a7ae6f94229848fbf,"[[bpt6k4453214], [], [], [], [], [], [], [], [...","[03/Mar/2017:18:38:47 +0100, 03/Mar/2017:18:39...","[01/Jan/0001:01:01:01 +0100, 03/Mar/2017:18:38...","[03/Mar/2017:18:38:47 +0100, 03/Mar/2017:18:39..."
...,...,...,...,...
ff506e36e1385e8607d34532984b7d02,[[]],[03/Mar/2017:22:42:55 +0100],"[01/Jan/0001:01:01:01 +0100, 03/Mar/2017:22:42...","[03/Mar/2017:22:42:55 +0100, 01/Jan/3000:01:01..."
ff5c8a55572a3208f2bb0c646d44b3ba,"[[bpt6k657909j], [], [], [], [], [], [], [], [...","[03/Mar/2017:19:41:14 +0100, 03/Mar/2017:19:41...","[01/Jan/0001:01:01:01 +0100, 03/Mar/2017:19:41...","[03/Mar/2017:19:41:14 +0100, 03/Mar/2017:19:41..."
ff65eaf4c032d57ba8a48d4d2b1e79f7,"[[], [], [], [], [], [], [], [], [], [], [], [...","[03/Mar/2017:21:59:16 +0100, 03/Mar/2017:21:59...","[01/Jan/0001:01:01:01 +0100, 03/Mar/2017:21:59...","[03/Mar/2017:21:59:16 +0100, 03/Mar/2017:21:59..."
ff878e6c4084e5734ab56d50c5f01cfd,[[bpt6k1643872]],[03/Mar/2017:23:11:05 +0100],"[01/Jan/0001:01:01:01 +0100, 03/Mar/2017:23:11...","[03/Mar/2017:23:11:05 +0100, 01/Jan/3000:01:01..."


In [126]:
def calculate_difference_zipped_list(lst):
    new_lst = []
    for e in lst:
        if (e[0]==time_beginning):
            new_lst.append(999)
        elif (e[1]==time_end):
            new_lst.append(999)
        else:
            new_lst.append(minutes_between(e[0][:-6], e[1][:-6]))
    return new_lst
        
    

In [127]:
#this contains the ip adress and the zipped version of date_1,date_2
from collections import deque
IP_and_sessions = sessions_df.apply(lambda x: deque(calculate_difference_zipped_list(list(zip(x['date_1'],x['date_2'])))),axis=1)
IP_and_sessions

IPAdress
103e44bc19d6aac58db9a149c73e505b                                           [999, 999]
105781f3101367c473a91d52b6d4fd67    [999, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
10907c8edc0b2702015e04f49a8204a2    [999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
10915f6650d7b3ab000aafb953615c4e                                 [999, 1.0, 0.0, 999]
10dfc529d2b8f1a7ae6f94229848fbf     [999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                                                          ...                        
ff506e36e1385e8607d34532984b7d02                                           [999, 999]
ff5c8a55572a3208f2bb0c646d44b3ba    [999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
ff65eaf4c032d57ba8a48d4d2b1e79f7    [999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
ff878e6c4084e5734ab56d50c5f01cfd                                           [999, 999]
fff0a1c00ccb914d3e9ea95abc4502a1                                           [999, 999]
Length: 1127, dtype: object

#### 2.Identifying sessions - experimenting with time series

In [128]:
final_df


Unnamed: 0,0,IPAdress,Country,City,4,date,request,code,length,referant,ark
0,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:10:58:15 +0100] ""GET /ark:/12...",03/Mar/2017:10:58:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[]
1,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:10:58:41 +0100] ""GET /ark:/12...",03/Mar/2017:10:58:41 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[]
2,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:01:15 +0100] ""GET /ark:/12...",03/Mar/2017:11:01:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[]
3,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:11:01:41 +0100] ""GET /ark:/12...",03/Mar/2017:11:01:41 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[]
4,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:04:15 +0100] ""GET /ark:/12...",03/Mar/2017:11:04:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[]
...,...,...,...,...,...,...,...,...,...,...,...
170099,,62d6ebcb4c44ac6bc6b36479640341a9,United States,Mountain View,"- - [04/Mar/2017:05:20:40 +0100] ""GET /assets/...",04/Mar/2017:05:20:40 +0100,GET /assets/static/javascripts/vendor/jquery.m...,200,1261,http://gallica.bnf.fr/ark:/12148/btv1b10542978v,[]
170100,,e7fdec50f50253f6796d61b5382155f8,,,"- - [04/Mar/2017:05:20:41 +0100] ""GET /ark:/12...",04/Mar/2017:05:20:41 +0100,GET /ark:/12148/bpt6k70211m,200,32736,-,[]
170101,,62d6ebcb4c44ac6bc6b36479640341a9,United States,Mountain View,"- - [04/Mar/2017:05:20:42 +0100] ""GET /ark:/12...",04/Mar/2017:05:20:42 +0100,GET /ark:/12148/bpt6k209278x.thumbnail,200,2576,-,[bpt6k209278x]
170102,,62d6ebcb4c44ac6bc6b36479640341a9,United States,Mountain View,"- - [04/Mar/2017:05:20:47 +0100] ""GET /ark:/12...",04/Mar/2017:05:20:47 +0100,GET /ark:/12148/btv1b8439376v,302,20,-,[]


In [129]:
# index on ipadress/date
# queue 
final_df['previous_connexion_date']=final_df.apply(lambda x: IP_and_sessions[x['IPAdress']].popleft(),axis=1)

In [130]:
final_df

Unnamed: 0,0,IPAdress,Country,City,4,date,request,code,length,referant,ark,previous_connexion_date
0,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:10:58:15 +0100] ""GET /ark:/12...",03/Mar/2017:10:58:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[],999.0
1,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:10:58:41 +0100] ""GET /ark:/12...",03/Mar/2017:10:58:41 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[],999.0
2,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:01:15 +0100] ""GET /ark:/12...",03/Mar/2017:11:01:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[],3.0
3,,e7fdec50f50253f6796d61b5382155f8,,,"- - [03/Mar/2017:11:01:41 +0100] ""GET /ark:/12...",03/Mar/2017:11:01:41 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[],3.0
4,,320022e99796ca35dab7e63d48fd5e7,,,"- - [03/Mar/2017:11:04:15 +0100] ""GET /ark:/12...",03/Mar/2017:11:04:15 +0100,GET /ark:/12148/bpt6k70211m,503,-,-,[],3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
170099,,62d6ebcb4c44ac6bc6b36479640341a9,United States,Mountain View,"- - [04/Mar/2017:05:20:40 +0100] ""GET /assets/...",04/Mar/2017:05:20:40 +0100,GET /assets/static/javascripts/vendor/jquery.m...,200,1261,http://gallica.bnf.fr/ark:/12148/btv1b10542978v,[],0.0
170100,,e7fdec50f50253f6796d61b5382155f8,,,"- - [04/Mar/2017:05:20:41 +0100] ""GET /ark:/12...",04/Mar/2017:05:20:41 +0100,GET /ark:/12148/bpt6k70211m,200,32736,-,[],1.0
170101,,62d6ebcb4c44ac6bc6b36479640341a9,United States,Mountain View,"- - [04/Mar/2017:05:20:42 +0100] ""GET /ark:/12...",04/Mar/2017:05:20:42 +0100,GET /ark:/12148/bpt6k209278x.thumbnail,200,2576,-,[bpt6k209278x],0.0
170102,,62d6ebcb4c44ac6bc6b36479640341a9,United States,Mountain View,"- - [04/Mar/2017:05:20:47 +0100] ""GET /ark:/12...",04/Mar/2017:05:20:47 +0100,GET /ark:/12148/btv1b8439376v,302,20,-,[],0.0


In [131]:
session_id=0
def create_session(period):
    global session_id
    if(period>30):
        session_id += 1
    return session_id

In [132]:
final_df=final_df.sort_values(by=['IPAdress','date'])
final_df['session_id'] = final_df.apply(lambda x: create_session(x['previous_connexion_date']),axis=1)

In [133]:
final_df

Unnamed: 0,0,IPAdress,Country,City,4,date,request,code,length,referant,ark,previous_connexion_date,session_id
5235,,103e44bc19d6aac58db9a149c73e505b,United States,Menlo Park,"- - [03/Mar/2017:18:12:04 +0100] ""GET /resize?...",03/Mar/2017:18:12:04 +0100,GET /resize?w=90&url=http%3A%2F%2Fgallica.bnf....,404,380,-,[],999.0,1
12352,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:27:36 +0100] ""GET /iiif/ar...",03/Mar/2017:18:27:36 +0100,"GET /iiif/ark:/12148/bpt6k54673247/f1/0,0,3819...",200,19377,http://gallica.bnf.fr/ark:/12148/bpt6k54673247...,[bpt6k54673247],999.0,2
12637,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:27:51 +0100] ""GET /iiif/ar...",03/Mar/2017:18:27:51 +0100,"GET /iiif/ark:/12148/bpt6k54673247/f1/3584,358...",200,281,http://gallica.bnf.fr/ark:/12148/bpt6k54673247...,[bpt6k54673247],0.0,2
12871,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:28:12 +0100] ""GET /service...",03/Mar/2017:18:28:12 +0100,GET /services/ajax/extract/ark:/12148/bpt6k623...,200,645,http://gallica.bnf.fr/services/engine/search/s...,[bpt6k6239727c],0.0,2
14586,,105781f3101367c473a91d52b6d4fd67,France,Rubelles,"- - [03/Mar/2017:18:32:02 +0100] ""GET /iiif/ar...",03/Mar/2017:18:32:02 +0100,"GET /iiif/ark:/12148/bpt6k224257z/f1/5120,3072...",200,412,http://gallica.bnf.fr/ark:/12148/bpt6k224257z/...,[bpt6k224257z],3.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
115337,,ff65eaf4c032d57ba8a48d4d2b1e79f7,France,Saint-Chamond,"- - [03/Mar/2017:22:07:16 +0100] ""GET /servic...",03/Mar/2017:22:07:16 +0100,GET /services/ajax/pagination/page/SINGLE/ark:...,200,2827,http://gallica.bnf.fr/ark:/12148/bpt6k9687558k...,[bpt6k9687558k],0.0,1146
115340,,ff65eaf4c032d57ba8a48d4d2b1e79f7,France,Saint-Chamond,"- - [03/Mar/2017:22:07:16 +0100] ""GET /mbImag...",03/Mar/2017:22:07:16 +0100,GET /mbImage/perso/favicon.ico,200,570,-,[],0.0,1146
115349,,ff65eaf4c032d57ba8a48d4d2b1e79f7,France,Saint-Chamond,"- - [03/Mar/2017:22:07:16 +0100] ""GET /ark:/1...",03/Mar/2017:22:07:16 +0100,GET /ark:/12148/bpt6k9687558k/f36.highres,200,366334,http://gallica.bnf.fr/ark:/12148/bpt6k9687558k...,[bpt6k9687558k],0.0,1146
133125,,ff878e6c4084e5734ab56d50c5f01cfd,France,Angers,"- - [03/Mar/2017:23:11:05 +0100] ""GET /ark:/12...",03/Mar/2017:23:11:05 +0100,GET /ark:/12148/bpt6k1643872.thumbnail,200,5781,http://data.bnf.fr/13005787/theodore_perrin/,[bpt6k1643872],999.0,1147


In [136]:
sessions = final_df.groupby('session_id').agg({'ark':list})

In [135]:
final_df.iloc[1]["request"]

'GET /iiif/ark:/12148/bpt6k54673247/f1/0,0,3819,4096/239,/0/native.jpg'

In [191]:
def get_title_from_OAI(l):
    temp = []
    for ark in l:  
        title = ''
        if(len(ark)>0):
            oai_result = OAI(ark[0]) 
            if(oai_result != None ):
                try:
                    title = oai_result.get('results').get('notice').get('record').get('metadata').get('oai_dc:dc').get('dc:title')
                except:
                    title = ''
                
        temp.append(title)
    return temp

In [192]:
sessions_5 = sessions[:5]

In [193]:
sessions_5['document_titles_path'] = sessions_5.apply(lambda x: get_title_from_OAI(x['ark']),axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [201]:
sessions_5

Unnamed: 0_level_0,ark,document_titles_path
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,[[]],[]
2,"[[bpt6k54673247], [bpt6k54673247], [bpt6k62397...","[Le Progrès, Le Progrès, L'Avenir de Souk-Ahra..."
3,"[[], [], [], [], [bpt6k6308044k], [bpt6k759364...","[, , , , Le Rire : journal humoristique, L'Act..."
4,"[[bpt6k33258628], [bpt6k3321225p], [bpt6k62553...",[La pratique administrative dans la fonction p...
5,"[[bpt6k4453214], [], [], [], [], [], [], [], [...",[Matériaux pour l'histoire primitive et nature...


In [199]:
sessions_5.iloc[2]['document_titles_path']

['',
 '',
 '',
 '',
 'Le Rire : journal humoristique',
 "L'Action française : organe du nationalisme intégral / directeur politique : Henri Vaugeois ; rédacteur en chef : Léon Daudet",
 "L'Echo d'Alger : journal républicain du matin",
 'La Croix',
 "L'Écho de Paris",
 'Figaro : journal non politique',
 "L'Humanité : journal socialiste quotidien",
 'Le Journal',
 'Le Matin : derniers télégrammes de la nuit',
 "L'Ouest-Éclair : journal quotidien d'informations, politique, littéraire, commercial",
 'Le Petit journal',
 'Le Petit Parisien : journal quotidien du soir',
 'Le Temps',
 "La Situation actuelle en Italie, jugée d'après les conceptions démocratiques d'un Garibaldien / Ricciotti Garibaldi",
 "De l'Enseignement des sourds-muets par la parole, mémoire présenté à l'Académie nationale de Savoie, par J. Théobald,...",
 'Histoire de la navigation de Jean Hugues de Linschot Hollandois : aux Indes orientales, contenant diverses descriptions des lieux jusques à present descouverts par les P

In [195]:
sessions_5.iloc[3e]

ark                     [[bpt6k33258628], [bpt6k3321225p], [bpt6k62553...
document_titles_path    [La pratique administrative dans la fonction p...
Name: 4, dtype: object