<center><h1>Combining our logs data with metadata from Gallica</h1></center>

#### 1. Imports

In [1]:
import pandas as pd
import glob
import pandas as pd
import numpy as np
import re 
import shutil
import requests
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET
from ast import literal_eval

#### 2. Useful functions 

In [2]:
# OAI request to Gallica
def OAI(id, timeout=30):
    """Fetch the OAI record of one Gallica document and parse it as XML.

    Parameters
    ----------
    id : str
        ARK identifier of the document; it is appended verbatim to the
        OAIRecord service URL. (The name shadows the ``id`` builtin but is
        kept so existing callers keep working.)
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a single stalled
        connection would block the caller forever.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``lxml-xml`` parser. The HTTP
        status is not checked, so an error page is parsed like any other
        body.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when ``timeout`` is exceeded.
    """
    OAI_BASEURL = 'https://gallica.bnf.fr/services/OAIRecord?ark='

    url = "".join([OAI_BASEURL, id])

    # The body is consumed in full right away, so streaming brings nothing.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "lxml-xml")

    return soup


In [3]:
# dictionary of gallica themes, useful to interpret results from Gallica API queries
# Only lines of dewey.txt whose 3rd and 4th characters are "0 " are kept:
# the first two characters become the key, the rest of the line (trailing
# whitespace stripped) becomes the label.
dewey_classification = {}
# `with` guarantees the file handle is closed once the table is built.
with open("dewey.txt", "r", encoding='utf-8') as f:
    for x in f:
        # String slicing never raises, even on short or empty lines, so no
        # exception handling is needed here.
        if x[2:4] == '0 ':
            dewey_classification[x[0:2]] = x[4:].rstrip()

# Alias used by the metadata lookup code.
index_to_themes = dewey_classification

In [4]:
import time

def _fetch_ark_metadata(ark_id):
    """Query Gallica for one ARK and return ``(theme, date, title, language)``.

    Returns ``None`` when the request or the parsing fails for any reason
    (network error, missing field, unexpected record layout), so the caller
    can tell a failed lookup from a successful one.
    """
    try:
        oai_result = OAI(ark_id)
        if oai_result is None:
            return ('', '', '', '')

        theme = ''
        set_specs = oai_result.results.notice.record.header.find_all("setSpec")
        for spec in set_specs:
            if "theme" in spec.text:
                # The first two characters of the 4th ':'-separated field are
                # the key into index_to_themes; an unknown key yields None.
                theme = index_to_themes.get(spec.text.split(':')[3][:2])

        date = oai_result.find("date").text
        title = oai_result.find("title").text
        language = oai_result.find("language").text
    except Exception:
        # Best effort: one bad record must not abort the whole run.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        return None

    return (theme, date, title, language)


def get_theme_from_ark(l, cache=None):
    """Get theme, date, title and language for every ARK of one session.

    Lookups are cached: when the same document appears several times, the
    Gallica API is called only once for it.

    Parameters
    ----------
    l : list of lists
        One entry per consulted document, shaped like
        ``[[ark1], [ark2], [ark3]]``. Only the first item of each inner list
        is used; empty inner lists are allowed.
    cache : dict, optional
        Maps an ARK id to its ``(theme, date, title, language)`` tuple.
        A fresh dict is used for each call by default; pass a shared dict to
        reuse results across calls. Failed lookups are not stored, so they
        are retried on the next occurrence.

    Returns
    -------
    pd.Series
        Four lists, in this order: themes, dates, titles, languages. Each is
        aligned with ``l``. Entries are ``''`` for empty inner lists and for
        failed lookups; a theme may be ``None`` when its code is missing
        from ``index_to_themes``.
    """
    if cache is None:
        cache = {}

    temp_theme = []
    temp_date = []
    temp_title = []
    temp_language = []

    for ark in l:
        metadata = None
        # if ark is not empty
        if len(ark) > 0:
            ark_id = ark[0]
            try:
                metadata = cache.get(ark_id)
                cacheable = True
            except TypeError:
                # Unhashable id: it cannot be cached, just attempt the lookup.
                cacheable = False

            if metadata is None:
                metadata = _fetch_ark_metadata(ark_id)
                if cacheable and metadata is not None:
                    cache[ark_id] = metadata

        if metadata is None:
            metadata = ('', '', '', '')
        theme, date, title, language = metadata

        temp_date.append(date)
        temp_theme.append(theme)
        temp_title.append(title)
        temp_language.append(language)

    return pd.Series([temp_theme, temp_date, temp_title, temp_language])

#### 3. Gather data 

In [None]:
# read sessions previously gathered
SESSIONS_CSV = 'SessionsApril2016from300to1000_clean.csv'
try:
    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
    # on_bad_lines='warn' skips malformed rows and reports them.
    sessions = pd.read_csv(SESSIONS_CSV, engine='python', on_bad_lines='warn')
except TypeError:
    # Older pandas (< 1.3) does not know `on_bad_lines`.
    sessions = pd.read_csv(SESSIONS_CSV, engine='python', error_bad_lines=False)
# The Ark column is read back as text; literal_eval turns each cell back into
# a Python object without executing arbitrary code.
sessions.Ark = sessions.Ark.apply(literal_eval)

In [6]:
# Display the loaded sessions as the cell output.
sessions

Unnamed: 0,Ark,Date,Country,City
0,"[[bpt6k8730899], [bpt6k9664572c], [bpt6k967442...","['12/Apr/2016:01:28:15 +0200', '12/Apr/2016:02...",Mexico,Mexico City
1,"[[bpt6k8630600z], [btv1b53118063c], [bpt6k9672...","['06/Apr/2016:15:38:02 +0200', '06/Apr/2016:15...",France,Paris
2,"[[btv1b69253712], [btv1b69253712], [btv1b69176...","['08/Apr/2016:10:30:43 +0200', '08/Apr/2016:11...",France,Paris
3,"[[bpt6k9664572c], [btv1b531180658], [bpt6k9668...","['09/Apr/2016:23:22:21 +0200', '09/Apr/2016:23...",France,Paris
4,"[[bpt6k122592m], [bpt6k122592m]]","['08/Apr/2016:21:26:02 +0200', '08/Apr/2016:21...",Egypt,
...,...,...,...,...
104187,"[[btv1b84472995], [btv1b84472995], [btv1b84472...","['09/Apr/2016:00:24:02 +0200', '09/Apr/2016:00...",Poland,
104188,"[[bpt6k5762405d], [bpt6k29835d], [bpt6k29835d]...","['09/Apr/2016:00:54:28 +0200', '09/Apr/2016:00...",France,Vouzeron
104189,"[[btv1b105359774], [bpt6k9672622g], [btv1b1053...","['09/Apr/2016:22:56:25 +0200', '09/Apr/2016:22...",Algeria,Tizi Ouzou
104190,"[[bpt6k111503c], [bpt6k111503c], [bpt6k111503c...","['07/Apr/2016:22:58:38 +0200', '07/Apr/2016:22...",France,Cubzac-les-Ponts


In [10]:
# drop repeated consecutive ARKs and empty entries
def remove_consecutive_duplicates(l):
    """Return ``l`` without empty lists and without immediate repeats.

    Each element is compared with the element right before it in the
    *original* list, so an empty entry sitting between two identical ARKs
    breaks the run and both ARKs are kept.
    """
    kept = []
    for position, current in enumerate(l):
        if current == []:
            continue
        if position > 0 and current == l[position - 1]:
            continue
        kept.append(current)
    return kept

    
# Clean every session's ARK list (column-wise apply, one list per row).
sessions['Ark'] = sessions['Ark'].apply(remove_consecutive_duplicates)

In [14]:
# Keep only sessions whose number of ARKs is strictly between 3 and 50
sessions["len"] = sessions["Ark"].apply(len)
length_in_range = (sessions["len"] > 3) & (sessions["len"] < 50)
sessions = sessions[length_in_range]
# Number of sessions left after filtering
len(sessions)

31363

In [15]:
# Split the sessions by row position into 3 chunks so the Gallica lookups
# can be run one chunk at a time
first_chunk = sessions.iloc[:10000]
second_chunk = sessions.iloc[10000:20000]
third_chunk = sessions.iloc[20000:]

In [None]:
# Fetch the metadata of the first chunk batch by batch and append every batch
# to the CSV, so finished batches are already on disk if a later one fails.
# NOTE: the file is opened in append mode, so re-running this cell adds the
# same rows again.
BATCH_SIZE = 200
for i in range(0, len(first_chunk), BATCH_SIZE):
    batch_end = i + BATCH_SIZE
    temp_1 = first_chunk[i:batch_end].apply(
        lambda row: get_theme_from_ark(row['Ark']),
        axis=1,
    )
    temp_1.to_csv('first_chunk_300_1000.csv', mode='a', header=False)
    # progress marker: end offset of the batch just written
    print(batch_end)
    