# Processing XML files with the ***lxml*** library

## Imports

In [1]:
import glob                  
import os
from lxml import etree as ET 
import zipfile
import csv
import logging
import shutil
import re

In [2]:
# Configure the root logger so that messages of level INFO and above are
# emitted; logging.debug() calls are therefore not displayed.
logging.basicConfig(level='INFO')

In [3]:
# Create the CSV file that holds the point-of-sale (pdv) data.
# newline='' lets the csv module control line endings itself, as its
# documentation requires for files handed to a writer.
# The handle is kept open at module level because rows are appended while the
# XML is processed; it must be closed once processing is finished.
pdvcsvfile = open('pdv.csv', 'w', newline='')
pdvnames = [
    "annee",
    "id",
    "latitude",
    "longitude",
    "cp",
    "pop",
    "ouverture_debut",
    "ouverture_fin",
    "ouverture_saufjour",
    "adresse",
    "ville",
]
pdv_writer = csv.DictWriter(pdvcsvfile, fieldnames=pdvnames)
pdv_writer.writeheader()

In [4]:
# Create the CSV file that holds the "ruptures" data.
# newline='' lets the csv module control line endings itself, as its
# documentation requires for files handed to a writer.
# The handle is kept open at module level because rows are appended while the
# XML is processed; it must be closed once processing is finished.
rupturescsvfile = open('ruptures.csv', 'w', newline='')
rupturesnames = [
    "annee",
    "id_pdv",
    "id",
    "nom",
    "debut",
    "fin",
]
ruptures_writer = csv.DictWriter(rupturescsvfile, fieldnames=rupturesnames)
ruptures_writer.writeheader()

In [5]:
# Create the CSV file that holds the "fermetures" data.
# newline='' lets the csv module control line endings itself, as its
# documentation requires for files handed to a writer.
# The handle is kept open at module level because rows are appended while the
# XML is processed; it must be closed once processing is finished.
fermeturescsvfile = open('fermetures.csv', 'w', newline='')
fermeturesnames = [
    "annee",
    "id_pdv",
    "type",
    "debut",
    "fin",
]
fermetures_writer = csv.DictWriter(fermeturescsvfile, fieldnames=fermeturesnames)
fermetures_writer.writeheader()

In [6]:
# Create the CSV file that holds the price ("prix") data.
# newline='' lets the csv module control line endings itself, as its
# documentation requires for files handed to a writer.
# The handle is kept open at module level because rows are appended while the
# XML is processed; it must be closed once processing is finished.
prixcsvfile = open('prix.csv', 'w', newline='')
prixnames = [
    "annee",
    "id_pdv",
    "nom",
    "id",
    "maj",
    "valeur",
]
prix_writer = csv.DictWriter(prixcsvfile, fieldnames=prixnames)
prix_writer.writeheader()

In [7]:
# Create the CSV file that holds the services data.
# newline='' lets the csv module control line endings itself, as its
# documentation requires for files handed to a writer.
# The handle is kept open at module level because rows are appended while the
# XML is processed; it must be closed once processing is finished.
servicescsvfile = open('services.csv', 'w', newline='')
servicesnames = [
    "annee",
    "id_pdv",
    "service",
]
services_writer = csv.DictWriter(servicescsvfile, fieldnames=servicesnames)
services_writer.writeheader()

In [8]:
# Create the CSV file that holds the opening-hours ("horaires") data.
# newline='' lets the csv module control line endings itself, as its
# documentation requires for files handed to a writer.
# The handle is kept open at module level because rows are appended while the
# XML is processed; it must be closed once processing is finished.
horairescsvfile = open('horaires.csv', 'w', newline='')
horairesnames = [
    "annee",
    "id_pdv",
    "jour",
    "ouverture",
    "fermeture",
]
horaires_writer = csv.DictWriter(horairescsvfile, fieldnames=horairesnames)
horaires_writer.writeheader()

In [9]:
def txt_path(element, path='.'):
    """Return the text of the first node matching *path*, or an empty string.

    Args:
        element: XML element to search from (anything exposing ``find``).
        path: path passed to ``element.find``; the default ``'.'`` selects
            ``element`` itself.

    Returns:
        The node's text as a ``str``. An empty string is returned both when
        no node matches and when the matching node has no text, so the
        literal ``"None"`` never leaks into the output.
    """
    # Look the node up once instead of once for the test and once for the read.
    node = element.find(path)
    if node is None or node.text is None:
        return ""
    return str(node.text)

In [10]:
def process_horaires(annee, id_pdv, jour):
    """Write one ``horaires`` CSV row per ``<horaire>`` child of a ``<jour>``.

    Args:
        annee: value stored in the ``annee`` column.
        id_pdv: value stored in the ``id_pdv`` column.
        jour: XML element for one day. Its ``nom`` attribute fills the
            ``jour`` column; each ``horaire`` child contributes its
            attributes (expected: ``ouverture``, ``fermeture``).

    Nothing is written when ``jour`` has no ``horaire`` child.
    """
    # Read the day name by attribute name rather than by position, so the
    # result does not depend on attribute order or on how many attributes
    # the element carries.
    nom_jour = jour.get('nom')
    for horaire in jour.findall('horaire'):
        # Start from a fresh row for every <horaire> so that a missing
        # attribute is left empty instead of inheriting the previous value.
        horaires_row = dict.fromkeys(horairesnames)
        horaires_row['annee'] = annee
        horaires_row['id_pdv'] = id_pdv
        horaires_row['jour'] = nom_jour
        horaires_row.update(horaire.items())
        horaires_writer.writerow(horaires_row)

In [11]:
def process_prix(annee, id_pdv, prix):
    """Append one row to the prices CSV for a single ``<prix>`` element.

    Args:
        annee: value stored in the ``annee`` column.
        id_pdv: value stored in the ``id_pdv`` column.
        prix: XML element whose attributes (expected: nom, id, maj, valeur)
            are copied into the columns of the same name.
    """
    row = {column: None for column in prixnames}
    row.update(annee=annee, id_pdv=id_pdv)
    # Every attribute of the element becomes a column value.
    row.update(sorted(prix.items()))
    prix_writer.writerow(row)

In [12]:
def process_service(annee, id_pdv, service):
    """Append one row to the services CSV for a single ``<service>`` element.

    Args:
        annee: value stored in the ``annee`` column.
        id_pdv: value stored in the ``id_pdv`` column.
        service: XML element whose own text (read through ``txt_path``) is
            stored in the ``service`` column.
    """
    row = dict.fromkeys(servicesnames)
    row.update({
        'annee': annee,
        'id_pdv': id_pdv,
        "service": txt_path(service),
    })
    services_writer.writerow(row)

In [13]:
def process_rupture(annee, id_pdv, rupture):
    """Append one row to the ruptures CSV for a single ``<rupture>`` element.

    Args:
        annee: value stored in the ``annee`` column.
        id_pdv: value stored in the ``id_pdv`` column.
        rupture: XML element whose attributes (expected: id, nom, debut,
            fin) are copied into the columns of the same name.
    """
    row = {column: None for column in rupturesnames}
    row.update(annee=annee, id_pdv=id_pdv)
    # Every attribute of the element becomes a column value.
    row.update(sorted(rupture.items()))
    ruptures_writer.writerow(row)

In [14]:
def process_fermeture(annee, id_pdv, fermeture):
    """Append one row to the fermetures CSV for a single ``<fermeture>`` element.

    Args:
        annee: value stored in the ``annee`` column.
        id_pdv: value stored in the ``id_pdv`` column.
        fermeture: XML element whose attributes (expected: type, debut, fin)
            are copied into the columns of the same name.
    """
    row = {column: None for column in fermeturesnames}
    row.update(annee=annee, id_pdv=id_pdv)
    # Every attribute of the element becomes a column value.
    row.update(sorted(fermeture.items()))
    fermetures_writer.writerow(row)

In [15]:
def process_pdv(annee, pdv):
    """Write one ``<pdv>`` element and all of its children to the CSV files.

    The pdv row is built from the element's attributes (expected: id,
    latitude, longitude, cp, pop), the text of its ``adresse`` and ``ville``
    children and, when present, the attributes of its ``ouverture`` child
    (expected: debut, fin, saufjour) prefixed with ``ouverture_``. Each
    ``prix``, ``services/service``, ``rupture``, ``fermeture`` and
    ``horaires/jour`` descendant is handed to its dedicated handler and then
    cleared. The pdv row itself is written last.

    Args:
        annee: value stored in the ``annee`` column of every row written.
        pdv: XML element describing one point of sale.
    """
    row = dict.fromkeys(pdvnames)
    row['annee'] = annee
    row['adresse'] = txt_path(pdv, 'adresse')
    row['ville'] = txt_path(pdv, 'ville')
    row.update(sorted(pdv.items()))

    ouverture = pdv.find('ouverture')
    if ouverture is not None:
        for attr, val in sorted(ouverture.items()):
            row["ouverture_" + attr] = val

    id_pdv = row['id']

    # (path below <pdv>, handler) pairs, in the order they are processed.
    child_handlers = (
        ('prix', process_prix),
        ('services/service', process_service),
        ('rupture', process_rupture),
        ('fermeture', process_fermeture),
        ('horaires/jour', process_horaires),
    )
    for child_path, handler in child_handlers:
        for child in pdv.findall(child_path):
            handler(annee, id_pdv, child)
            child.clear()

    pdv_writer.writerow(row)

In [16]:
def process_all(data_file="."):
    """Process every ``PrixC*.zip`` archive found in a directory.

    For each archive, the year is taken from the four digits that end the
    file name (before the extension), and the XML member that is read has the
    same base name as the archive with an ``.xml`` extension. Every ``<pdv>``
    element of that member is passed to ``process_pdv``.

    Args:
        data_file: directory searched for the archives (default: current
            directory).

    Raises:
        KeyError: if an archive does not contain the expected XML member
            (raised by ``ZipFile.open``).
    """
    for zip_file in glob.glob(data_file + '/PrixC*.zip'):
        # Work on the base name so the result does not depend on the path
        # separator used by the platform.
        stem = os.path.splitext(os.path.basename(zip_file))[0]
        year_match = re.search(r'\d{4}$', stem)
        if year_match is None:
            # The glob pattern also matches names without a trailing year;
            # those cannot be attributed to a year, so skip them explicitly.
            logging.warning(
                "Skipping %s: file name does not end with a 4-digit year",
                zip_file)
            continue
        annee = year_match.group(0)
        xml = stem + ".xml"

        logging.debug("Opening ZIP file %s", zip_file)
        # Context managers close both the archive and the member even when
        # parsing or processing raises.
        with zipfile.ZipFile(zip_file, 'r') as zf, zf.open(xml) as f:
            # Iterate over the completed <pdv> elements only.
            context = ET.iterparse(f, events=('end',), tag='pdv')
            for _event, pdv in context:
                process_pdv(annee, pdv)
                pdv.clear()  # clear the processed sub-tree from memory

In [17]:
# Directory searched for the 'PrixC*.zip' archives.
data_file = 'data'
# Alternative input directory:
# data_file = '/tmp/gas'
process_all(data_file)

In [18]:
# Close every CSV handle so that buffered rows are flushed to disk.
pdvcsvfile.close()
rupturescsvfile.close()
fermeturescsvfile.close()
servicescsvfile.close()
prixcsvfile.close()
# horaires.csv is opened like the other files and must be closed as well,
# otherwise its last buffered rows may not be written.
horairescsvfile.close()

In [19]:
## Verify insertions count
# Print, for every CSV file in the current directory, its name without the
# '.csv' extension and its number of lines (header line included).
print("{:10} {}\n".format("CSVs","nb de lignes"))
# The loop variable must not be called `csv`: that would rebind the imported
# csv module for the rest of the session.
for csv_path in glob.glob('*.csv'):
    with open(csv_path) as c:
        # Count lines while streaming instead of loading the whole file.
        print("{:10} {}".format(csv_path[:-4], sum(1 for _ in c)))

CSVs       nb de lignes

prix       28552955
horaires   26584
fermetures 139614
pdv        140882
ruptures   260876
services   837092
