# Processing XMLs with ***lxml*** library

## Imports

In [2]:
import glob                  
import os
from lxml import etree as ET 
import zipfile
import csv
import logging
import shutil
import re

In [3]:
# Création du CSV qui contient les données des points de ventes
pdvcsvfile = open('pdv.csv', 'w')
pdvnames   = ["annee",
              "id",
              "latitude",
              "longitude",
              "cp",
              "pop",
              "ouverture_debut",
              "ouverture_fin",
              "ouverture_saufjour",
              "adresse",
              "ville"]
pdv_writer = csv.DictWriter(pdvcsvfile, fieldnames=pdvnames)
pdv_writer.writeheader()

# Création du CSV qui contient les données des ruptures
rupturescsvfile = open('ruptures.csv', 'w')
rupturesnames = ["annee",
                 "id_pdv",
                 "id",
                 "nom",
                 "debut",
                 "fin"]
ruptures_writer = csv.DictWriter(rupturescsvfile, fieldnames=rupturesnames)
ruptures_writer.writeheader()

# Création du CSV qui contient les données des fermetures
fermeturescsvfile = open('fermetures.csv', 'w')
fermeturesnames = ["annee",
                   "id_pdv",
                   "type",
                   "debut",
                   "fin"]
fermetures_writer = csv.DictWriter(fermeturescsvfile, fieldnames=fermeturesnames)
fermetures_writer.writeheader()


# Création du CSV qui contient les données des prix
prixcsvfile = open('prix.csv', 'w')
prixnames   = ["annee",
              "id_pdv",
              "nom",
              "id",
              "maj",
              "valeur"]
prix_writer = csv.DictWriter(prixcsvfile, fieldnames=prixnames)
prix_writer.writeheader()

# Création du CSV qui contient les données des services
servicescsvfile = open('services.csv', 'w')
servicesnames   = ["annee",
                   "id_pdv",
                   "service"]
services_writer = csv.DictWriter(servicescsvfile, fieldnames=servicesnames)
services_writer.writeheader()


# Initialisation des dictionnaire des lignes temporiaires à inserer dans les csv
pdv_row        = dict.fromkeys(pdvnames)
ruptures_row   = dict.fromkeys(rupturesnames)
fermetures_row = dict.fromkeys(fermeturesnames)
prix_row       = dict.fromkeys(prixnames)
services_row   = dict.fromkeys(servicesnames)

In [4]:
# txt_path : fonction qui prend une balise du XML (element), 
	# le chemin de la valeur qu’on souhaite récupérer (path) 
def txt_path(element, path='.'):
    if element.find(path) is not None:
        res = str(element.find(path).text)
        return res
    return ""

In [5]:
def process_prix(annee, id_pdv, prix):
    prix_row['annee']  = annee
    prix_row['id_pdv'] = id_pdv
    for name, value in sorted(prix.items()):
        prix_row[name] = value
    prix_writer.writerow(prix_row)

In [6]:
def process_service(annee, id_pdv, service):
    services_row['annee']   = annee
    services_row['id_pdv']  = id_pdv
    services_row["service"] = txt_path(service)
    services_writer.writerow(services_row)

In [7]:
def process_rupture(annee, id_pdv, rupture):
    ruptures_row['annee']  = annee
    ruptures_row['id_pdv'] = id_pdv
    for name, value in sorted(rupture.items()):
        ruptures_row[name] = value
    ruptures_writer.writerow(ruptures_row)

In [8]:
def process_fermeture(annee, id_pdv, fermeture):
    fermetures_row['annee']  = annee
    fermetures_row['id_pdv'] = id_pdv
    for name, value in sorted(fermeture.items()):
        fermetures_row[name] = value
    fermetures_writer.writerow(fermetures_row)

In [12]:
def process_pdv(annee, pdv):
    pdv_row['annee'] = annee
    pdv_row['adresse'] = txt_path(pdv, 'adresse')
    pdv_row['ville'] = txt_path(pdv, 'ville')
    
    for name, value in sorted(pdv.items()):
        pdv_row[name] = value
    if not pdv.find('ouverture') is None:
        for name, value in sorted(pdv.find('ouverture').items()):
            pdv_row["ouverture_" + name] = value
    
    id_pdv = pdv_row['id']
    for prix in pdv.findall('prix'):
        process_prix(annee, id_pdv, prix)
    
    for service in pdv.findall('services/service'):
        process_service(annee, id_pdv, service)
    
    for rupture in pdv.findall('rupture'):
        process_rupture(annee, id_pdv, rupture)
        
    for fermeture in pdv.findall('fermeture'):
        process_fermeture(annee, id_pdv, fermeture)
        
    pdv_writer.writerow(pdv_row)

In [10]:
for zip_file in glob.glob('data/PrixC*2019.zip'):
    annee   = re.findall('.*(\d{4})\.zip',zip_file)[0]
    xml     = re.findall('([^/]*\d{4})\.zip',zip_file)[0] + ".xml"

    # lire le zip
    logging.debug("Opening ZIP file " + zip_file)    
    zf = zipfile.ZipFile(zip_file, 'r')
    # lire le XML
    f  = zf.open(xml)
    # créer le XMLReader
    context = ET.iterparse(f, events=('end',), tag='pdv')
    for event, pdv in context:
        process_pdv(annee, pdv)
        pdv.close()
    # fermer le zip
    zf.close()

AttributeError: 'NoneType' object has no attribute 'items'

In [11]:
pdvcsvfile.close()
rupturescsvfile.close()
fermeturescsvfile.close()
servicescsvfile.close()
prixcsvfile.close()

In [None]:
print("{:10} {}\n".format("CSVs","nb de lignes"))
for csv in glob.glob('*.csv'):
    with open(csv) as c:
        print("{:10} {}".format(csv[:-4],len(c.readlines())))