In [1]:
import os
import time

def scan_directory(directory, pdfs=None):
    """
    Recursively scan ``directory`` and collect metadata for every PDF found.

    Args:
        directory: Path of the folder to scan.
        pdfs: Optional list that results are appended to. A new list is
            created when omitted; the recursive calls share one accumulator
            through this parameter.

    Returns:
        The accumulator list of ``(name, path, size, modification_date)``
        tuples, where ``size`` is ``st_size`` and ``modification_date`` is
        ``time.ctime(st_mtime)``.

    Raises:
        OSError: If a directory cannot be listed or a PDF cannot be stat'ed.
    """
    if pdfs is None:
        pdfs = []

    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        if os.path.isdir(path):
            scan_directory(path, pdfs)
        elif name.lower().endswith('.pdf'):
            # Stat only the files that are kept: a dangling symlink or an
            # unreadable non-PDF entry must not abort the whole scan.
            # The extension test is case-insensitive so '.PDF' is not missed.
            info = os.stat(path)
            pdfs.append((name, path, info.st_size, time.ctime(info.st_mtime)))

    return pdfs

# Build the PDF inventory by scanning the data folder (path is relative to the working directory).
pdfs = scan_directory('../01_data/BBDD_Normativa_UPV')
# Display the first (name, path, size, modification_date) tuple as a quick sanity check.
pdfs[0]

('2013006700037300000387.pdf',
 '../01_data/BBDD_Normativa_UPV/2013/2013006700037300000387.pdf',
 550831,
 'Mon Nov 19 13:12:02 2018')

In [2]:
# Query the database to detect whether a file's information is already stored in it
import sqlite3

def connect_db(path):
    """
    Open the SQLite database at ``path``.

    Returns:
        A ``(connection, cursor)`` pair, where the cursor was created from
        the returned connection.
    """
    db = sqlite3.connect(path)
    return db, db.cursor()

def disconnect_db(connection):
    """
    Commit pending changes and close the database connection.

    Args:
        connection: The open connection to commit and close.

    Raises:
        Whatever ``connection.commit()`` raises; the connection is closed
        before the error propagates.
    """
    try:
        connection.commit()
    finally:
        # Close even when the commit fails so the handle is never leaked.
        connection.close()

def register_pdf(cursor, name, path, date, size):
    """
    Store a new PDF row and record its creation in the log table.

    The row goes into ``pdfs`` with the given name, path, size and date; a
    'CREATION' entry pointing at the new row (with no previous id) goes into
    ``logs``. Both statements are parameterized.
    """
    insert_pdf = """INSERT INTO pdfs (name, path, size, date) VALUES (?, ?, ?, ?)"""
    insert_log = """INSERT INTO logs (pdf_id, previous_id, date, action) VALUES (?, NULL, ?, 'CREATION')"""

    cursor.execute(insert_pdf, (name, path, size, date))
    # lastrowid is read before the log INSERT runs, so it still refers to the pdfs row.
    cursor.execute(insert_log, (cursor.lastrowid, date))

def modify_pdf(cursor, date, pdf_id, pdf_old_id):
    """
    Retire the previous version of a PDF and log the replacement.

    The row ``pdf_old_id`` is flagged as not in use and its ``last_use`` is
    set to ``date``; a 'MODIFICATION' log entry then links ``pdf_id`` to
    ``pdf_old_id`` as its previous id. Both statements are parameterized.
    """
    retire_old = """UPDATE pdfs SET in_use = FALSE, last_use = ? WHERE id = ?"""
    log_change = """INSERT INTO logs (pdf_id, previous_id, date, action) VALUES (?, ?, ?, 'MODIFICATION')"""

    for statement, values in (
        (retire_old, (date, pdf_old_id)),
        (log_change, (pdf_id, pdf_old_id, date)),
    ):
        cursor.execute(statement, values)

def stop_using_pdf(cursor, pdf_id, date):
    """
    Flag the PDF row ``pdf_id`` as not in use and log a 'NO_USE' entry.

    The log entry stores ``pdf_id`` in the ``previous_id`` column and leaves
    the log's own ``pdf_id`` column NULL.
    """
    flag_unused = """UPDATE pdfs SET in_use = FALSE WHERE id = ?"""
    log_unused = """INSERT INTO logs (pdf_id, previous_id, date, action) VALUES (NULL, ?, ?, 'NO_USE')"""

    cursor.execute(flag_unused, (pdf_id,))
    cursor.execute(log_unused, (pdf_id, date))

def detect_updates(cursor, name, path, size, date):
    """
    Register a scanned PDF or detect that its stored record is out of date.

    Looks up the record for ``path`` that is still in use and compares its
    stored size and date with the scanned values.

    Args:
        cursor: Database cursor used for the lookup and any writes.
        name: File name of the PDF.
        path: Path of the PDF; used as the lookup key.
        size: Scanned file size.
        date: Scanned modification date, compared for equality with the
            stored value.

    Returns:
        "NEW"   - no in-use record existed; a new one was registered.
        "MOD"   - size or date differ; the old record was retired with
                  ``stop_using_pdf`` and a new one registered.
        "NOMOD" - the in-use record matches; nothing was written.
    """
    # Only the record still in use is relevant. Rows retired by
    # stop_using_pdf keep the same path, so matching on path alone can return
    # a stale row and report "MOD" (inserting a duplicate) on every scan.
    # `IS NOT FALSE` also accepts a NULL in_use; the newest id wins if several
    # rows qualify (assumes ids grow with insertion order - TODO confirm).
    cursor.execute(
        """SELECT id, size, date FROM pdfs WHERE path = ? AND in_use IS NOT FALSE ORDER BY id DESC LIMIT 1""",
        (path,),
    )
    data = cursor.fetchone()

    if data:
        pdf_id, stored_size, stored_date = data
        if stored_size != size or stored_date != date:
            # Retire the outdated record, then store the current file state as a new row.
            stop_using_pdf(cursor, pdf_id, date)
            register_pdf(cursor, name, path, date, size)
            return "MOD"
        return "NOMOD"

    register_pdf(cursor, name, path, date, size)
    return "NEW"

In [3]:
# Open the project database. This cell neither commits nor closes the connection.
# NOTE(review): the recorded output of this cell is
# "OperationalError: table logs has no column named previous_id", i.e. the
# database file's schema did not match the statements used by the helpers.
conexion, cursor = connect_db('../01_data/project_database.db')

# One status string per scanned PDF, as returned by detect_updates ("NEW", "MOD" or "NOMOD").
lista = []

for pdf in pdfs:
    # Each entry is a (name, path, size, modification_date) tuple.
    nombre, ruta, size, date = pdf
    lista.append(detect_updates(cursor, nombre, ruta, size, date))

OperationalError: table logs has no column named previous_id

In [14]:
# Commit pending changes and close the connection. Calls disconnect_db, the
# helper actually defined in this notebook (the old name is undefined on a fresh kernel).
disconnect_db(conexion)

In [5]:
# Reopen the project database. Calls connect_db, the helper actually defined
# in this notebook (the old name is undefined on a fresh kernel).
conexion, cursor = connect_db('../01_data/project_database.db')

In [None]:
# Column order of the displayed rows: hash, name, path, size, date
datos = cursor.execute("""SELECT * FROM pdfs""").fetchall()
datos

[(-9206002863770227617,
  '2024202400163600002156.pdf',
  '../01_data/BBDD_Normativa_UPV/2024/2024202400163600002156.pdf',
  430545.0,
  'Sun Jul  7 19:33:00 2024'),
 (-9202889180325441163,
  '2020013400107900001093.pdf',
  '../01_data/BBDD_Normativa_UPV/2020/2020013400107900001093.pdf',
  174533.0,
  'Sat Aug 15 21:22:10 2020'),
 (-9196298332774484602,
  '2021014300116300001210.pdf',
  '../01_data/BBDD_Normativa_UPV/2021/2021014300116300001210.pdf',
  171731.0,
  'Wed Jul  7 11:33:38 2021'),
 (-9195319733808271639,
  '2022202200139700001680.pdf',
  '../01_data/BBDD_Normativa_UPV/2022/2022202200139700001680.pdf',
  243891.0,
  'Mon Oct  9 20:38:34 2023'),
 (-9193990204548963421,
  '2022202200136500001613.pdf',
  '../01_data/BBDD_Normativa_UPV/2022/2022202200136500001613.pdf',
  418708.0,
  'Sat Oct  7 17:45:10 2023'),
 (-9193622631039048571,
  '2009003100017400000188.pdf',
  '../01_data/BBDD_Normativa_UPV/2009/2009003100017400000188.pdf',
  35457.0,
  'Mon Dec  2 19:57:36 2019'),
 (-91