In [None]:
import re

lines = None
cleaned_lines = []
error_lines = []

with open("L1-books.csv", encoding="utf-8") as f:
    lines = f.readlines()



re_perfect_line = re.compile(r'''
^
(?:"((?:[^"]|"")*)"|([^,]+))             # 1️⃣ Title (quoted or unquoted, required)
,\s*
(?:"((?:[^"]|"")*)"|([^,]*))?            # 2️⃣ Description (quoted, unquoted, or missing)
,\s*
(?:"(\[.*?\])"|(\[.*?\])|)?              # 3️⃣ Authors (quoted array, unquoted array, or missing)
,\s*
([^,]*)                                  # 4️⃣ Publisher (string, possibly empty)
,\s*
(\d{4}(?:-\d{2}){0,2}|)?                 # 5️⃣ Date (YYYY / YYYY-MM / YYYY-MM-DD or missing)
,\s*
(?:"(\[.*?\])"|(\[.*?\])|)?              # 6️⃣ Categories (array or missing)
\s*$
''', re.VERBOSE)

re_date_end = re.compile(r'\d$')

re_unquoted_title_multi = re.compile(r'''
^
(?!")            # no empieza con comillas
(.+?,)           # grupo 1: título potencial (mínimo hasta una coma)
(?=(?:[^"]*"[^"]*")*[^"]*$)  # asegura que no estamos dentro de comillas
(?=[^[]*\[)      # y antes del primer array [
''', re.VERBOSE)

def quote_unquoted_title(line: str) -> str:
    """
    Si el título no empieza con comillas pero contiene una o más comas
    antes del siguiente bloque (array o descripción), encierra el título en comillas.
    """
    if line.startswith('"'):
        return line  # ya está bien
    # cortar al primer array o hasta donde empiecen las comas separadoras
    parts = line.split(',[')
    if len(parts) > 1:
        before_array = parts[0]
        # Si hay más de una coma antes del primer array, probablemente sea parte del título
        if before_array.count(',') >= 2:
            # título = todo antes de la segunda coma vacía o antes del array
            title, rest = before_array, line[len(before_array):]
            return f'"{title.strip()}"{rest}'
    return line

with open("L1-books-reviewed.csv", "w") as f:
    for line in lines:
        line = line.strip()

        # if end_date_pattern_1.search(line):
        #     line = line + ","
        
        if bool(re_perfect_line.match(line)):
            f.write("1," + line + "\n")
        else:
            f.write("0," + line + "\n")

In [35]:
for line in error_lines[:10]:
    print(line)

Title,description,authors,publisher,publishedDate,categories
"Voices from the tapes, recordings from the other world,",['Peter Bander'],,1973,
"The Impatient Gardener's Lawn Book,"Explains how to grow a green, beautiful lawn and offers advice on fertilizers, watering, and lawn diseases"",['Jerry Baker'],"Random House Digital, Inc.",1987,['Gardening']
"Natural Wonders of the World: Converting Measurements to Metric Units (Powermath),"This book explores some of the world's most famous natural wonders, explaining measurements in the U.S. customary system and the metric system.","[""Kerri O'Donnell""]","The Rosen Publishing Group, Inc",2004-08-01",['Juvenile Nonfiction']
The book of the lover and the beloved,,A spiritual philosophy which calls our attention to contemplate our connection with God. In this remarkable mystic work Lull shares his personal views about romance with his readers. Splendid and motivating.,"['Ramon Llull', 'Kenneth Leech', 'Edgar Allison Peers']",,1978,
"Foreignisms

In [None]:

print(error_lines[1])

Dr. Seuss: American Icon,"Philip Nel takes a fascinating look into the key aspects of Seuss's career - his poetry, politics, art, marketing, and place in the popular imagination."" ""Nel argues convincingly that Dr. Seuss is one of the most influential poets in America. His nonsense verse, like that of Lewis Carroll and Edward Lear, has changed language itself, giving us new words like ""nerd."" And Seuss's famously loopy artistic style - what Nel terms an ""energetic cartoon surrealism"" - has been equally important, inspiring artists like filmmaker Tim Burton and illustrator Lane Smith. --from back cover",['Philip Nel'],A&C Black,2005-01-01,['Biography & Autobiography']
