In [27]:

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('postgresql://user:password@localhost:5432/booksdb')


query = """
SELECT *
FROM l1_books
"""

# test query
df = pd.read_sql(query, engine)
df[250:300]

Unnamed: 0,title,description,authors,publisher,publisheddate,categories
250,In a Mist,,['Sandra Field'],,1989,['Large type books']
251,Cold Truth,"Say the words... In a dark basement, a cold-bl...","['Donna Berdel', 'Mary Stone']",,2020-01-14,
252,"Let There Be Laughter: Living, Lifting, and La...",Learn how laughter and life in Christ can heal...,"['Richard Bimler', 'Robert D. Bimler']",,1999,['Religion']
253,"Modeling the Figure in Clay, 30th Anniversary ...","For thirty years, ""Modeling the Figure in Clay...",['Bruno Lucchesi'],Watson-Guptill,1996,['Art']
254,Focus on grammar (Longman grammar series),"Clear, communicative, and teachable, ""Focus on...","['Irene E. Schoenberg', 'Samuela Eckstut']",LONGMAN,2000,['Language Arts & Disciplines']
255,Maynard's House,"""Told with icy precision of eye and ear and a ...",['Herman Raucher'],Diversion Books,2015-05-03,['Fiction']
256,WHISPERS OF GOD: A JOURNEY INTO THE MIND OF A ...,"Definitely Not a bedtime story! ""Whispers of G...",['Mary B. Turner'],Expert Management Systems,2000-02-01,['Fiction']
257,The How and Why of Better Gardening,All aspects of home landscaping and gardening ...,['Better Homes and Gardens'],Better Homes & Gardens Books,1990-01-01,['Gardening']
258,Palmistry,These new cards easily demystify the science o...,['Vernon Mahabal'],Mandala Publishing Group,2002-04-30,"['Body, Mind & Spirit']"
259,Implementing and Managing Oracle Databases,This guide covers issues that database adminis...,"['Steve Lemme', 'John R. Colby']",Course Technology,2000,['Computers']


In [74]:
import re

df = pd.read_csv("L1-books.csv", delimiter=",")

row = df.iloc[30]
row_str = ",".join(map(str, row.tolist()))


# match group con  [ ]  todos! 
pattern = r'\[(.*?)\]'

matches = re.findall(pattern, row_str)
author = matches[0]
categories = matches[1]

print("Author:", author)
print("Categories:", categories)

# ahora el pattern es  ],string,fechas con nnnn-nn-nn o nnnn o cosas asi,[

pattern = r"\],(.*?),(.*?),\["
matches = re.findall(pattern, row_str)
if matches:
    publisher, date = [m.strip() for m in matches[0]]
    print("Publisher:", publisher)
    print("Date:", date)
else:
    print("No matches found")

pattern = r'^\s*"?([^",]+?)"?\s*(?:,\s*"([^"]*?)")?,\s*\['

matches = re.findall(pattern, row_str)
if matches:
    title, description = matches[0]
    print("Title:", title.strip())
    print("Description:", description.strip())
else:
    # Fallback: puede que haya comillas en título o no haya descripción
    pattern_fallback = r'^\s*"?([^",]+?)"?\s*(?:,\s*"?(.*?)"?)?,\s*\['
    matches = re.findall(pattern_fallback, row_str)
    if matches:
        title, description = matches[0]
        print("Title:", title.strip())
        print("Description:", description.strip())
    else:
        print("No match found")

Author: 'National Portrait Gallery'
Categories: 'Biography & Autobiography'
Publisher: Smithsonian Institution
Date: 2018-02-13
Title: Portraits of the Presidents: The National Portrait Gallery
Description: A striking collection of presidential portraits from the National Portrait Gallery, this volume encapsulates the spirit of the most powerful office in the world. America's Presidents showcases the nation's largest collection of portraits of all the presidents beyond the White House's own, capturing the permanent exhibition that lies at the heart of the Portrait Gallery's mission to tell the American story through the individuals who have shaped it. The book explores presidential imagery through portraits ranging from the traditional, such as the iconic and newly restored "Lansdowne" portrait of George Washington by Gilbert Stuart, to the contemporary, such as Elaine de Kooning's colorful depiction of John F. Kennedy. Many of the featured portraits reveal much about the sitter, such 

In [None]:
import re
import pandas as pd

records = []

with open("L1-books.csv", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith("Title"):
            continue  # skip header or empty lines

        # --- 1️⃣ Simple case: 6 columns, split directly ---
        comma_count = line.count(",")
        if comma_count == 6:
            parts = [p.strip().strip('"') for p in line.split(",", 5)]
            title, description, author, publisher, date, categories = parts
            records.append({
                "Title": title,
                "Description": description,
                "Author": author,
                "Publisher": publisher,
                "Date": date,
                "Categories": categories
            })
            continue

        # --- 2️⃣ Complex case: irregular commas / extra quotes ---
        # Extract author & categories from [ ... ]
        brackets = re.findall(r"\[(.*?)\]", line)
        author = brackets[0] if len(brackets) > 0 else ""
        categories = brackets[1] if len(brackets) > 1 else ""

        # Extract text before first [
        pre_bracket = line.split("[", 1)[0]

        # Get title and description from start of line
        quotes = re.findall(r'"(.*?)"', pre_bracket)
        if len(quotes) >= 2:
            title, description = quotes[0], quotes[1]
        elif len(quotes) == 1:
            title = pre_bracket.split(",")[0].replace('"', '').strip()
            description = quotes[0]
        else:
            parts = pre_bracket.split(",", 1)
            title = parts[0].strip()
            description = parts[1].strip() if len(parts) > 1 else ""

        # --- Between author and categories, find publisher/date ---
        between_match = re.search(r"\],(.*?),\[", line)
        publisher, date = "", ""
        if between_match:
            between = between_match.group(1).strip()
            # Split by comma only if exactly one comma
            if between.count(",") == 1:
                publisher, date = [p.strip() for p in between.split(",", 1)]
            elif between.count(",") == 0:
                publisher, date = between.strip(), ""
            else:
                # handle weird cases like ],,1834,[
                m = re.search(r"\],\s*,\s*([\d\-]+)\s*,\s*\[", line)
                if m:
                    publisher, date = "", m.group(1).strip()

        # --- Date validation (numbers + dashes only) ---
        if not re.match(r"^\d{4}(?:-\d{2})?(?:-\d{2})?$", date or ""):
            date = ""

        records.append({
            "Title": title,
            "Description": description,
            "Author": author,
            "Publisher": publisher,
            "Date": date,
            "Categories": categories
        })

df_clean = pd.DataFrame(records)
print(df_clean.head(20))


ValueError: not enough values to unpack (expected 6, got 5)

In [91]:
df_clean[100:150]

Unnamed: 0,Title,Description,Author,Publisher,Date,Categories
100,Bye-Bye,Mom and Dad,Little Critter thinks he is in charge when Mom...,['Mercer Mayer'],,"2004,['Families']"
101,The Scarletti Curse (Candleglow),",",'Christine Feehan',,,
102,Universal Secrets of Telecosmic Power,",",'Norvell',Reward Books,1974-01-01,'Success'
103,The Visitor's Guide to Normandy Landing Beache...,,['Tonie Holt','Valmai Holt'],Hunter Pub Incorporated,"1994-01-01,['Travel']"
104,The Bar Code Implementation Guide: Using Bar C...,,['Stephen L. Pearce','Richard D. Bushnell'],Tower Hill Press,"2000,['Business & Economics']"
105,Principles of Biblical Interpretation (Sacred ...,",",'Louis Berkhof',,1962,'Bible'
106,Cruel and Unusual (G K Hall Large Print Book S...,Wanneer er in dit achtste deel in de Kay Scarp...,'Patricia Cornwell',Luitingh Sijthoff,2017-06-01,'Fiction'
107,Gaston de Blondeville (The modern novelists),",",'Ann Radcliffe',,1834,'Great Britain'
108,Screenwriter: Words Become Pictures/304506,",""","'Rob D?ckers', 'Karin van Lieverloo', 'Museum ...",,,"'Illumination of books and manuscripts, Dutch'"
109,"The Art of Technical Documentation, Second Edi...",An essential text in the field of contemporary...,"'Kristine Stiles', 'Peter Selz'",,,'Art'


In [83]:


df_clean["pattern"] = (
    df_clean["Date"]
    .astype(str)
    .str.replace(r"[A-Za-z]", "a", regex=True)
    .str.replace(r"\d", "n", regex=True)
)

# Mostrar los patrones únicos
df_clean["pattern"].unique()

array(['nnnn', 'nnnn-nn-nn', 'nnnn-nn', '', 'aaa.,nnnn', 'aaa,nnnn-nn-nn',
       "['aaaaa aaaaaaaaaa'],aaaaa aaa aaaaaaaa,nnnn-nn-nn",
       'aaaaaaa,nnnn', 'aaa', 'aaaaaaaaaa,nnnn-nn-nn', 'aaa.,nnnn-nn-nn',
       'nnnn*', 'aaaaaa aaa aaaaaa,nnnn-nn-nn',
       'aa. : aaaaaaaaaa aa aaaaa aaaaa,nnnn', 'aaaaaaaaaa aa,nnnn-nn-nn',
       'aaa,nnnn', 'aaaaaa aaa aaaaa,nnnn-nn-nn',
       'aaaaaaa aa aaaaaaaa,nnnn', 'aaaaaaaaaaaa,nnnn',
       'a.a. : aaaaaaa aaaaa,nnnn',
       "'aaaaaa a. aaaaaaa', 'aaaaa a. aaaaaa'],aaa,nnnn-nn-nn",
       'aaaaa,nnnn-nn-nn', 'aaa,nnnn-nn',
       'aaa. : aaaaa & aaaaaaaa aa aaaaaa,nnnn', 'aa),nnnn-nn-nn',
       'aaa.",nnnn', 'nn??', 'aaaa. : aaaaaa aaaaaaa aaaaa,nnnn',
       'aaaaaaaaa,nnnn', 'nnnn [annnn],nnnn',
       'aaa aaa aaaaa aaaaaaa aaaaaaa aa aaaa aaaaaaaaaa aaaaa. aaaaa aaa aa aaaa aa aaaaaaaaaa aaaaaaaaaa aaa aaaa? aa aaa aaaa aa n aaaaa, aaa aaa aaaa aaaaa aaaaa aaaa aaaa a aaaa-aaaaa aaaaa aaa aaaaaa aa a aaaaaaaaaa aaa aaaaaaaaa aaa

In [95]:
data = pd.read_csv("L1-books.csv", header=None, engine="python")


In [99]:
import csv
from collections import Counter

path = "L1-books.csv"

col_counts = Counter()

with open(path, encoding="utf-8") as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader, start=1):
        col_counts[len(row)] += 1

# Mostrar resumen
print("📊 Distribución de número de columnas por fila:")
for n_cols, count in sorted(col_counts.items()):
    print(f"  {n_cols} columnas → {count} filas")

# Mostrar totales clave
exact = col_counts.get(6, 0)
less = sum(v for k, v in col_counts.items() if k < 6)
more = sum(v for k, v in col_counts.items() if k > 6)

print("\n✅ Resumen:")
print(f"  ✔️ Filas correctas (6 columnas): {exact}")
print(f"  ⚠️ Filas con menos de 6 columnas: {less}")
print(f"  ❌ Filas con más de 6 columnas: {more}")


📊 Distribución de número de columnas por fila:
  6 columnas → 212405 filas

✅ Resumen:
  ✔️ Filas correctas (6 columnas): 212405
  ⚠️ Filas con menos de 6 columnas: 0
  ❌ Filas con más de 6 columnas: 0


In [105]:
# Parte 1: cargar el CSV “tal cual” en 6 columnas (aunque vengan mal alineadas)
import csv, pandas as pd, re

path = "L1-books.csv"

rows = []
with open(path, encoding="utf-8") as f:
    reader = csv.reader(f)
    for row in reader:
        # fuerza exactamente 6 columnas pegando cualquier sobrante al final
        if len(row) < 6:
            row = row + [""]*(6-len(row))
        elif len(row) > 6:
            row = row[:5] + [",".join(row[5:])]
        rows.append(row)

df = pd.DataFrame(rows, columns=["title_raw","description_raw","authors_raw","publisher_raw","publishedDate_raw","categories_raw"])


# Parte 2: funciones auxiliares para detectar y normalizar
def is_list(s: str) -> bool:
    return isinstance(s, str) and bool(re.match(r"^\s*\[.*\]\s*$", s or ""))

def clean_list(s: str) -> str:
    if not isinstance(s, str): return ""
    return re.sub(r",\s*", "; ", s.strip())

def first_year(cells) -> str:
    for c in cells:
        if not isinstance(c, str): 
            continue
        m = re.search(r"(18|19|20)\d{2}", c)
        if m:
            return m.group(0)
    return ""

def is_maybe_publisher(s: str) -> bool:
    if not isinstance(s, str): return False
    t = s.strip()
    if not t: return False
    if is_list(t): return False
    if re.search(r"(18|19|20)\d{2}", t): return False   # evitar que sea fecha
    if len(t) > 120: return False                      # muy largo para publisher
    if any(ch.isdigit() for ch in t): return False     # publishers raramente llevan dígitos
    return True

def is_texty(s: str) -> bool:
    return isinstance(s, str) and len(s.strip()) >= 30  # heurística para descripción

# Parte 3: reconstrucción heurística de columnas canónicas
def normalize_row(r):
    cells = [
        r["title_raw"], r["description_raw"], r["authors_raw"],
        r["publisher_raw"], r["publishedDate_raw"], r["categories_raw"]
    ]

    # autores / categorías (listas entre corchetes)
    lists = [c for c in cells if is_list(c)]
    authors = ""
    categories = ""
    if len(lists) >= 2:
        authors, categories = lists[0], lists[-1]
    elif len(lists) == 1:
        idx = cells.index(lists[0])
        categories = lists[0] if idx >= 4 else lists[0]  # si solo hay una lista, puede ser cualquiera; se reasigna luego si hay otra mejor
        # si además detectamos explícitamente otra lista en col_6, preferirla como categorías
        if is_list(r["categories_raw"]):
            categories = r["categories_raw"]
            if lists[0] != categories:
                authors = lists[0]

    # fecha (primer año de 4 dígitos que aparezca)
    year = first_year(cells)

    # publisher: primer candidato razonable entre cols 2-5 (evitar listas y años)
    pub_candidates = [r["publisher_raw"], r["title_raw"], r["description_raw"], r["publishedDate_raw"]]
    pub_candidates = [c for c in pub_candidates if is_maybe_publisher(c)]
    publisher = pub_candidates[0] if pub_candidates else (r["publisher_raw"] if is_maybe_publisher(r["publisher_raw"]) else "")

    # título: prioridad a title_raw; si parece claramente un fragmento de descripción y la desc real quedó en otra col, usamos la más corta sin listas/fecha
    title = r["title_raw"]
    if not isinstance(title, str) or len(title.strip()) == 0 or is_texty(title):
        title_candidates = [c for c in [r["title_raw"], r["description_raw"]] if isinstance(c,str)]
        title_candidates = [c for c in title_candidates if not is_list(c) and not re.search(r"(18|19|20)\d{2}", c)]
        if title_candidates:
            title = min(title_candidates, key=lambda s: len(s.strip()))  # el título suele ser más corto

    # descripción: elegimos el texto más largo no listado y no publisher
    used = set([title, publisher, authors, categories])
    desc_candidates = [c for c in cells if isinstance(c,str) and c not in used and not is_list(c)]
    desc_candidates = [c for c in desc_candidates if is_texty(c)]
    description = max(desc_candidates, key=lambda s: len(s)) if desc_candidates else (r["description_raw"] if isinstance(r["description_raw"],str) else "")

    # limpiar listas
    authors_clean = clean_list(authors) if authors else ""
    categories_clean = clean_list(categories) if categories else ""

    return pd.Series({
        "title": (title or "").strip(),
        "description": (description or "").strip(),
        "authors": authors_clean,
        "publisher": (publisher or "").strip(),
        "publishedDate": year,   # solo año; si querés YYYY-MM-DD, extraé más patrones
        "categories": categories_clean
    })

df_clean = df.apply(normalize_row, axis=1)

# (Opcional) si querés forzar publishedDate a Int64 con NaN cuando falte año:
# df_clean["publishedDate"] = pd.to_numeric(df_clean["publishedDate"], errors="coerce").astype("Int64")

# df_clean ahora tiene siempre: title, description, authors, publisher, publishedDate, categories


In [None]:
df_clean[250:300]

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6
250,"Run Wild, Run Free",Description: Movie Press Kits.,,,2013,
251,The Stone that the Builder Refused: A Novel,The Stone that the Builder Refused is the fina...,['Madison Smartt Bell'],Vintage,2007-12-18,['Fiction']
252,A Field Guide to Butterflies of Texas (Texas M...,Meet the wild world of common Texas insects wi...,"['John A. Jackman', 'Bastiaan M. Drees']",Taylor Trade Publishing,1998-03-01,['Nature']
253,"Feats on the fiord, (Tales for children from m...",,['Harriet Martineau'],,1914,['Norway']
254,Southern Knights: The Graphic Novel,,"['Henry Vogel', 'Jackson Guice']",,1986,['Heroes']
255,Lost Indianapolis (IN) (Images of America),"A history of the city, told through the storie...","['John McDonald', 'John P. McDonald']",Arcadia Publishing,2002,['History']
256,Eyewitness Travel Phrase Book: European (14 la...,"Explore this vast and beguiling country, from ...",['DK Travel'],Dorling Kindersley Ltd,2017-09-07,['Travel']
257,Conversations With Lincoln,A Lincoln book that says something new is a ra...,['Abraham Lincoln'],Transaction Publishers,2002,['History']
258,Thomas Paine : Collected Writings : Common Sen...,Thomas Paine was the impassioned democratic vo...,['Thomas Paine'],Library of America,1995-03-01,['History']
259,Abstract sets and finite ordinals,: An introduction to the study of set theory (...,,['Geoffrey Bourton Keene'],,1961


In [104]:
df["pattern"] = (
    df["col_5"]
    .astype(str)
    .str.replace(r"[A-Za-z]", "a", regex=True)
    .str.replace(r"\d", "n", regex=True)
)

# Mostrar los patrones únicos
df["pattern"].unique()

array(['aaaaaaaaaaaaa', 'nnnn', 'nnnn-nn-nn', 'nnnn-nn', '',
       ' aaaaaa : aaaaaaa aaaaaaaaa', 'a.a. aaaaaa', 'aaaa aaaa aaaaa',
       'aaaaaaaaaaa aaa', ' aaaaa aaa aaaa', 'nnnn*', ' aaaaaaaaa ',
       " aa aaaaa aaaaaa a aaaaaaaa aaaaaa. aaaaaaa aaaaaaa'a aaaaa aaaaa aa a aaaaaa - aaaaaa aaa aaaaaaaa aaaaaaa aaa aaa aaaaaa aa aaaaa aaaa - aa aaaaaaaaaa aaa aaaaaaaaaa aaaaaaaaa aa aaaaaaaa aaaaaaaaa aaaa a aaaaaaa aaaaaa.",
       'aaaaaa', 'aaaaaaaaaa aaaaaaaaaa', 'aaa aaaaaaaa aaaaa',
       'aaaaaaaaa aaaaaaaaaa aaaaaaa', 'aaaaaaaaa aaaaa',
       "['aaaaaaa aaaaa aaaaaaa']", ' aaa aaaa : aaaaaa aaaaaaaaaa aaaaa',
       'nn??', 'aaaaaaaaaa aaaaaaaaaa aaaaa',
       ' aaa aaaa aaaaaa aaaaaaa n.n aaa aa aaaa aa aaaaa aaaa aa aaa aaaaaaaaa aaa aaaaaaaa aaaaa aa aaa aaaa. aaaaaaa aaaaaaaa aa aa aaa aaaaa aaaaaaaaaa aa aaaaaaa aaaaaaaa--aaaaaaaaaaa aaa a aaaaaaa-aa-aaaaaa aaaaaaaaa aaaaaa (aaaaaaa n)',
       'aaaaaaa', 'aaaaaa aaaaaaaaaa', 'aaaaaaa aaaaa',
       "['aaaa a. aaaa

In [125]:

import re
import pandas as pd

# Leer archivo como texto bruto
with open("L1-books.csv", encoding="utf-8") as f:
    raw_lines = f.readlines()

# Regex robusto para parsear los 6 campos
pattern = re.compile(r"""
^
(.*?)                          # 1️⃣ Title
\s*,\s*
(.*?)                          # 2️⃣ Description
\s*,\s*
(\[.*?\]|NaN|)                 # 3️⃣ Authors
\s*,\s*
(.*?)                          # 4️⃣ Publisher
\s*,\s*
(.*?)   
\s*,\s*
(\[.*?\]|NaN|)                 # 6️⃣ Categorías
\s*$
""", re.VERBOSE)

# (\d{4}(?:-\d{2})?(?:-\d{2})?|NaN|)  # 5️⃣ Fecha
# Parsear las primeras 20 filas (saltando el header)
rows = []
for i, line in enumerate(raw_lines[1:len(raw_lines)], start=1):
    line = line.strip()
    match = pattern.match(line)
    if match:
        rows.append(match.groups())
    else:
        rows.append((None, None, None, None, None, None))

# Crear dataframe limpio
df_fixed = pd.DataFrame(rows, columns=[
    "Title", "Description", "Authors", "Publisher", "PublishedDate", "Categories"
])

df_fixed[0:20]


Unnamed: 0,Title,Description,Authors,Publisher,PublishedDate,Categories
0,Its Only Art If Its Well Hung!,,['Julie Strain'],,1996,['Comics & Graphic Novels']
1,Dr. Seuss: American Icon,"""Philip Nel takes a fascinating look into the ...",['Philip Nel'],A&C Black,2005-01-01,['Biography & Autobiography']
2,Wonderful Worship in Smaller Churches,"""This resource includes twelve principles in u...",['David R. Ray'],,2000,['Religion']
3,Whispers of the Wicked Saints,"""Julia Thomas finds her life spinning out of c...",['Veronica Haddon'],iUniverse,2005-02,['Fiction']
4,"""Nation Dance: Religion",Identity and Cultural Difference in the Caribb...,,['Edward Long'],",2003-03-01",
5,The Church of Christ: A Biblical Ecclesiology ...,"""In The Church of Christ: A Biblical Ecclesiol...",['Everett Ferguson'],Wm. B. Eerdmans Publishing,1996,['Religion']
6,The Overbury affair (Avon),,['Miriam Allen De Ford'],,1960,
7,A Walk in the Woods: a Play in Two Acts,,['Lee Blessing'],,1988,
8,Saint Hyacinth of Poland,"""The story for children 10 and up of St. Hyaci...",['Mary Fabyan Windeatt'],Tan Books & Pub,2009-01-01,['Biography & Autobiography']
9,Rising Sons and Daughters: Life Among Japan's ...,"""Wardell recalls his experience as a foreign s...",['Steven Wardell'],Plympton PressIntl,1995,['Social Science']


In [None]:
import re
import pandas as pd

# Leer archivo como texto bruto
with open("L1-books.csv", encoding="utf-8") as f:
    raw_lines = f.readlines()

# Regex robusto con soporte para comillas y NaN
pattern = re.compile(r"""
^
\s*
(?:
    "((?:[^"]|"")*)"         # 1️⃣ Title entre comillas dobles con comillas escapadas
  | '((?:[^']|'')*)'         # 1️⃣ Title entre comillas simples con comillas escapadas
  | ([^,]+?)                 # 1️⃣ Title sin comillas (hasta la coma principal)
)
\s*,\s*
(?:
    "((?:[^"]|"")*)"         # 2️⃣ Description entre comillas dobles con comillas escapadas
  | '((?:[^']|'')*)'         # 2️⃣ Description entre comillas simples con comillas escapadas
  | ([^,\[]*?)               # 2️⃣ Description sin comillas ni corchetes
)?
\s*,\s*
(\[.*?\]|NaN|)               # 3️⃣ Authors
\s*,\s*
(.*?)                        # 4️⃣ Publisher
\s*,\s*
(\d{4}(?:-\d{2})?(?:-\d{2})?|NaN|)  # 5️⃣ Fecha
\s*,\s*
(\[.*?\]|NaN|)?              # 6️⃣ Categorías
\s*$
""", re.VERBOSE)

rows = []
for i, line in enumerate(raw_lines[1:], start=1):
    line = line.strip()
    match = pattern.match(line)
    if match:
        # Combinar grupos alternativos para Title y Description
        title = match.group(1) or match.group(2) or match.group(3)
        desc  = match.group(4) or match.group(5) or match.group(6)
        authors = match.group(7)
        publisher = match.group(8)
        date = match.group(9)
        cats = match.group(10)
        rows.append((title, desc, authors, publisher, date, cats))
    else:
        rows.append((None, None, None, None, None, None))

# Crear dataframe limpio
df_fixed = pd.DataFrame(rows, columns=[
    "Title", "Description", "Authors", "Publisher", "PublishedDate", "Categories"
])

df_fixed.head(20)


Unnamed: 0,Title,Description,Authors,Publisher,PublishedDate,Categories
0,Its Only Art If Its Well Hung!,,['Julie Strain'],,1996,['Comics & Graphic Novels']
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],A&C Black,2005-01-01,['Biography & Autobiography']
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],,2000,['Religion']
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],iUniverse,2005-02,['Fiction']
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,2003-03-01,
5,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,['Everett Ferguson'],Wm. B. Eerdmans Publishing,1996,['Religion']
6,The Overbury affair (Avon),,['Miriam Allen De Ford'],,1960,
7,A Walk in the Woods: a Play in Two Acts,,['Lee Blessing'],,1988,
8,Saint Hyacinth of Poland,The story for children 10 and up of St. Hyacin...,['Mary Fabyan Windeatt'],Tan Books & Pub,2009-01-01,['Biography & Autobiography']
9,Rising Sons and Daughters: Life Among Japan's ...,Wardell recalls his experience as a foreign st...,['Steven Wardell'],Plympton PressIntl,1995,['Social Science']


In [129]:
df_fixed["pattern"] = (
    df_fixed["PublishedDate"]
    .astype(str)
    .str.replace(r"[A-Za-z]", "a", regex=True)
    .str.replace(r"\d", "n", regex=True)
)

# Mostrar los patrones únicos
df_fixed["pattern"].unique()

array(['nnnn', 'aaaa', 'nnnn-nn', ...,
       'aaaaa : aaa aaaaa aaaaaaaaaa",nnnn',
       '\'aaaaa\']",aaaaa a aaaaaa aaaaaaaaaaaa,nnnn',
       'aaa & aaaaaaa",nnnn'], shape=(1096,), dtype=object)

In [157]:
df_fixed.to_csv("L1-books-fixed.csv", index=False)

In [142]:
def reverse_func(text:str):
    return text[::-1].replace("[", "<temp>").replace("]", "[").replace("<temp>", "]")

In [None]:
with open("L1-books.csv", encoding="utf-8") as f:
    raw_lines = f.readlines()

reversed_lines = []
for line in raw_lines:
    reversed_lines.append(reverse_func(line))

with open("L1-books-reversed.csv", "w", encoding="utf-8") as f:
    f.writelines(reversed_lines)

In [144]:
df = pd.read_csv("L1-books-reversed.csv", index=False)

TypeError: read_csv() got an unexpected keyword argument 'index'

In [145]:
df = pd.read_csv("L1-books-reversed.csv")


In [149]:
df[100:150]

Unnamed: 0,seirogetac,etaDdehsilbup,rehsilbup,srohtua,noitpircsed,eltiT,pattern
100,['seilimaF'],4002,,['reyaM recreM'],.mih htiw yad eht dneps apdnarG dna amdnarG hg...,"daD dna moM ,eyB-eyB",nnnn
101,,,,['naheeF enitsirhC'],,)wolgeldnaC( esruC ittelracS ehT,aaa
102,['sseccuS'],10-10-4791,skooB draweR,['llevroN'],,rewoP cimsoceleT fo sterceS lasrevinU,nn-nn-nnnn
103,['levarT'],10-10-4991,detaroprocnI buP retnuH,"['tloH iamlaV' ,'tloH einoT']",,)rellevarT lanoigeR( smuesuM dna slairomeM :se...,nn-nn-nnnn
104,['scimonocE & ssenisuB'],0002,sserP lliH rewoT,"['llenhsuB .D drahciR' ,'ecraeP .L nehpetS']",,noitubirtsiD ni sedoC raB gnisU :ediuG noitatn...,nnnn
105,['elbiB'],2691,,['fohkreB siuoL'],,)scituenemreH dercaS( noitaterpretnI lacilbiB ...,nnnn
106,['noitciF'],10-60-7102,ffohtjiS hgnitiuL,['llewnroC aicirtaP'],.reggaD dloG ed ne drawA ragdE ed tem dnoorkeb...,))htolC( seireS kooB tnirP egraL llaH K G( lau...,nn-nn-nnnn
107,['niatirB taerG'],4381,,['effilcdaR nnA'],,)stsilevon nredom ehT( ellivednolB ed notsaG,nnnn
108,"['hctuD ,stpircsunam dna skoob fo noitanimullI']",5002,,"['negemjiN fohklaV teH muesuM' ,'oolreveiL nav...",,605403/serutciP emoceB sdroW :retirwneercS,nnnn
109,['trA'],52-90-2102,sserP ainrofilaC fo vinU,"['zleS reteP' ,'selitS enitsirK']",".ailartsuA dna ,aciremA nitaL ,acirfA ,aisA ,t...","noitidE dnoceS ,noitatnemucoD lacinhceT fo trA...",nn-nn-nnnn


In [147]:
df["pattern"] = (
    df["etaDdehsilbup"]
    .astype(str)
    .str.replace(r"[A-Za-z]", "a", regex=True)
    .str.replace(r"\d", "n", regex=True)
)

# Mostrar los patrones únicos
df["pattern"].unique()

array(['nnnn', 'nn-nn-nnnn', 'nn-nnnn', 'aaa',
       'aaaaaaaaa aaaaaaa : aaaaaa ', 'aaaaaa .a.a', 'aaaaa aaaa aaaa',
       'aaa aaaaaaaaaaa', 'aaaa aaa aaaaa ', '*nnnn', ' aaaaaaaaa ',
       ".aaaaaa aaaaaaa a aaaa aaaaaaaaa aaaaaaaa aa aaaaaaaaa aaaaaaaaaa aaa aaaaaaaaaa aa - aaaa aaaaa aa aaaaaa aaa aaa aaaaaaa aaaaaaaa aaa aaaaaa - aaaaaa a aa aaaaa aaaaa a'aaaaaaa aaaaaaa .aaaaaa aaaaaaaa a aaaaaa aaaaa aa ",
       'aaaaaa', 'aaaaaaaaaa aaaaaaaaaa', 'aaaaa aaaaaaaa aaa',
       'aaaaaaa aaaaaaaaaa aaaaaaaaa', 'aaaaa aaaaaaaaa',
       "['aaaaaaa aaaaa aaaaaaa']", 'aaaaa aaaaaaaaaa aaaaaa : aaaa aaa ',
       '??nn', 'aaaaa aaaaaaaaaa aaaaaaaaaa',
       ')n aaaaaaa( aaaaaa aaaaaaaaa aaaaaa-aa-aaaaaaa a aaa aaaaaaaaaaa--aaaaaaaa aaaaaaa aa aaaaaaaaaa aaaaa aaa aa aa aaaaaaaa aaaaaaa .aaaa aaa aa aaaaa aaaaaaaa aaa aaaaaaaaa aaa aa aaaa aaaaa aa aaaa aa aaa n.n aaaaaaa aaaaaa aaaa aaa ',
       'aaaaaaa', 'aaaaaaaaaa aaaaaa', 'aaaaa aaaaaaa',
       "['aaaaaaaaa .a aaaa']", 'aaa

In [158]:
import csv
import pandas as pd

# Leer CSV correctamente manejando comillas dobles escapadas y comas internas
rows = []
with open("L1-books.csv", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=',', quotechar='"', doublequote=True, skipinitialspace=False)
    headers = next(reader, None)  # si tiene encabezado
    for row in reader:
        # Asegurar longitud uniforme
        row = (row + [None]*6)[:6]
        rows.append(row)

# Crear DataFrame
df = pd.DataFrame(rows, columns=[
    "Title", "Description", "Authors", "Publisher", "PublishedDate", "Categories"
])

df.head(20)

Unnamed: 0,Title,Description,Authors,Publisher,PublishedDate,Categories
0,Its Only Art If Its Well Hung!,,['Julie Strain'],,1996,['Comics & Graphic Novels']
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],A&C Black,2005-01-01,['Biography & Autobiography']
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],,2000,['Religion']
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],iUniverse,2005-02,['Fiction']
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,2003-03-01,
5,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,['Everett Ferguson'],Wm. B. Eerdmans Publishing,1996,['Religion']
6,The Overbury affair (Avon),,['Miriam Allen De Ford'],,1960,
7,A Walk in the Woods: a Play in Two Acts,,['Lee Blessing'],,1988,
8,Saint Hyacinth of Poland,The story for children 10 and up of St. Hyacin...,['Mary Fabyan Windeatt'],Tan Books & Pub,2009-01-01,['Biography & Autobiography']
9,Rising Sons and Daughters: Life Among Japan's ...,Wardell recalls his experience as a foreign st...,['Steven Wardell'],Plympton PressIntl,1995,['Social Science']


In [163]:
df["pattern"] = (
    df["Publisher"]
    .astype(str)
    .str.replace(r"[A-Za-z]", "a", regex=True)
    .str.replace(r"\d", "n", regex=True)
)

# Mostrar los patrones únicos
df["pattern"].value_counts()

pattern
                                       74903
aaaaaaaaa                               6574
aaaaaaa                                 5594
aaaaa aaa aaaaaaaa                      3439
aaaaaa aaaaa                            3158
                                       ...  
aaaaaaaa'a aaaaaaa aaaaaaaa                1
['aaaaa a. a. aaaa']                       1
aaaaa, aaaaa : aaa aaaaa aaaaaaaaaa        1
 aaaaa aaaaaa']                            1
['aa. aaaaaa']                             1
Name: count, Length: 6899, dtype: int64

# Cosas 

In [20]:
import re
import pandas as pd
import csv
from io import StringIO

pattern = re.compile(r"""
^
\s*
(?:
    "((?:[^"]|"")*)"         # 1️⃣ Title entre comillas dobles con comillas escapadas
  | '((?:[^']|'')*)'         # 1️⃣ Title entre comillas simples con comillas escapadas
  | ([^,]+?)                 # 1️⃣ Title sin comillas (hasta la coma principal)
)
\s*,\s*
(?:
    "((?:[^"]|"")*)"         # 2️⃣ Description entre comillas dobles con comillas escapadas
  | '((?:[^']|'')*)'         # 2️⃣ Description entre comillas simples con comillas escapadas
  | ([^,\[]*?)               # 2️⃣ Description sin comillas ni corchetes
)?
\s*,\s*
(\[.*?\]|NaN|)               # 3️⃣ Authors
\s*,\s*
(.*?)                        # 4️⃣ Publisher
\s*,\s*
(\d{4}(?:-\d{2})?(?:-\d{2})?|NaN|)  # 5️⃣ Fecha
\s*,\s*
(\[.*?\]|NaN|)?              # 6️⃣ Categorías
\s*$
""", re.VERBOSE)

# ⬇️ Agrega estas helpers arriba (después de imports)
date_re = re.compile(r'^\s*(?:\d{4}(?:-\d{2}){0,2}|NaN)?\s*$')

def _looks_like_date(s):
    return isinstance(s, str) and date_re.match(s) is not None

def _looks_like_listish(s):
    if not isinstance(s, str): return False
    t = s.strip()
    return (t.startswith('[') and t.endswith(']')) or t in ('NaN', '')

def _realign_row_by_schema(cells):
    """
    Repara filas con columnas corridas intentando ubicar:
    [Title, Description, Authors(listish), Publisher, PublishedDate(date), Categories(listish)]
    """
    if not cells: return None
    # 1) Title fijo en 0
    title = cells[0] if cells else None

    # 2) Authors: primer listish desde pos 1
    a_idx = None
    for i in range(1, len(cells)):
        if _looks_like_listish(cells[i]):
            a_idx = i
            break
    if a_idx is None: 
        return None  # no podemos ubicar Authors

    # 3) Date: primer date después de authors
    d_idx = None
    for i in range(a_idx + 1, len(cells)):
        if _looks_like_date(cells[i]):
            d_idx = i
            break
    if d_idx is None:
        return None  # no podemos ubicar fecha

    # 4) Categories: último listish luego de date (opcional)
    c_idx = None
    for i in range(len(cells) - 1, d_idx, -1):
        if _looks_like_listish(cells[i]):
            c_idx = i
            break

    # 5) Reconstrucción por uniones (para absorber comas extra)
    desc = ','.join(cells[1:a_idx]).strip() or None
    authors = cells[a_idx]
    publisher = ','.join(cells[a_idx + 1:d_idx]).strip() or None
    date = cells[d_idx]
    cats = ','.join(cells[d_idx + 1: c_idx + 1]).strip() if c_idx is not None else (cells[d_idx + 1].strip() if d_idx + 1 < len(cells) else None)
    if cats == '': cats = None

    return (title, desc, authors, publisher, date, cats)


In [21]:
# ⬇️ Modifica tu parseo híbrido para validar/realinear incluso si hay 6 columnas
def parse_line_hybrid(line: str):
    # 1) Intento CSV
    try:
        reader = csv.reader(StringIO(line), delimiter=',', quotechar='"', doublequote=True)
        row = next(reader)
        # a) Caso feliz: 6 columnas y la col 4 parece fecha
        if len(row) == 6 and _looks_like_date(str(row[4])):
            return tuple(row[:6])
        # b) Si hay 6 pero la col 4 NO es fecha, intentamos realinear
        fixed = _realign_row_by_schema(row)
        if fixed is not None:
            return fixed
        # c) Si hay más o menos de 6, intentamos realinear con todas las celdas
        if len(row) != 6:
            fixed = _realign_row_by_schema(row)
            if fixed is not None:
                return fixed
    except Exception:
        pass

    # 2) Fallback REGEX original
    m = pattern.match(line.strip())
    if m:
        title = m.group(1) or m.group(2) or m.group(3)
        desc  = m.group(4) or m.group(5) or m.group(6)
        authors = m.group(7)
        publisher = m.group(8)
        date = m.group(9)
        cats = m.group(10)
        return (title, desc, authors, publisher, date, cats)

    # 3) Último recurso
    return (None, None, None, None, None, None)


In [None]:
rows = []
with open("L1-books.csv", encoding="utf-8") as f:
    lines = f.readlines()

for i, line in enumerate(lines[1:], start=1):
    if line.strip():
        if re.match(r'(.*?/\d{2})$', line):
            line = line + ","
        rows.append(parse_line_hybrid(line))

df = pd.DataFrame(rows, columns=["Title","Description","Authors","Publisher","PublishedDate","Categories"])

In [28]:
# if categories is a number and publishedDate is empty, swap them
def fix_swapped_columns(row):
    date = row["PublishedDate"]
    cats = row["Categories"]
    if (isinstance(cats, str) and re.match(r'^\s*\d{4}(?:-\d{2})?(?:-\d{2})?\s*$', cats)) and (not isinstance(date, str) or date.strip() == ""):
        # Swap
        row["PublishedDate"], row["Categories"] = cats, date
    return row

df = df.apply(fix_swapped_columns, axis=1)

In [30]:
df.to_csv("L1-books_magic.csv", index=False)

In [8]:
file_path = 'L2-books.csv'

try:
    df_l2 = pd.read_csv(file_path)
    print("CSV file read successfully!")
    display(df_l2.head()) # Display the first few rows of the DataFrame
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")

CSV file read successfully!


Unnamed: 0,ISBN,Book-Title,Book-Author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [9]:
merged_df = pd.merge(df, df_l2, left_on='Title', right_on='Book-Title', how='inner')

print("Merged DataFrame:")
display(merged_df.head())
merged_df.info()

Merged DataFrame:


Unnamed: 0,Title,Description,Authors,Publisher,PublishedDate,Categories,ISBN,Book-Title,Book-Author
0,Alaska Sourdough,"""Sourdough is a magical food"", as author Ruth ...",['Ruth Allman'],Alaska Northwest Books,1976,['Cooking'],882400851,Alaska Sourdough,Ruth Allman
1,Eyewitness Travel Guide to Europe,The DK Eyewitness Travel Guide: Eastern and Ce...,"['Dorling Kindersley Publishing Staff', 'Jonat...",,2015-06-15,['Europe'],789480662,Eyewitness Travel Guide to Europe,Dk Travel Writers
2,History of Magic and the Occult,"See the history of witchcraft, magic and super...",['DK'],Dorling Kindersley Ltd,2020-08-06,"['Body, Mind & Spirit']",517550083,History of Magic and the Occult,Kurt Seligmann
3,Seance,"Mysterieus, duister en passioneel Londen, 19de...",['Kevin Valgaeren'],Lannoo Meulenhoff - Belgium,2016-02-13,['Fiction'],330302736,Seance,Andre Launay
4,"Mary, Mary",,['United States. Coast Guard'],,1976,['Merchant marine'],446600547,"Mary, Mary",Evan Hunter


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29418 entries, 0 to 29417
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Title          29418 non-null  object
 1   Description    29390 non-null  object
 2   Authors        29418 non-null  object
 3   Publisher      29408 non-null  object
 4   PublishedDate  29418 non-null  object
 5   Categories     29280 non-null  object
 6   ISBN           29418 non-null  object
 7   Book-Title     29418 non-null  object
 8   Book-Author    29418 non-null  object
dtypes: object(9)
memory usage: 2.0+ MB


In [10]:
# Clean 'Title' in df_l1
df['cleaned_title'] = df['Title'].str.lower().str.strip()

# Clean 'Book-Title' in df_l2
df_l2['cleaned_book_title'] = df_l2['Book-Title'].str.lower().str.strip()

print("DataFrames with cleaned title columns:")
display(df[['Title', 'cleaned_title']].head())
display(df_l2[['Book-Title', 'cleaned_book_title']].head())

DataFrames with cleaned title columns:


Unnamed: 0,Title,cleaned_title
0,Its Only Art If Its Well Hung!,its only art if its well hung!
1,Dr. Seuss: American Icon,dr. seuss: american icon
2,Wonderful Worship in Smaller Churches,wonderful worship in smaller churches
3,Whispers of the Wicked Saints,whispers of the wicked saints
4,"Nation Dance: Religion, Identity and Cultural ...","nation dance: religion, identity and cultural ..."


Unnamed: 0,Book-Title,cleaned_book_title
0,Classical Mythology,classical mythology
1,Clara Callan,clara callan
2,Decision in Normandy,decision in normandy
3,Flu: The Story of the Great Influenza Pandemic...,flu: the story of the great influenza pandemic...
4,The Mummies of Urumchi,the mummies of urumchi


In [11]:
merged_cleaned_df = pd.merge(df, df_l2, left_on='cleaned_title', right_on='cleaned_book_title', how='inner')

print("Merged Cleaned DataFrame:")
display(merged_cleaned_df.head())
merged_cleaned_df.info()

Merged Cleaned DataFrame:


Unnamed: 0,Title,Description,Authors,Publisher,PublishedDate,Categories,cleaned_title,ISBN,Book-Title,Book-Author,cleaned_book_title
0,Alaska Sourdough,"""Sourdough is a magical food"", as author Ruth ...",['Ruth Allman'],Alaska Northwest Books,1976,['Cooking'],alaska sourdough,882400851,Alaska Sourdough,Ruth Allman,alaska sourdough
1,Eyewitness Travel Guide to Europe,The DK Eyewitness Travel Guide: Eastern and Ce...,"['Dorling Kindersley Publishing Staff', 'Jonat...",,2015-06-15,['Europe'],eyewitness travel guide to europe,789480662,Eyewitness Travel Guide to Europe,Dk Travel Writers,eyewitness travel guide to europe
2,History of Magic and the Occult,"See the history of witchcraft, magic and super...",['DK'],Dorling Kindersley Ltd,2020-08-06,"['Body, Mind & Spirit']",history of magic and the occult,517550083,History of Magic and the Occult,Kurt Seligmann,history of magic and the occult
3,Seance,"Mysterieus, duister en passioneel Londen, 19de...",['Kevin Valgaeren'],Lannoo Meulenhoff - Belgium,2016-02-13,['Fiction'],seance,330302736,Seance,Andre Launay,seance
4,"Mary, Mary",,['United States. Coast Guard'],,1976,['Merchant marine'],"mary, mary",446600547,"Mary, Mary",Evan Hunter,"mary, mary"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36669 entries, 0 to 36668
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Title               36669 non-null  object
 1   Description         36607 non-null  object
 2   Authors             36669 non-null  object
 3   Publisher           36657 non-null  object
 4   PublishedDate       36669 non-null  object
 5   Categories          36374 non-null  object
 6   cleaned_title       36669 non-null  object
 7   ISBN                36669 non-null  object
 8   Book-Title          36669 non-null  object
 9   Book-Author         36669 non-null  object
 10  cleaned_book_title  36669 non-null  object
dtypes: object(11)
memory usage: 3.1+ MB


In [13]:
import string

# Function to remove punctuation and special characters
def remove_punctuation(text):
    if isinstance(text, str):
        return text.translate(str.maketrans('', '', string.punctuation))
    return text

# Clean 'Title' in df_l1
df['cleaned_title'] = df['Title'].str.lower().str.strip().apply(remove_punctuation)

# Clean 'Book-Title' in df_l2
df_l2['cleaned_book_title'] = df_l2['Book-Title'].str.lower().str.strip().apply(remove_punctuation)

print("DataFrames with cleaned title columns (lowercase, trimmed, and punctuation removed):")
display(df[['Title', 'cleaned_title']].head())
display(df_l2[['Book-Title', 'cleaned_book_title']].head())

DataFrames with cleaned title columns (lowercase, trimmed, and punctuation removed):


Unnamed: 0,Title,cleaned_title
0,Its Only Art If Its Well Hung!,its only art if its well hung
1,Dr. Seuss: American Icon,dr seuss american icon
2,Wonderful Worship in Smaller Churches,wonderful worship in smaller churches
3,Whispers of the Wicked Saints,whispers of the wicked saints
4,"Nation Dance: Religion, Identity and Cultural ...",nation dance religion identity and cultural di...


Unnamed: 0,Book-Title,cleaned_book_title
0,Classical Mythology,classical mythology
1,Clara Callan,clara callan
2,Decision in Normandy,decision in normandy
3,Flu: The Story of the Great Influenza Pandemic...,flu the story of the great influenza pandemic ...
4,The Mummies of Urumchi,the mummies of urumchi


In [14]:
merged_cleaned_df = pd.merge(df, df_l2, left_on='cleaned_title', right_on='cleaned_book_title', how='inner')


print("Merged Cleaned DataFrame:")
display(merged_cleaned_df.head())
merged_cleaned_df.info()

Merged Cleaned DataFrame:


Unnamed: 0,Title,Description,Authors,Publisher,PublishedDate,Categories,cleaned_title,ISBN,Book-Title,Book-Author,cleaned_book_title
0,Alaska Sourdough,"""Sourdough is a magical food"", as author Ruth ...",['Ruth Allman'],Alaska Northwest Books,1976,['Cooking'],alaska sourdough,882400851,Alaska Sourdough,Ruth Allman,alaska sourdough
1,Eyewitness Travel Guide to Europe,The DK Eyewitness Travel Guide: Eastern and Ce...,"['Dorling Kindersley Publishing Staff', 'Jonat...",,2015-06-15,['Europe'],eyewitness travel guide to europe,789480662,Eyewitness Travel Guide to Europe,Dk Travel Writers,eyewitness travel guide to europe
2,History of Magic and the Occult,"See the history of witchcraft, magic and super...",['DK'],Dorling Kindersley Ltd,2020-08-06,"['Body, Mind & Spirit']",history of magic and the occult,517550083,History of Magic and the Occult,Kurt Seligmann,history of magic and the occult
3,Seance,"Mysterieus, duister en passioneel Londen, 19de...",['Kevin Valgaeren'],Lannoo Meulenhoff - Belgium,2016-02-13,['Fiction'],seance,330302736,Seance,Andre Launay,seance
4,"Mary, Mary",,['United States. Coast Guard'],,1976,['Merchant marine'],mary mary,446600547,"Mary, Mary",Evan Hunter,mary mary


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39949 entries, 0 to 39948
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Title               39949 non-null  object
 1   Description         39880 non-null  object
 2   Authors             39949 non-null  object
 3   Publisher           39937 non-null  object
 4   PublishedDate       39949 non-null  object
 5   Categories          39639 non-null  object
 6   cleaned_title       39949 non-null  object
 7   ISBN                39949 non-null  object
 8   Book-Title          39949 non-null  object
 9   Book-Author         39949 non-null  object
 10  cleaned_book_title  39949 non-null  object
dtypes: object(11)
memory usage: 3.4+ MB
