In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import re
import google.generativeai as genai
from config import GOOGLE_API_KEY
from config import ISBNDB_API_KEY
import time
import requests
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
items = pd.read_csv('data/items.csv')

# Cleaning the original dataset

In [3]:
items['ISBN'] = items['ISBN Valid'].str.split(';').str[0]
items['title_clean'] = items['Title'].apply(lambda x: x.rstrip(' /') if isinstance(x, str) else np.nan)
items['google_api_title'] = items['title_clean'].str.split(' :').str[0]
(items['google_api_title'].isna().sum())
# No missing data for titles, therefore the title embedding is unaffected by data enhancing methods
def get_clean_author(author_series):
    def clean_author_string(author):
        if pd.isna(author):
            return None
        # Remove all digits and dashes
        cleaned = re.sub(r'[\d\-]', '', author)
        # Collapse multiple spaces and strip
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        return cleaned if cleaned else None

    clean_author = author_series.apply(clean_author_string)
    return clean_author

items['author_clean'] = get_clean_author(items['Author'])

In [4]:
token_count = items['google_api_title'].apply(lambda x: len(str(x).split()))
average_token_count = token_count.mean()
print('Average batch size:',2048/average_token_count)

Average batch size: 511.5065906603728


In [None]:
# Configure your API key
genai.configure(api_key=GOOGLE_API_KEY)

# The name of the embedding model
embedding_model_name = 'models/embedding-001'

embeddings = []
batch_size = 400
# Loop through the items in batches
for i in range(0, len(items), batch_size):
    batch = items.iloc[i:i+batch_size]
    texts_to_embed = batch['google_api_title'].tolist()
    responses = genai.embed_content(model=embedding_model_name, content=texts_to_embed)
    embedding_batch = responses['embedding']
    embeddings += embedding_batch


In [None]:
items['embedded_title'] = embeddings
items.iloc[:7500].to_csv('embeddings_part1.csv', index = False)
items.iloc[7500:].to_csv('embeddings_part2.csv', index = False)

In [None]:

def get_isbn_by_title(book_title):
    base_url = "https://www.googleapis.com/books/v1/volumes"
    params = {
        "q": f'intitle:"{book_title}"',
        "maxResults": 10  # Check up to 10 results
    }
    response = requests.get(base_url, params=params)
    data = response.json()

    if "items" not in data:
        print("No books found.")
        return None

    for item in data["items"]:
        volume_info = item.get("volumeInfo", {})
        identifiers = volume_info.get("industryIdentifiers", [])
        for identifier in identifiers:
            if identifier["type"] == "ISBN_13":
                return identifier["identifier"]

    print("No ISBN-13 found in any of the results.")
    return None

# Example usage
isbn = get_isbn_by_title("Thinking, Fast and Slow")
print("ISBN-13:", isbn)

ISBN-13: 9781429969352


In [None]:
# This is for all the books without an ISBN
# Can also directly use this to find the other info we're interested in
import requests

def get_isbn_by_title(book_title):
    base_url = "https://www.googleapis.com/books/v1/volumes"
    params = {
        "q": f'intitle:"{book_title}"',
        "maxResults": 3  # Limit search to top 10 results
    }
    response = requests.get(base_url, params=params)
    data = response.json()

    if "items" not in data:
        print("No books found.")
        return None

    for item in data["items"]:
        volume_info = item.get("volumeInfo", {})
        identifiers = volume_info.get("industryIdentifiers", [])
        for identifier in identifiers:
            if identifier.get("type") in ("ISBN_13", "ISBN_10"):
                return identifier.get("identifier")  # Return first available ISBN

    print("No ISBN found in any of the results.")
    return None

# Example usage
isbn = get_isbn_by_title("Thinking, Fast and Slow")
print("ISBN:", isbn)



ISBN: 9781429969352


In [23]:
def get_book_info_by_title(book_title):
    """Extracts missing data for books without an ISBN on the google books APi using the book's title. If no ISBN is found, the rest of the metadata of the first match is returned
    - book_title: str
    """
    base_url = "https://www.googleapis.com/books/v1/volumes"
    params = {
        "q": f'intitle:"{book_title}"',
        "maxResults": 10
    }
    response = requests.get(base_url, params=params)
    data = response.json()

    if "items" not in data:
        print("No books found.")
        return None

    first_valid_data = None  # fallback if no ISBN is found

    for item in data["items"]:
        volume_info = item.get("volumeInfo", {})

        # Extract fields early
        authors = volume_info.get("authors", [])
        language = volume_info.get("language", "")
        canonical_link = volume_info.get("canonicalVolumeLink", "")
        published_date = volume_info.get("publishedDate", "")
        image_link = volume_info.get("imageLinks", {}).get("thumbnail", "")
        description = volume_info.get("description", "")
        identifiers = volume_info.get("industryIdentifiers", [])
        subjects = volume_info.get('categories', [])

        entry_data = {
            "ISBN": None,
            "Author": ", ".join(authors),
            "Language": language,
            "CanonicalLink": canonical_link,
            "PublishedDate": published_date,
            "ImageLink": image_link,
            "Description": description,
            "Subjects": subjects
        }

        # Save the first result's metadata as fallback
        if first_valid_data is None:
            first_valid_data = entry_data

        # Check for ISBN (either type)
        for identifier in identifiers:
            if identifier.get("type") in ("ISBN_13", "ISBN_10"):
                entry_data["ISBN"] = identifier.get("identifier")
                return entry_data  # Found ISBN, return immediately

    # No ISBN found, return first available result info
    print("No ISBN found; returning fallback metadata.")
    return first_valid_data

# Data enhancing
## Google API
### Books with no ISBN

In [None]:
nan_books = items[items['ISBN Valid'].isna()]

In [28]:
enhanced_missing_isbn = nan_books['google_api_title'].apply(get_book_info_by_title)

No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No books found.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; returning fallback metadata.
No ISBN found; r

In [None]:
# Step 1: Handle NaNs
safe_results = [x if isinstance(x, dict) else {} for x in enhanced_missing_isbn]
# Step 2: Convert to DataFrame
info_df = pd.DataFrame(safe_results, index=nan_books.index)
# Handle subjects list -> sentence
info_df["Subjects"] = info_df["Subjects"].apply(
    lambda x: ", ".join(x) if isinstance(x, list) else ""
)

Unnamed: 0,ISBN,Author,Language,CanonicalLink,PublishedDate,ImageLink,Description,Subjects
261,,Lucius Annaeus Seneca,fr,https://play.google.com/store/books/details?id...,1932,http://books.google.com/books/content?id=m5qZA...,,Philosophy
264,,Lucius Annaeus Seneca,fr,https://play.google.com/store/books/details?id...,1932,http://books.google.com/books/content?id=m5qZA...,,Philosophy
269,2070108295,Charles Baudelaire,fr,https://books.google.com/books/about/Oeuvres_P...,1990,,,
367,2952930295,Jean-François Chevrier,fr,https://books.google.com/books/about/L_halluci...,2012,,L'Hallucination artistique (la formule est de ...,Art and literature
393,,Karl Wieland,fr,https://books.google.com/books/about/Les_droit...,1914,http://books.google.com/books/content?id=yo7dw...,,Mortgages
...,...,...,...,...,...,...,...,...
15195,9785392066186,Suisse,fr,https://books.google.com/books/about/Code_p%C3...,2015-11-25,http://books.google.com/books/content?id=2fQEC...,Code pénal suisse du 21 décembre 1937; Etat le...,Law
15203,9781453697481,"Scriblerus, Talia Felix, John Davidson, Lawren...",en,https://books.google.com/books/about/La_Commed...,2010-08-02,http://books.google.com/books/content?id=_em-q...,The Commedia dell'Arte is best known through t...,Drama
15232,,,fr,https://books.google.com/books/about/Payerne_v...,2015,,,
15245,,Anton Pavlovich Chekhov,ru,https://books.google.com/books/about/Polnoe_so...,1971,,,


In [31]:
# Step 3: Combine with original DataFrame
# handle the subjects format first: from list to comma separated sentence
nan_books_enhanced = nan_books.combine_first(info_df)
nan_books_enhanced

Unnamed: 0,Author,CanonicalLink,Description,ISBN,ISBN Valid,ImageLink,Language,PublishedDate,Publisher,Subjects,Title,author_clean,google_api_title,i,title_clean
261,"Seneca, Lucius Annaeus, l'Ancien",https://play.google.com/store/books/details?id...,,,,http://books.google.com/books/content?id=m5qZA...,fr,1932,Garnier frères,Philosophy,Controverses ; et Suasoires /,"Seneca, Lucius Annaeus, l'Ancien",Controverses ; et Suasoires,261,Controverses ; et Suasoires
264,"Seneca, Lucius Annaeus, l'Ancien",https://play.google.com/store/books/details?id...,,,,http://books.google.com/books/content?id=m5qZA...,fr,1932,Garnier,Philosophy,Controverses ; et Suasoires /,"Seneca, Lucius Annaeus, l'Ancien",Controverses ; et Suasoires,264,Controverses ; et Suasoires
269,"Baudelaire, Charles",https://books.google.com/books/about/Oeuvres_P...,,2070108295,,,fr,1990,puis Gallimard; La Pléiade,,"Oeuvres / Poésies de jeunesse, poésies diverse...","Baudelaire, Charles","Oeuvres / Poésies de jeunesse, poésies diverse...",269,"Oeuvres / Poésies de jeunesse, poésies diverse..."
367,"Quercy, Pierre",https://books.google.com/books/about/L_halluci...,L'Hallucination artistique (la formule est de ...,2952930295,,,fr,2012,F Alcan,Art and literature,L'hallucination /,"Quercy, Pierre",L'hallucination,367,L'hallucination
393,"Wieland, Karl, 1864-1936",https://books.google.com/books/about/Les_droit...,,,,http://books.google.com/books/content?id=yo7dw...,fr,1914,M Giard et E Brière,droits réels--* droit civil--Suisse; Sachenrec...,Les droits réels dans le Code civil suisse /,"Wieland, Karl,",Les droits réels dans le Code civil suisse,393,Les droits réels dans le Code civil suisse
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15195,Suisse,https://books.google.com/books/about/Code_p%C3...,Code pénal suisse du 21 décembre 1937; Etat le...,9785392066186,,http://books.google.com/books/content?id=2fQEC...,fr,2015-11-25,diff OFCL; Chancellerie fédérale,Strafrecht; Strafgesetzbuch; Droit pénal; Schw...,Code pénal suisse : [du 21 décembre 1937 (état...,,Code pénal suisse,15195,Code pénal suisse : [du 21 décembre 1937 (état...
15203,"Scriblerus, Talia Felix, John Davidson, Lawren...",https://books.google.com/books/about/La_Commed...,The Commedia dell'Arte is best known through t...,9781453697481,,http://books.google.com/books/content?id=_em-q...,en,2010-08-02,Sansoni Antiquariato,commedia dell'arte--[anthologie],La Commedia dell'Arte : storia e testo /,,La Commedia dell'Arte,15203,La Commedia dell'Arte : storia e testo
15232,,https://books.google.com/books/about/Payerne_v...,,,,,fr,2015,Editions du Caïon rodze,,"Payerne vracs : [ville, rues, archives, campag...",,Payerne vracs,15232,"Payerne vracs : [ville, rues, archives, campag..."
15245,"Chekhov, Anton Pavlovich",https://books.google.com/books/about/Polnoe_so...,,,,,ru,1971,Nauka,,Polnoe sobranie sochineniĭ i pisem : v tridt︠s...,"Chekhov, Anton Pavlovich",Polnoe sobranie sochineniĭ i pisem,15245,Polnoe sobranie sochineniĭ i pisem : v tridt︠s...


In [None]:
nan_books_enhanced.to_csv('google_api_enhanced/nan_books_enhanced.csv', index = False)

### Books with ISBNs

In [67]:
def search_google_books(isbn):
    params = {
        'q': f'isbn:{isbn}',
        'maxResults': 1,
        'printType': 'books',
        'projection': 'full'
        #'key': 'YOUR_API_KEY'  # optional
    }
    response = requests.get('https://www.googleapis.com/books/v1/volumes', params=params)
    book = response.json()
    
    if 'items' not in book:
        return {}
    else: 
        volume_info = book['items'][0]['volumeInfo']
        # Extract relevant fields 
        authors = volume_info.get("authors", [])
        language = volume_info.get("language", "")
        canonical_link = volume_info.get("canonicalVolumeLink", "")
        published_date = volume_info.get("publishedDate", "")
        image_link = volume_info.get("imageLinks", {}).get("thumbnail", "")
        description = volume_info.get("description", "")
        subjects = volume_info.get('categories', [])

    entry_data = {
        "Author": ", ".join(authors),
        "Language": language,
        "CanonicalLink": canonical_link,
        "PublishedDate": published_date,
        "ImageLink": image_link,
        "Description": description,
        "Subjects": subjects
    }
        
    return entry_data

In [8]:
not_nan_books = items[items['ISBN Valid'].notna()].copy()
not_nan_books

Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i,ISBN,title_clean,google_api_title,author_clean
0,Classification décimale universelle : édition ...,,9782871303336; 2871303339,Ed du CEFAL,Classification décimale universelle; Indexatio...,0,9782871303336,Classification décimale universelle : édition ...,Classification décimale universelle,
1,Les interactions dans l'enseignement des langu...,"Cicurel, Francine, 1947-",9782278058327; 2278058320,Didier,didactique--langue étrangère - enseignement; d...,1,9782278058327,Les interactions dans l'enseignement des langu...,Les interactions dans l'enseignement des langues,"Cicurel, Francine,"
2,Histoire de vie et recherche biographique : pe...,,2343190194; 9782343190198,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,2,2343190194,Histoire de vie et recherche biographique : pe...,Histoire de vie et recherche biographique,
3,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain, 1980-",9782365350020; 236535002X; 9782365350488; 2365...,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,3,9782365350020,Ce livre devrait me permettre de résoudre le c...,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain,"
4,Les années glorieuses : roman /,"Lemaitre, Pierre, 1951-",9782702180815; 2702180817; 9782702183618; 2702...,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,4,9782702180815,Les années glorieuses : roman,Les années glorieuses,"Lemaitre, Pierre,"
...,...,...,...,...,...,...,...,...,...,...
15285,Nouvelles orientales /,"Yourcenar, Marguerite",2070299732; 9782070299737,Gallimard,,15285,2070299732,Nouvelles orientales,Nouvelles orientales,"Yourcenar, Marguerite"
15286,Le vagabond de Tokyo /,"Fukutani, Takashi, 1952-2000",9782353480111; 235348011X; 9782353480241; 2353...,Le Lézard noir,Mangas,15286,9782353480111,Le vagabond de Tokyo,Le vagabond de Tokyo,"Fukutani, Takashi,"
15287,God of high school : le match contre les dieux /,"Park, Yong-Je",9782382880203; 2382880201; 9782382880210; 2382...,Kbooks,,15287,9782382880203,God of high school : le match contre les dieux,God of high school,"Park, YongJe"
15288,Blue Lock /,"Kaneshiro, Muneyuki",9782811650254; 2811650253; 9782811661274; 2811...,Pika,Compétitions; Football; Entraînement (sports);...,15288,9782811650254,Blue Lock,Blue Lock,"Kaneshiro, Muneyuki"


In [None]:
enhanced_isbn = []
batch_size = 10
# Loop through the items in batches
for i in range(0, 15500, batch_size):
    batch = not_nan_books.iloc[i:i+batch_size]
    enhanced_isbn_batch = batch['ISBN'].apply(search_google_books)
    enhanced_isbn.extend(enhanced_isbn_batch)

In [None]:
info_df = pd.DataFrame(enhanced_isbn, index=nan_books.index)
# Handle subjects list -> sentence
info_df["Subjects"] = info_df["Subjects"].apply(
    lambda x: ", ".join(x) if isinstance(x, list) else ""
    )
# Step 3: Combine with original DataFrame
# handle the subjects format first: from list to comma separated sentence
not_nan_books_enhanced = not_nan_books.combine_first(info_df)

In [None]:
not_nan_books_enhanced.to_csv('google_api_enhanced/not_nan_books_enhanced.csv', index = False)

In [None]:
not_nan_books_enhanced = pd.read_csv('google_api_enhanced/not_nan_books_enhanced.csv')

In [53]:
# Need to merge to handle the way it was stored
merged = not_nan_books.reset_index(drop=True).combine_first(
         not_nan_books_enhanced.reset_index(drop=True))
merged['author_clean'] = get_clean_author(merged['Author'])
merged['PublishedDate'] = merged['PublishedDate'][:4]

In [None]:
all_books_enhanced = pd.concat([nan_books_enhanced, merged], ignore_index=True)
all_books_enhanced.drop(columns = ['Author','ISBN Valid'],inplace = True)

In [None]:
all_books_enhanced.to_csv('google_api_enhanced/all_items_enhanced.csv', index = False)

In [46]:
print(all_books_enhanced['ImageLink'].notna().sum())

3857


In [None]:
all_books_enhanced = pd.read_csv('google_api_enhanced/all_items_enhanced.csv')
print(all_books_enhanced['Description'].notna().sum())
all_books_enhanced['PublishedDate'] = all_books_enhanced['PublishedDate'].astype(str).str[:4]
all_books_enhanced.fillna('', inplace = True)
# Creating all the combinations relevant for embedding
all_books_enhanced['title_description'] = all_books_enhanced['title_clean'] + ' ' + all_books_enhanced['Description']
all_books_enhanced['author_title_description'] = all_books_enhanced['author_clean'] + ' ' + all_books_enhanced['title_clean'] + ' ' + all_books_enhanced['Description']
all_books_enhanced['author_date_title_description'] = all_books_enhanced['author_clean'] + ' ' + all_books_enhanced['PublishedDate'] + ' ' + all_books_enhanced['title_clean'] + ' ' + all_books_enhanced['Description']
all_books_enhanced['author_date_title'] = all_books_enhanced['author_clean'] + ' ' + all_books_enhanced['PublishedDate'] + ' ' + all_books_enhanced['title_clean'] 
all_books_enhanced['author_date_title_subjects'] = all_books_enhanced['author_clean'] + ' ' + all_books_enhanced['PublishedDate'] + ' ' + all_books_enhanced['title_clean'] + ' ' + all_books_enhanced['Subjects']
all_books_enhanced['author_title_subjects'] = all_books_enhanced['author_clean'] + ' ' + ' ' + all_books_enhanced['title_clean'] + ' ' + all_books_enhanced['Subjects']

7067


In [57]:
# Prioritising top combinations 
top_columns = ['title_description','author_date_title_description','author_date_title_subjects']

In [219]:
# Configure your API key
genai.configure(api_key=GOOGLE_API_KEY)

# The name of the embedding model
def generate_embeddings(items, column = 'google_api_enhanced', batch_size=400):
    """
    Generate embeddings for a given DataFrame of items using the specified embedding model.

    Parameters:
    - items (pd.DataFrame): DataFrame containing the items to embed.
    - column (str): Name of the column to embed
    - batch_size (int): Number of items to process in each batch.

    Returns:
    - list: A list of embeddings.
    """
    embedding_model_name = 'models/embedding-001'
    embeddings = []
    for i in range(0, len(items), batch_size):
        batch = items.iloc[i:i+batch_size]
        texts_to_embed = batch[column].tolist()
        responses = genai.embed_content(model=embedding_model_name, content=texts_to_embed)
        embedding_batch = responses['embedding']
        embeddings += embedding_batch
    return embeddings

In [220]:
import os
def save_embeddings(dataframe, columns):
    split = round(len(dataframe) / 2)
    
    for column in tqdm(columns, desc="Embedding columns"):
        os.makedirs(f'final_items/{column}', exist_ok=True)
        
        for batch in tqdm([0, 1], desc=f"Batches for {column}", leave=False):
            items_batch = dataframe[batch * split : (batch + 1) * split].copy()
            embeddings = generate_embeddings(items_batch, column)
            items_batch['embedding'] = embeddings
            
            items_batch.to_csv(f'final_items/{column}/embeddings_part{batch+1}.csv', index=False)

In [None]:
split = round(len(all_books_enhanced)/2)
for column in top_columns:
    for batch in [0,1]:
        items_batch = all_books_enhanced[batch*split:(batch+1)*split].copy()
        embeddings = generate_embeddings(items_batch,column)
        items_batch['embedding'] = embeddings
        items_batch.to_csv(f'{column}/embeddings_part{batch+1}.csv')

## Using the ISBN Database

In [None]:
# Making use of the extra isbns that we got from the first data extraction
items.set_index('i', inplace=True)
print(items['ISBN'].isna().sum())
nan_books_enhanced.set_index('i',inplace=True)
items['ISBN'] = items['ISBN'].combine_first(nan_books_enhanced['ISBN'])
print(items['ISBN'].isna().sum())
items.reset_index(inplace=True)

723
351


In [None]:
# === CONFIGURATION ===
API_URL = 'https://api2.isbndb.com/books'
HEADERS = {
    'Authorization': ISBNDB_API_KEY,
    'accept': 'application/json',
    'Content-Type': 'application/json',
}
BATCH_SIZE = 100
API_SLEEP = 1  # seconds

# === LOAD DATAFRAME ===
df = items.copy()  # Ensure this is defined

# === EXTRACT FIRST VALID ISBN (10 or 13 digits) ===
def extract_isbn(text):
    if isinstance(text, str):
        for val in re.split(r'[;,]', text):
            cleaned = val.strip().replace('-', '')
            if cleaned.isdigit() and len(cleaned) in (10, 13):
                return cleaned
    return None

df['first_isbn'] = df['ISBN Valid'].apply(extract_isbn)

# === PREPARE UNIQUE ISBNs ===
unique_isbns = df['first_isbn'].dropna().drop_duplicates().tolist()
isbn_batches = [unique_isbns[i:i + BATCH_SIZE] for i in range(0, len(unique_isbns), BATCH_SIZE)]

# === FETCH BOOK DATA ===
book_data = []

for batch in tqdm(isbn_batches, desc="Fetching ISBNs", unit="batch"):
    try:
        payload_str = 'isbns=' + ','.join(batch)
        response = requests.post(API_URL, headers=HEADERS, data=payload_str)

        if response.status_code == 200:
            books = response.json().get('data', [])
            book_data.extend(books)
    except Exception:
        pass
    time.sleep(API_SLEEP)

# === SAVE RESULTS ===
books_df = pd.DataFrame(book_data)
books_df.to_csv('isbn_enriched_data.csv', index=False)

Fetching ISBNs: 100%|██████████| 145/145 [04:33<00:00,  1.88s/batch]


In [13]:
items

Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i,ISBN,title_clean,google_api_title,author_clean
0,Classification décimale universelle : édition ...,,9782871303336; 2871303339,Ed du CEFAL,Classification décimale universelle; Indexatio...,0,9782871303336,Classification décimale universelle : édition ...,Classification décimale universelle,
1,Les interactions dans l'enseignement des langu...,"Cicurel, Francine, 1947-",9782278058327; 2278058320,Didier,didactique--langue étrangère - enseignement; d...,1,9782278058327,Les interactions dans l'enseignement des langu...,Les interactions dans l'enseignement des langues,"Cicurel, Francine,"
2,Histoire de vie et recherche biographique : pe...,,2343190194; 9782343190198,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,2,2343190194,Histoire de vie et recherche biographique : pe...,Histoire de vie et recherche biographique,
3,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain, 1980-",9782365350020; 236535002X; 9782365350488; 2365...,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,3,9782365350020,Ce livre devrait me permettre de résoudre le c...,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain,"
4,Les années glorieuses : roman /,"Lemaitre, Pierre, 1951-",9782702180815; 2702180817; 9782702183618; 2702...,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,4,9782702180815,Les années glorieuses : roman,Les années glorieuses,"Lemaitre, Pierre,"
...,...,...,...,...,...,...,...,...,...,...
15286,Le vagabond de Tokyo /,"Fukutani, Takashi, 1952-2000",9782353480111; 235348011X; 9782353480241; 2353...,Le Lézard noir,Mangas,15286,9782353480111,Le vagabond de Tokyo,Le vagabond de Tokyo,"Fukutani, Takashi,"
15287,God of high school : le match contre les dieux /,"Park, Yong-Je",9782382880203; 2382880201; 9782382880210; 2382...,Kbooks,,15287,9782382880203,God of high school : le match contre les dieux,God of high school,"Park, YongJe"
15288,Blue Lock /,"Kaneshiro, Muneyuki",9782811650254; 2811650253; 9782811661274; 2811...,Pika,Compétitions; Football; Entraînement (sports);...,15288,9782811650254,Blue Lock,Blue Lock,"Kaneshiro, Muneyuki"
15289,Red eyes sword : akame ga kill ! Zero /,Takahiro,9782368522134; 2368522131; 9782368522141; 2368...,Kurokawa,Bandes dessinées; Mangas,15289,9782368522134,Red eyes sword : akame ga kill ! Zero,Red eyes sword,Takahiro


In [None]:
books_df.drop(['msrp','binding', 'isbn', 'edition', 'related', 'dewey_decimal','publisher', 'title_long', 'dimensions', 'dimensions_structured', 'pages'], axis = 1,inplace=True)

In [90]:
df_isbn13 = df[df['first_isbn'].str.len() == 13]
df_isbn13= df_isbn13.merge(books_df, how = 'left', left_on = 'first_isbn', right_on = 'isbn13')

In [91]:
df_isbn10 = df[df['first_isbn'].str.len() == 10]
df_isbn10= df_isbn10.merge(books_df, how = 'left', left_on = 'first_isbn', right_on = 'isbn10')

In [82]:
df_otherisbn = df[(df['first_isbn'].str.len() != 10) & (df['first_isbn'].str.len() != 13)]
df_otherisbn['first_isbn'].notna().sum()

np.int64(0)

In [163]:
df_all_isbn = pd.concat([df_isbn13, df_isbn10, df_otherisbn], ignore_index=True)
df_all_isbn.drop_duplicates(subset='i', inplace = True)
df_all_isbn

Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i,ISBN,title_clean,google_api_title,author_clean,...,synopsis,language,image,image_original,date_published,subjects,authors,title,isbn13,isbn10
0,Classification décimale universelle : édition ...,,9782871303336; 2871303339,Ed du CEFAL,Classification décimale universelle; Indexatio...,0,9782871303336,Classification décimale universelle : édition ...,Classification décimale universelle,,...,,fr,https://images.isbndb.com/covers/8447463483210...,https://images.isbndb.com/covers/original/8447...,2012-03-05,,[unknown author],Classification décimale universelle : Edition ...,9782871303336,2871303339
1,Les interactions dans l'enseignement des langu...,"Cicurel, Francine, 1947-",9782278058327; 2278058320,Didier,didactique--langue étrangère - enseignement; d...,1,9782278058327,Les interactions dans l'enseignement des langu...,Les interactions dans l'enseignement des langues,"Cicurel, Francine,",...,Agir professoral et pratiques de classe.<br/>U...,fr,https://images.isbndb.com/covers/2099063482999...,https://images.isbndb.com/covers/original/2099...,2011-11-09,"[Education & Teaching, Schools & Teaching]",[Francine Cicurel],Les interactions dans l'enseignement des langu...,9782278058327,2278058320
2,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain, 1980-",9782365350020; 236535002X; 9782365350488; 2365...,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,3,9782365350020,Ce livre devrait me permettre de résoudre le c...,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain,",...,Product Description<br/><br/><br/>Sylvain Maza...,fr,https://images.isbndb.com/covers/4356503483030...,https://images.isbndb.com/covers/original/4356...,2012-06-07,[Subjects],[Sylvain Sylvain mazas],ce livre devrait me permettre de resoudre le c...,9782365350020,236535002X
3,Les années glorieuses : roman /,"Lemaitre, Pierre, 1951-",9782702180815; 2702180817; 9782702183618; 2702...,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,4,9782702180815,Les années glorieuses : roman,Les années glorieuses,"Lemaitre, Pierre,",...,"La famille PelletierTrois histoires d’amour, u...",fr,https://images.isbndb.com/covers/2369653483150...,https://images.isbndb.com/covers/original/2369...,2022-01-25,"[Literature & Fiction, Genre Fiction, Literary]",[Pierre Lemaitre],Le Grand Monde,9782702180815,2702180817
4,100 idées pour mieux gérer les troubles de l'a...,"Lussier, Francine",9782353450428; 2353450423,Tom Pousse,Trouble déficitaire de l'attention avec ou san...,5,9782353450428,100 idées pour mieux gérer les troubles de l'a...,100 idées pour mieux gérer les troubles de l'a...,"Lussier, Francine",...,"Chaque jour, le parent, l'enseignant, est conf...",fr,https://images.isbndb.com/covers/2578953348302...,https://images.isbndb.com/covers/original/2578...,2011-01-01,[Teen & Young Adult],[],100 idées pour mieux gérer les troubles de l'a...,9782353450428,2353450423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15335,Code pénal suisse : [du 21 décembre 1937 (état...,,,diff OFCL; Chancellerie fédérale,Strafrecht; Strafgesetzbuch; Droit pénal; Schw...,15195,,Code pénal suisse : [du 21 décembre 1937 (état...,Code pénal suisse,,...,,,,,,,,,,
15336,La Commedia dell'Arte : storia e testo /,,,Sansoni Antiquariato,commedia dell'arte--[anthologie],15203,,La Commedia dell'Arte : storia e testo,La Commedia dell'Arte,,...,,,,,,,,,,
15337,"Payerne vracs : [ville, rues, archives, campag...",,,Editions du Caïon rodze,,15232,,"Payerne vracs : [ville, rues, archives, campag...",Payerne vracs,,...,,,,,,,,,,
15338,Polnoe sobranie sochineniĭ i pisem : v tridt︠s...,"Chekhov, Anton Pavlovich",,Nauka,,15245,,Polnoe sobranie sochineniĭ i pisem : v tridt︠s...,Polnoe sobranie sochineniĭ i pisem,"Chekhov, Anton Pavlovich",...,,,,,,,,,,


In [155]:
# Remove <br/> and <br> tags from the 'synopsis' column in merged
df_all_isbn['synopsis'] = df_all_isbn['synopsis'].replace({r'<br\s*/?>': ' '}, regex=True)
# Convert from list of str to str
for column in ['authors','subjects']:
    df_all_isbn[column] = df_all_isbn[column].apply(
        lambda x: ", ".join(x) if isinstance(x, list) else ""
    )
df_all_isbn.to_csv('isbndb_enhanced_items.csv',index=False)

In [None]:
# Handling 1 row that misbehaves when opening the file
df = pd.read_csv('isbndb_enhanced_items.csv', engine='python')
df = df[df['i'].apply(lambda x: str(x).isdigit())]
df.to_csv('isbndb_enhanced_items.csv',index=False)

### Combining the data from both data enhancing techniques

In [229]:
df_all_items = pd.read_csv('google_api_enhanced/all_items_enhanced.csv').sort_values('i',ascending = True)
df_all_isbn = pd.read_csv('isbndb_enhanced_items.csv')
# Rename columns in df_all_isbn to match df_all_items

column_map = {
    'ImageLink': 'image',
    'Language': 'language',
    'PublishedDate': 'date_published',
    'Subjects': 'subjects',
    'author_clean': 'authors',
    'title_clean': 'title',
    'Description': 'synopsis'  # Assuming this is the corresponding column
}


In [231]:
df_all_items

Unnamed: 0.1,Unnamed: 0,CanonicalLink,Description,ISBN,ImageLink,Language,PublishedDate,Publisher,Subjects,Title,author_clean,google_api_title,i,title_clean
723,723,https://books.google.com/books/about/Classific...,,9782871303336,,fr,2012,Ed du CEFAL,Classification décimale universelle; Indexatio...,Classification décimale universelle : édition ...,UDC Consortium (The Hague),Classification décimale universelle,0,Classification décimale universelle : édition ...
724,724,https://books.google.com/books/about/Les_inter...,C'est dans l'interaction en classe que s'actua...,9782278058327,,fr,2011,Didier,didactique--langue étrangère - enseignement; d...,Les interactions dans l'enseignement des langu...,"Cicurel, Francine,",Les interactions dans l'enseignement des langues,1,Les interactions dans l'enseignement des langu...
725,725,https://books.google.com/books/about/Histoire_...,Depuis la parution en 1918 de l'ouvrage fondat...,2343190194,http://books.google.com/books/content?id=Q2PMD...,fr,2020,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,Histoire de vie et recherche biographique : pe...,"Aneta Slowik, Hervé Breton, Gaston Pineau",Histoire de vie et recherche biographique,2,Histoire de vie et recherche biographique : pe...
726,726,https://books.google.com/books/about/Ce_livre_...,,9782365350020,,fr,2012-06-07,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain,",Ce livre devrait me permettre de résoudre le c...,3,Ce livre devrait me permettre de résoudre le c...
727,727,https://books.google.com/books/about/Le_grand_...,"Trois histoires d'amour, un lanceur d'alerte, ...",9782702180815,http://books.google.com/books/content?id=f5u3z...,fr,,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,Les années glorieuses : roman /,"Lemaitre, Pierre,",Les années glorieuses,4,Les années glorieuses : roman
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15287,15287,https://books.google.com/books/about/Le_vagabo...,,9782353480111,,fr,,Le Lézard noir,Mangas,Le vagabond de Tokyo /,"Fukutani, Takashi,",Le vagabond de Tokyo,15286,Le vagabond de Tokyo
15288,15288,,,9782382880203,,,,Kbooks,,God of high school : le match contre les dieux /,"Park, YongJe",God of high school,15287,God of high school : le match contre les dieux
15289,15289,,,9782811650254,,,,Pika,Compétitions; Football; Entraînement (sports);...,Blue Lock /,"Kaneshiro, Muneyuki",Blue Lock,15288,Blue Lock
15290,15290,https://books.google.com/books/about/Red_Eyes_...,Plusieurs années avant les événements de Red E...,9782368522134,,fr,,Kurokawa,Bandes dessinées; Mangas,Red eyes sword : akame ga kill ! Zero /,Takahiro,Red eyes sword,15289,Red eyes sword : akame ga kill ! Zero


In [210]:
for target_col, source_col in column_map.items():
    df_all_items[target_col] = df_all_items[target_col].combine_first(df_all_isbn[source_col])

In [234]:
def clean_text(text):
    if isinstance(text, str):
        return text.replace('\n', ' ').replace('\r', ' ')
    return text

# Apply to all object (text) columns
for col in df_all_items.select_dtypes(include='object').columns:
    df_all_items[col] = df_all_items[col].apply(clean_text)

df_all_items['Description'] = df_all_items['Description'].replace({r'<br\s*/?>': ' '}, regex=True)

# Now save
df_all_items.to_csv('items_enhanced_final.csv', index=False)

### Embeddings on the dataset

In [None]:
df_all_items_before = pd.read_csv('google_api_enhanced/all_items_enhanced.csv').sort_values('i',ascending = True)
df_all_items_before.isna().sum()

Unnamed: 0              0
CanonicalLink        1304
Description          8224
ISBN                  351
ImageLink           11758
Language             1304
PublishedDate       14577
Publisher              25
Subjects              266
Title                   0
author_clean          993
google_api_title        0
i                       0
title_clean             0
dtype: int64

In [None]:
df_all_items.isna().sum()

Unnamed: 0             0
CanonicalLink       1304
Description         3171
ISBN                 351
ImageLink           1174
Language             271
PublishedDate       1929
Publisher             25
Subjects              52
Title                  0
author_clean         129
google_api_title       0
i                      0
title_clean            0
dtype: int64

In [212]:
df_all_items.fillna("", inplace=True)
df_all_items['PublishedDate'] = df_all_items['PublishedDate'].astype(str).str[:4]

In [213]:
def create_embeddings_columns(df):
    df['title_description'] = df['title_clean'] + ' ' + df['Description']
    df['date_title_description'] = df['PublishedDate'] + ' '+ df['title_clean'] + ' ' + df['Description']
    df['author_title_description'] = df['author_clean'] + ' ' + df['title_clean'] + ' ' + df['Description']
    df['author_date_title_description'] = df['author_clean'] + ' ' + df['PublishedDate'] + ' ' + df['title_clean'] + ' ' + df['Description']
    df['author_date_title'] = df['author_clean'] + ' ' + df['PublishedDate'] + ' ' + df['title_clean'] 
    df['author_date_title_subjects'] = df['author_clean'] + ' ' + df['PublishedDate'] + ' ' + df['title_clean'] + ' ' + df['Subjects']
    df['author_title_subjects'] = df['author_clean'] + ' ' + ' ' + df['title_clean'] + ' ' + df['Subjects']

In [217]:
create_embeddings_columns(df_all_items)
all_columns = ['title_clean','title_description','date_title_description','author_title_description','author_date_title_description','author_date_title','author_date_title_subjects','author_title_subjects']
priotity_columns = ['title_clean','title_description','date_title_description','author_date_title_description','author_date_title_subjects']

In [221]:
save_embeddings(df_all_items,priotity_columns)

Embedding columns: 100%|██████████| 5/5 [17:33<00:00, 210.68s/it]


### Use this to add missing open covers in the end

In [None]:
# Useless, already included with the ISBNDB
# Add image urls for those that are missing one
open_cover = all_books_enhanced['ISBN'].apply(lambda x: f"https://covers.openlibrary.org/b/isbn/{x}-L.jpg")

# Replace the df with the one I go for (nan books enhanced or not nan books enhanced or the concat of the two)
all_books_enhanced["ImageLink"] = all_books_enhanced["ImageLink"].combine_first(open_cover)
print(all_books_enhanced['ImageLink'].notna().sum())

all_books_enhanced['title_clean'] = all_books_enhanced['Title'].apply(lambda x: x.rstrip(' /') if isinstance(x, str) else np.nan)

all_books_enhanced.to_csv('google_api_enhanced/all_items_enhanced.csv', index = False)


In [227]:
missing_covers = df_all_items[df_all_items['ImageLink'] == ''].copy()
missing_covers['first_isbn'] = missing_covers['ISBN'].apply(extract_isbn)
missing_covers['first_isbn'].apply(lambda x: f"https://covers.openlibrary.org/b/isbn/{x}-L.jpg")

800      https://covers.openlibrary.org/b/isbn/97820706...
814      https://covers.openlibrary.org/b/isbn/97828203...
858      https://covers.openlibrary.org/b/isbn/97828896...
863      https://covers.openlibrary.org/b/isbn/97822030...
866      https://covers.openlibrary.org/b/isbn/97828838...
                               ...                        
15286    https://covers.openlibrary.org/b/isbn/20702997...
15287    https://covers.openlibrary.org/b/isbn/97823534...
15288    https://covers.openlibrary.org/b/isbn/97823828...
15289    https://covers.openlibrary.org/b/isbn/97828116...
15290    https://covers.openlibrary.org/b/isbn/97823685...
Name: first_isbn, Length: 1174, dtype: object

In [228]:
df_all_items.columns

Index(['Unnamed: 0', 'CanonicalLink', 'Description', 'ISBN', 'ImageLink',
       'Language', 'PublishedDate', 'Publisher', 'Subjects', 'Title',
       'author_clean', 'google_api_title', 'i', 'title_clean',
       'title_description', 'date_title_description',
       'author_title_description', 'author_date_title_description',
       'author_date_title', 'author_date_title_subjects',
       'author_title_subjects'],
      dtype='object')