In [None]:
# !pip install scikit-learn

In [4]:
import pandas as pd
import requests
import re
import unicodedata
import string
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
import numpy as np
from typing import Iterable
from scipy.sparse import csr_matrix

In [5]:
url = "https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv"
r = requests.get(url)
with open("pg_catalog.csv", "wb") as f:
    f.write(r.content)

In [6]:
df = pd.read_csv("pg_catalog.csv").fillna("")
df.head()

Unnamed: 0,Text#,Type,Issued,Title,Language,Authors,Subjects,LoCC,Bookshelves
0,1,Text,1971-12-01,The Declaration of Independence of the United ...,en,"Jefferson, Thomas, 1743-1826","United States -- History -- Revolution, 1775-1...",E201; JK,Politics; American Revolutionary War; United S...
1,2,Text,1972-12-01,The United States Bill of Rights\r\nThe Ten Or...,en,United States,Civil rights -- United States -- Sources; Unit...,JK; KF,Politics; American Revolutionary War; United S...
2,3,Text,1973-11-01,John F. Kennedy's Inaugural Address,en,"Kennedy, John F. (John Fitzgerald), 1917-1963",United States -- Foreign relations -- 1961-196...,E838,"Category: Essays, Letters & Speeches; Category..."
3,4,Text,1973-11-01,Lincoln's Gettysburg Address\r\nGiven November...,en,"Lincoln, Abraham, 1809-1865",Consecration of cemeteries -- Pennsylvania -- ...,E456,"US Civil War; Category: Essays, Letters & Spee..."
4,5,Text,1975-12-01,The United States Constitution,en,United States,United States -- Politics and government -- 17...,JK; KF,United States; Politics; American Revolutionar...


In [7]:
# caractères à garder tels quels
_KEEP = set("-'")  # utile pour noms composés et titres
_TRANS_TABLE = str.maketrans({c: " " for c in string.punctuation if c not in _KEEP})


def remove_accents(s: str) -> str:
    """
    La normalisation NFKD (Normalization Form KD = Compatibility Decomposition) décompose les caractères en leur forme de base + diacritiques.
    Par exemple:
        "e" --> "e"
        "è" --> "e`"
    La fonction 'unicodedata.normalize' opère cette séparation
    La fonction 'unicodedata.combining' remplace chaque caractère par un entier différent de 0 si c'est un accent, 0 si c'est un accent.
    """
    s = unicodedata.normalize("NFKD", s)
    return "".join(ch for ch in s if not unicodedata.combining(ch))

def base_normalize(s: str) -> str:
    if not s: 
        return ""
    # suppression de la casse
    s = s.casefold()
    s = remove_accents(s)
    s = s.translate(_TRANS_TABLE)          # ponctuation → espaces (sauf - et ')
    # compacter les espaces multiples et supprimer les espaces en début et fin de chaine
    s = re.sub(r"\s+", " ", s).strip()
    return s

def normalize_title(title: str) -> dict:
    # À enrichir si on veut aller plus loin, par exemple supprimer les articles
    norm = base_normalize(title)
    return norm

def normalize_authors(authors: str) -> list:
    if not authors:
        return []
    return sorted(set(normalize_author(author) for author in authors.split(";")))

def check_initials(potential_initials: str, potential_full_name: str) -> bool:
    initials = [initial for initial in potential_initials.split(" ") if initial]
    names = [name for name in potential_full_name.split(" ") if name]
    if len(initials) == len(names):
        for initial, name in zip(initials, names):
            if initial[0] != name[0]:
                return False
        return True
    return False

def remove_all_potential_initials(s: str) -> str:
    all_potential_initials = re.findall(r"((?:(?:\w+ )|(?:\w\. ))+)\(((?:\w+ ?)+?)\)", s)
    if all_potential_initials:
        for (potential_initials, potential_full_name) in all_potential_initials:
            if check_initials(potential_initials, potential_full_name):
                s = s.replace(potential_initials, potential_full_name)
    return s
            
def normalize_author(author: str) -> str:
    """
    Nettoie les auteurs pour l'autocomplete:
      - supprime dates (chiffres), contenus entre ()/[]/{}
      - compresse initiales 'J. K.' → 'jk'
      - garde - et ' pour les noms composés (dumas, o'connor)
    """
    if not author: 
        return ""
    s = author
    s = remove_all_potential_initials(s)
    # retirer parenthèses / crochets / accolades et leur contenu
    s = re.sub(r"[\(\[\{].*?[\)\]\}]", " ", s)
    # retirer chiffres (dates, numéros)
    s = re.sub(r"\d+", " ", s)
    # normalisation de base (accents, casse, ponctuation)
    s = base_normalize(s)
    # compacter initiales restantes qui n'ont pas pu être supprimées: "j. k." -> "jk"; "j k" -> "jk"
    s = re.sub(r"\b([a-z])\b(?:\s+|\.)", r"\1", s)  # colle les lettres isolées
    # Retirer les tirets résiduels en début et fin de chaine
    s = re.sub(r"^-*", "", s)
    s = re.sub(r"-*$", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

df["title_norm"] = df["Title"].apply(lambda s: normalize_title(s))
df["auths_norm"] = df["Authors"].apply(lambda s: normalize_authors(s))
df["auth_norm"] = df["auths_norm"].apply(lambda s: "; ".join(s))

In [85]:
auth_norm_all = df["auth_norm"].drop_duplicates().sort_values().to_list()
title_norm_all = df["title_norm"].sort_values().to_list()
real_norm_title_matching = df.set_index("title_norm")["Title"].to_dict()
real_norm_auth_matching = df.set_index("auth_norm")["Authors"].to_dict()

In [53]:
def match_all_3grams_from_list(L: list[str]) -> defaultdict[str, set[str]]:
    trigrams = defaultdict(set)
    for elem in L:
        for i in range(max(0, 1 + len(elem) - 3)):
            trigrams[elem[i: i + 3]].add(elem)
    return trigrams

auth_3grams = match_all_3grams_from_list(auth_norm_all)
title_3grams = match_all_3grams_from_list(title_norm_all)

In [54]:
def idf_score(trigram, is_title=True):
    if is_title:
        corpus = title_norm_all
        subcorpus = title_3grams.get(trigram, set())
    else:
        corpus = auth_norm_all
        subcorpus = auth_3grams.get(trigram, set())
    return 1 + np.log((1 + len(corpus)) / (1 + len(subcorpus)))

title_idfs = {trigram: idf_score(trigram, is_title=True) for trigram in title_3grams}
auth_idfs = {trigram: idf_score(trigram, is_title=False) for trigram in auth_3grams}

In [55]:
def get_all_3grams_from_string(s: str) -> list[str]:
    return [s[i: i + 3] for i in range(max(0, 1 + len(s) - 3))]
get_all_3grams_from_string("bonjour")

['bon', 'onj', 'njo', 'jou', 'our']

In [56]:
title_index_matching = {title: index for index, title in enumerate(title_norm_all)}
title_trigram_index_matching = {trigram: index for index, trigram in enumerate(title_3grams)}

auth_index_matching = {auth: index for index, auth in enumerate(auth_norm_all)}
auth_trigram_index_matching = {trigram: index for index, trigram in enumerate(auth_3grams)}

In [57]:
data, rows, cols = [], [], []
for title, i in title_index_matching.items():
    trigrams = get_all_3grams_from_string(title)
    for trigram in trigrams:
        j = title_trigram_index_matching[trigram]
        data.append(title_idfs[trigram])
        rows.append(i)
        cols.append(j)
title_weights = csr_matrix((data, (rows, cols)), shape=(len(title_norm_all), len(title_3grams)), dtype="float32") 

In [58]:
data, rows, cols = [], [], []
for auth, i in auth_index_matching.items():
    trigrams = get_all_3grams_from_string(auth)
    for trigram in trigrams:
        j = auth_trigram_index_matching[trigram]
        data.append(auth_idfs[trigram])
        rows.append(i)
        cols.append(j)
auth_weights = csr_matrix((data, (rows, cols)), shape=(len(auth_norm_all), len(auth_3grams)), dtype="float32") 

In [102]:
def norm(L: Iterable) -> float:
    return np.sqrt(sum(x ** 2 for x in L))

def get_all_posible_choices(trigrams, is_title=True):
    if not trigrams:
        return set()
    if is_title:
        return set.intersection(*(title_3grams.get(trigram, set()) for trigram in trigrams))
    return set.intersection(*(auth_3grams.get(trigram, set()) for trigram in trigrams))

def get_most_relevant_choice(q, is_title=True):
    if is_title:
        q = normalize_title(q)
    else:
        q = "; ".join(normalize_authors(q))
    trigrams = get_all_3grams_from_string(q)
    posible_choices = get_all_posible_choices(trigrams, is_title=is_title)
    if is_title:
        idfs = title_idfs
        weights = title_weights
        indexes = title_index_matching
    else:
        idfs = auth_idfs
        weights = auth_weights
        indexes = auth_index_matching
    w_q = np.array([
        idfs[trigram] if trigram in trigrams else 0
        for trigram in idfs
    ])
    best_score = 0
    best_choice = ""
    for choice in posible_choices:
        w_d = weights[indexes[choice]]
        score = float(w_d @ w_q) / norm(w_q) / np.sqrt(w_d.multiply(w_d).sum())
        if score > best_score:
            best_score = score
            best_choice = choice
    if is_title:
        return real_norm_title_matching.get(best_choice, "")
    return real_norm_auth_matching.get(best_choice, "")

In [113]:
get_most_relevant_choice("dosto", is_title=False)

  score = float(w_d @ w_q) / norm(w_q) / np.sqrt(w_d.multiply(w_d).sum())


'Dostoyevsky, Fyodor, 1821-1881'

In [None]:
def get_sublist_from_indexes(L, indexes):
    return [x for i, x in enumerate(L) if int(indexes[i])]

def get_all_indexes(L):
    N = len(L)
    return [
        list(format(k, f"0{N}b"))
        for k in range(1, 2**N)
    ]

def get_all_sublists(L):
    return [
        get_sublist_from_indexes(L, indexes)
        for indexes in get_all_indexes(L)
    ]