In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv("tous_les_livres_523.csv")

In [3]:
print(df.head(2))

                  Titre    Prix            Disponibilité  \
0  A Light in the Attic  £51.77  In stock (22 available)   
1    Tipping the Velvet  £53.74  In stock (20 available)   

                                         Description   Note  \
0  It's hard to imagine a world without A Light i...  Three   
1  "Erotic and absorbing...Written with starling ...    One   

                                               Image  Page  
0  https://books.toscrape.com/media/cache/fe/72/f...     1  
1  https://books.toscrape.com/media/cache/08/e9/0...     1  


In [4]:
df["Description"] = (
    df["Description"]
    .str.strip()  # Supprimer espaces au début/fin
    .str.replace(r"\s+", " ", regex=True)  # Remplacer espaces multiples par un seul
    .str.replace(r"[^a-zA-ZÀ-ÿ0-9\s]", "", regex=True)  # Supprimer caractères spéciaux
)

In [5]:
df["Prix"] = df["Prix"].str.strip().str.replace("£", "").astype(float)

In [6]:
# Extraire uniquement le nombre et le convertir en int
df["Disponibilité"] = df["Disponibilité"].str.extract(r"(\d+)").astype(int)

In [7]:
# Extraire la note sous forme numérique (ex: "Three" →3).
# Dictionnaire de correspondance
note_map = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5
}

# Conversion
df["Note"] = df["Note"].map(note_map)

In [8]:
df.isnull().sum()

Titre            0
Prix             0
Disponibilité    0
Description      0
Note             0
Image            0
Page             0
dtype: int64

In [9]:
df["Description"] = df.apply(
    lambda row: row["Titre"] if len(str(row["Description"]).strip()) < 10 else row["Description"],
    axis=1
)

In [10]:
df=df.drop("Page", axis=1)

In [11]:
df = df.rename(columns={
    "Titre": "titre",
    "Prix": "prix",
    "Disponibilité": "disponibilite",
    "Description": "description",
    "Note": "note",
    "Image": "image"
})

In [12]:
df.to_csv("livres_bruts.csv", index=False, encoding="utf-8")

In [14]:
print(df.head(2))

                  titre   prix  disponibilite  \
0  A Light in the Attic  51.77             22   
1    Tipping the Velvet  53.74             20   

                                         description  note  \
0  Its hard to imagine a world without A Light in...     3   
1  Erotic and absorbingWritten with starling powe...     1   

                                               image  
0  https://books.toscrape.com/media/cache/fe/72/f...  
1  https://books.toscrape.com/media/cache/08/e9/0...  


In [156]:
# ! pip install sqlalchemy psycopg2

In [21]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import Integer, String, Float, Text
from dotenv import load_dotenv
import os

# ----- Charger les variables d'environnement -----
load_dotenv()  # lit le fichier .env
DB_URI = os.getenv("DATABASE_URL")  # utilise DATABASE_URL du .env

# 🔹 Chemin vers ton CSV
CSV_FILE = "livres_bruts.csv"

def load_books_to_db(csv_file: str, db_uri: str):
    try:
        # ----- Charger CSV -----
        df = pd.read_csv(csv_file)

        # ----- Ajouter un ID si nécessaire -----
        if 'id' not in df.columns:
            df.reset_index(inplace=True)
            df.rename(columns={'index': 'id'}, inplace=True)

        # ----- Vérification rapide -----
        print("\nAperçu des données:")
        display(df.head())
        print(f"\nNombre de livres: {len(df)}")
        print(f"Colonnes: {list(df.columns)}")
        print(f"Types: {df.dtypes}")

        # ----- Connexion à PostgreSQL -----
        engine = create_engine(db_uri)

        # ----- Sauvegarde dans PostgreSQL -----
        df.to_sql(
            "livres",
            engine,
             if_exists="append", # ou 'append' si tu veux ajouter
            index=False,
            chunksize=1000,
            dtype={
                "id": Integer,
                "titre": String(500),
                "description": Text,
                "prix": Float,
                "image": String(500),
                "disponibilite": Integer,
                "note": Integer
            }
        )
        print(" Données sauvegardées dans PostgreSQL avec succès !")

        # ----- Vérification -----
        result = pd.read_sql_query("SELECT COUNT(*) as total FROM livres", engine)
        print(f"Vérification: {result['total'].iloc[0]} livres dans la base")
    except Exception as e:
        print(f"Erreur lors du chargement des données : {e}")
# 🔹 Lancer le script
if __name__ == "__main__":
    load_books_to_db(CSV_FILE, DB_URI)



Aperçu des données:


Unnamed: 0,id,titre,prix,disponibilite,description,note,image
0,0,A Light in the Attic,51.77,22,Its hard to imagine a world without A Light in...,3,https://books.toscrape.com/media/cache/fe/72/f...
1,1,Tipping the Velvet,53.74,20,Erotic and absorbingWritten with starling powe...,1,https://books.toscrape.com/media/cache/08/e9/0...
2,2,Soumission,50.1,20,Dans une France assez proche de la nôtre un ho...,1,https://books.toscrape.com/media/cache/ee/cf/e...
3,3,Sharp Objects,47.82,20,WICKED above her hipbone GIRL across her heart...,4,https://books.toscrape.com/media/cache/c0/59/c...
4,4,Sapiens: A Brief History of Humankind,54.23,20,From a renowned historian comes a groundbreaki...,5,https://books.toscrape.com/media/cache/ce/5f/c...



Nombre de livres: 523
Colonnes: ['id', 'titre', 'prix', 'disponibilite', 'description', 'note', 'image']
Types: id                 int64
titre             object
prix             float64
disponibilite      int64
description       object
note               int64
image             object
dtype: object
Erreur lors du chargement des données : (psycopg2.errors.UniqueViolation) ERREUR:  la valeur d'une clé dupliquée rompt la contrainte unique « livres_pkey »
DETAIL:  La clé « (id)=(0) » existe déjà.

[SQL: INSERT INTO livres (id, titre, prix, disponibilite, description, note, image) VALUES (%(id__0)s, %(titre__0)s, %(prix__0)s, %(disponibilite__0)s, %(description__0)s, %(note__0)s, %(image__0)s), (%(id__1)s, %(titre__1)s, %(prix__1)s, %(disponibilite__ ... 63292 characters truncated ... __522)s, %(prix__522)s, %(disponibilite__522)s, %(description__522)s, %(note__522)s, %(image__522)s)]
[parameters: {'note__0': 3, 'prix__0': 51.77, 'description__0': 'Its hard to imagine a world without A Li

In [None]:
# ! pip install pydantic_settings



In [22]:
# Charger uniquement les titres et descriptions
db_uri = os.getenv("DATABASE_URL")
engine = create_engine(db_uri)
df = pd.read_sql("SELECT id, titre, description FROM livres", engine)
print(df.head())


   id                                  titre  \
0   0                   A Light in the Attic   
1   1                     Tipping the Velvet   
2   2                             Soumission   
3   3                          Sharp Objects   
4   4  Sapiens: A Brief History of Humankind   

                                         description  
0  Its hard to imagine a world without A Light in...  
1  Erotic and absorbingWritten with starling powe...  
2  Dans une France assez proche de la nôtre un ho...  
3  WICKED above her hipbone GIRL across her heart...  
4  From a renowned historian comes a groundbreaki...  


In [180]:
! pip install nltk

^C




In [24]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Rehmi
[nltk_data]     Salma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Rehmi
[nltk_data]     Salma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [26]:
def preprocess_text(text):
    text = str(text).lower()  # mettre en minuscules
    words = word_tokenize(text)  # tokenisation
    words = [word for word in words if word.isalpha() and word not in stop_words]  # filtrer
    stemmed_words = [stemmer.stem(word) for word in words]  # stemming
    return ' '.join(stemmed_words)  # reformer le texte


In [27]:
df['description'] = df['description'].apply(preprocess_text)
print(df["description"].head())

0    hard imagin world without light attic nowclass...
1    erot absorbingwritten starl powerth new york t...
2    dan une franc assez proch de la nôtre un homm ...
3    wick hipbon girl across heart word like road m...
4    renown historian come groundbreak narr human c...
Name: description, dtype: object


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# On fixe le nombre maximum de mots à garder dans la matrice TF-IDF
tfidf = TfidfVectorizer(max_features=5000)  # par exemple 5000 mots les plus fréquents
X_tfidf = tfidf.fit_transform(df['description'])

In [29]:
print("Dimensions : ", X_tfidf.shape)  # (n_messages, n_mots)
print("Quelques mots : ", tfidf.get_feature_names_out()[:10])  # Voir les 10 premiers mots

Dimensions :  (523, 5000)
Quelques mots :  ['aaron' 'abandon' 'abbi' 'abbot' 'abduct' 'abil' 'abl' 'abound' 'abra'
 'abraham']


In [30]:
# Transformer la matrice en DataFrame pour visualiser
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
print(tfidf_df.head())

   aaron  abandon  abbi  abbot  abduct     abil  abl  abound  abra  abraham  \
0    0.0      0.0   0.0    0.0     0.0  0.00000  0.0     0.0   0.0      0.0   
1    0.0      0.0   0.0    0.0     0.0  0.00000  0.0     0.0   0.0      0.0   
2    0.0      0.0   0.0    0.0     0.0  0.00000  0.0     0.0   0.0      0.0   
3    0.0      0.0   0.0    0.0     0.0  0.00000  0.0     0.0   0.0      0.0   
4    0.0      0.0   0.0    0.0     0.0  0.06711  0.0     0.0   0.0      0.0   

   ...  zeuss  zimbabw  zimbardo  zinger  zoe  zombi  zoo  zoologist  zorin  \
0  ...    0.0      0.0       0.0     0.0  0.0    0.0  0.0        0.0    0.0   
1  ...    0.0      0.0       0.0     0.0  0.0    0.0  0.0        0.0    0.0   
2  ...    0.0      0.0       0.0     0.0  0.0    0.0  0.0        0.0    0.0   
3  ...    0.0      0.0       0.0     0.0  0.0    0.0  0.0        0.0    0.0   
4  ...    0.0      0.0       0.0     0.0  0.0    0.0  0.0        0.0    0.0   

   zuko  
0   0.0  
1   0.0  
2   0.0  
3   0.0  


In [31]:
from sklearn.metrics.pairwise import cosine_similarity

# Calcul de la similarité cosinus
cosine_sim = cosine_similarity(X_tfidf, X_tfidf)

# cosine_sim est une matrice (n_books x n_books) contenant les scores de similarité
print(cosine_sim.shape)


(523, 523)


In [32]:
book_index = 0  # par exemple, le premier livre
similar_books = list(enumerate(cosine_sim[book_index]))
similar_books = sorted(similar_books, key=lambda x: x[1], reverse=True)

print("Livre de référence :", df.iloc[book_index]["titre"])
print("\nLivres similaires :")
for idx, score in similar_books[1:6]:  # les 5 plus similaires
    print(f"- {df.iloc[idx]['titre']} (score: {score:.2f})")


Livre de référence : A Light in the Attic

Livres similaires :
- salt. (score: 0.10)
- Whole Lotta Creativity Going On: 60 Fun and Unusual Exercises to Awaken and Strengthen Your Creativity (score: 0.10)
- The Artist's Way: A Spiritual Path to Higher Creativity (score: 0.09)
- Steal Like an Artist: 10 Things Nobody Told You About Being Creative (score: 0.08)
- You can't bury them all: Poems (score: 0.08)


In [33]:
import joblib

# Sauvegarder le TF-IDF et la matrice
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
joblib.dump(X_tfidf, "livre_vectors.pkl")
joblib.dump(cosine_sim, "cosine_sim_matrix.pkl")

print("✅ Modèles enregistrés !")

✅ Modèles enregistrés !
