In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import os
from dotenv import load_dotenv
import re
from typing import List, Dict
from sqlalchemy import create_engine

# Load variables from a local .env file (if one exists) into the process
# environment so the os.getenv() lookups in connect_db_engine() can see them.
load_dotenv()

# ==============================
# 1. Connexion à la base de données via SQLAlchemy
# ==============================
def connect_db_engine():
    """Build a SQLAlchemy engine for the PostgreSQL books database.

    Connection settings come from the environment variables ``DB_USER``,
    ``DB_PASSWORD``, ``DB_HOST``, ``DB_PORT`` and ``DB_NAME``; each one falls
    back to a local-development default when it is not set.

    The user name and password must be stored raw (not pre-encoded): they
    are percent-encoded here before being placed in the URI.

    Returns:
        The engine returned by ``create_engine`` for the assembled
        ``postgresql+psycopg2`` URI.
    """
    # Function-scope import keeps the file's top-level import block untouched.
    from urllib.parse import quote_plus

    db_user = os.getenv('DB_USER', 'postgres')
    db_pass = os.getenv('DB_PASSWORD', 'admin')
    db_host = os.getenv('DB_HOST', 'localhost')
    db_port = os.getenv('DB_PORT', '5432')
    db_name = os.getenv('DB_NAME', 'booksdb')

    # Reserved characters such as '@', ':' or '/' inside the credentials would
    # otherwise be parsed as URI delimiters and corrupt the connection string.
    db_uri = (
        f"postgresql+psycopg2://{quote_plus(db_user)}:{quote_plus(db_pass)}"
        f"@{db_host}:{db_port}/{db_name}"
    )
    return create_engine(db_uri)

# ==============================
# 2. Chargement des données
# ==============================
def load_data_from_db():
    """Load every book that has a non-empty description from the database.

    The ``livres`` table is queried through a fresh engine; the columns
    ``disponibilite`` and ``note`` are aliased to ``stock`` and ``rating``.
    A zero-based ``id`` column is then derived from the row position.

    Returns:
        A DataFrame with the columns ``id``, ``titre``, ``description``,
        ``image_url``, ``stock``, ``rating`` and ``prix``, or ``None`` when
        anything goes wrong (the error is printed).
    """
    engine = None
    try:
        engine = connect_db_engine()

        query = """
            SELECT titre, description, image_url, 
                   disponibilite as stock, note as rating, prix
            FROM livres
            WHERE description IS NOT NULL AND description != ''
        """
        df = pd.read_sql_query(query, engine)

        # Internal identifier: the positional row number, not the table's id.
        df = df.reset_index().rename(columns={"index": "id"})

        print(f"Données chargées: {len(df)} livres")
        return df
    except Exception as e:
        print(f"Erreur chargement données : {e}")
        return None
    finally:
        # The engine is private to this call; release its connection pool so
        # repeated calls do not accumulate open database connections.
        if engine is not None:
            engine.dispose()

# ==============================
# 3. Alternative: Function to inspect database schema
# ==============================
def inspect_database_schema():
    """Print and return the column names and types of the ``livres`` table.

    The information is read from ``information_schema.columns`` in ordinal
    order, which helps to confirm the exact column names used by the other
    queries in this file.

    Returns:
        A DataFrame with the columns ``column_name`` and ``data_type``, or
        ``None`` when the lookup fails (the error is printed).
    """
    engine = None
    try:
        engine = connect_db_engine()
        query = """
            SELECT column_name, data_type 
            FROM information_schema.columns 
            WHERE table_name = 'livres'
            ORDER BY ordinal_position;
        """
        columns_info = pd.read_sql_query(query, engine)
        print("Colonnes disponibles dans la table 'livres':")
        print(columns_info)
        return columns_info
    except Exception as e:
        print(f"Erreur inspection schéma : {e}")
        return None
    finally:
        # The engine is private to this call; release its connection pool so
        # repeated calls do not accumulate open database connections.
        if engine is not None:
            engine.dispose()

# ==============================
# 4. Prétraitement du texte
# ==============================
def preprocess_text(text):
    """Normalise free text for vectorisation.

    Falsy or missing values (``None``, ``""``, ``0``, NaN) yield an empty
    string. Anything else is converted to ``str``, lower-cased, stripped of
    every character that is neither a word character nor whitespace, and has
    its whitespace runs collapsed to single spaces.
    """
    if text and not pd.isna(text):
        lowered = str(text).lower()
        # Punctuation becomes a space so adjacent words are not glued together.
        without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
        collapsed = re.sub(r'\s+', ' ', without_punctuation)
        return collapsed.strip()
    return ""

# ==============================
# 5. Préparation TF-IDF
# ==============================
def prepare_tfidf_matrix(df):
    """Fit a TF-IDF vectoriser on each book's title plus cleaned description.

    Two columns are added to ``df`` in place: ``description_clean`` (the
    description passed through ``preprocess_text``) and ``combined_features``
    (title and cleaned description joined by a space).

    Returns:
        ``(vectorizer, tfidf_matrix)`` on success, or ``(None, None)`` when
        any step raises (the error is printed).
    """
    try:
        cleaned_descriptions = df['description'].apply(preprocess_text)
        df['description_clean'] = cleaned_descriptions

        titles = df['titre'].fillna('')
        df['combined_features'] = titles + ' ' + df['description_clean'].fillna('')

        # Unigrams and bigrams, ignoring terms seen in fewer than two
        # documents or in more than 80% of them.
        tfidf_settings = {
            'max_features': 5000,
            'stop_words': 'english',
            'ngram_range': (1, 2),
            'min_df': 2,
            'max_df': 0.8,
        }
        vectorizer = TfidfVectorizer(**tfidf_settings)
        tfidf_matrix = vectorizer.fit_transform(df['combined_features'])
        print(f"Matrice TF-IDF créée: {tfidf_matrix.shape}")
    except Exception as e:
        print(f"Erreur TF-IDF: {e}")
        return None, None
    return vectorizer, tfidf_matrix

# ==============================
# 6. Calcul Similarité Cosinus
# ==============================
def compute_cosine_similarity_matrix(tfidf_matrix, df):
    """Compute pairwise book similarity and a title -> row lookup.

    Args:
        tfidf_matrix: TF-IDF matrix with one row per book in ``df``.
        df: Book DataFrame; its ``titre`` column and index are used to build
            the lookup.

    Returns:
        ``(cosine_sim, book_indices)`` where ``cosine_sim`` is the result of
        ``cosine_similarity`` of the matrix with itself and ``book_indices``
        is a Series mapping each distinct title to the index label of its
        first occurrence. ``(None, None)`` is returned when any step raises
        (the error is printed).
    """
    try:
        cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
        book_indices = pd.Series(df.index, index=df['titre'])
        # Deduplicate on the *titles* (the index). Series.drop_duplicates()
        # compares the values instead, which are already unique row labels,
        # so repeated titles used to survive and made book_indices[title]
        # return a Series rather than a single row label.
        book_indices = book_indices[~book_indices.index.duplicated(keep='first')]
        print("Matrice de similarité cosinus calculée")
        return cosine_sim, book_indices
    except Exception as e:
        print(f"Erreur similarité: {e}")
        return None, None

# ==============================
# 7. Recommandations par titre
# ==============================
def get_recommendations_by_title(title, df, cosine_sim, book_indices, n=5):
    """Recommend the books most similar to the one identified by ``title``.

    Args:
        title: Exact title, or a fragment of one. When no exact match exists
            in ``book_indices`` the first title containing the fragment
            (case-insensitive, literal match) is used instead.
        df: Book DataFrame with the columns ``id``, ``titre``,
            ``description``, ``image_url``, ``rating``, ``stock``, ``prix``.
        cosine_sim: Square similarity matrix addressed by row position.
        book_indices: Series mapping titles to row positions.
        n: Maximum number of recommendations; values below 1 yield none.

    Returns:
        A list of dicts ordered by decreasing similarity, never containing
        the query book itself. An empty list is returned when the title
        cannot be resolved or when any step raises (the error is printed).
    """
    try:
        if title not in book_indices:
            # regex=False: the user's text is matched literally, so titles
            # containing '(', '+', '?' ... neither crash nor mis-match.
            candidates = df[
                df['titre'].str.contains(title, case=False, na=False, regex=False)
            ]
            if candidates.empty:
                return []
            title = candidates.iloc[0]['titre']

        idx = book_indices[title]
        if isinstance(idx, pd.Series):
            # The lookup still holds duplicate titles: use the first one.
            idx = idx.iloc[0]
        idx = int(idx)

        # Exclude the query book explicitly instead of dropping whichever
        # entry sorts first: with tied scores (e.g. identical descriptions)
        # the query book is not guaranteed to be at the head of the ranking.
        ranked = sorted(
            ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
            key=lambda pair: pair[1],
            reverse=True,
        )[:max(n, 0)]

        recommendations = []
        for pos, score in ranked:
            book = df.iloc[pos]
            raw_description = book['description']
            # Convert once so slicing and length check see the same string.
            description = '' if pd.isna(raw_description) else str(raw_description)
            recommendations.append({
                'id': int(book['id']),
                'titre': book['titre'],
                'description': description[:200] + ('...' if len(description) > 200 else ''),
                'image_url': book['image_url'],
                'rating': float(book['rating']) if pd.notna(book['rating']) else 0.0,
                'stock': int(book['stock']) if pd.notna(book['stock']) else 0,
                'prix': float(book['prix']) if pd.notna(book['prix']) else 0.0,
                'similarity_score': float(score)
            })
        return recommendations
    except Exception as e:
        print(f"Erreur recommandations par titre : {e}")
        return []

# ==============================
# 8. Recommandations par description
# ==============================
def get_recommendations_by_description(user_desc, df, vectorizer, tfidf_matrix, n=5):
    """Recommend books whose text best matches a free-form description.

    The description is cleaned with ``preprocess_text``, projected with the
    fitted ``vectorizer`` and compared to every row of ``tfidf_matrix``.
    Only the ``n`` highest-scoring rows are considered, and among those only
    rows with a strictly positive similarity are returned.

    Returns:
        A list of recommendation dicts ordered by decreasing similarity, or
        an empty list when any step raises (the error is printed).
    """
    try:
        query_vector = vectorizer.transform([preprocess_text(user_desc)])
        scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
        best_positions = scores.argsort()[::-1][:n]

        results = []
        for position in best_positions:
            score = scores[position]
            if not score > 0:
                # No shared vocabulary with the query: not worth suggesting.
                continue
            row = df.iloc[position]
            raw_description = row['description']
            ellipsis = '...' if len(str(raw_description)) > 200 else ''
            entry = {
                'id': int(row['id']),
                'titre': row['titre'],
                'description': raw_description[:200] + ellipsis,
                'image_url': row['image_url'],
                'rating': float(row['rating']) if pd.notna(row['rating']) else 0.0,
                'stock': int(row['stock']) if pd.notna(row['stock']) else 0,
                'prix': float(row['prix']) if pd.notna(row['prix']) else 0.0,
                'similarity_score': float(score),
            }
            results.append(entry)
        return results
    except Exception as e:
        print(f"Erreur recommandations par description : {e}")
        return []

# ==============================
# 9. Sauvegarde/Chargement modèle
# ==============================
def save_model(filepath, vectorizer, tfidf_matrix, cosine_sim, book_indices, df):
    """Persist every recommender artefact to a single joblib file.

    Args:
        filepath: Destination file. Its parent directory is created when the
            path has one; a bare file name is written to the current
            working directory.
        vectorizer: Fitted TF-IDF vectoriser.
        tfidf_matrix: TF-IDF matrix of the catalogue.
        cosine_sim: Pairwise similarity matrix.
        book_indices: Title -> row lookup.
        df: Book DataFrame.

    Returns:
        ``True`` when the file was written, ``False`` otherwise (the error
        is printed).
    """
    try:
        # os.path.dirname() is '' for a bare file name and os.makedirs('')
        # raises, which used to make such saves fail; only create a
        # directory when the path actually names one.
        target_dir = os.path.dirname(filepath)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        model_data = {
            'tfidf_vectorizer': vectorizer,
            'tfidf_matrix': tfidf_matrix,
            'cosine_sim': cosine_sim,
            'book_indices': book_indices,
            'df': df
        }
        joblib.dump(model_data, filepath)
        print(f"Modèle sauvegardé dans {filepath}")
        return True
    except Exception as e:
        print(f"Erreur sauvegarde: {e}")
        return False

def load_model(filepath):
    """Restore the recommender artefacts written by ``save_model``.

    Returns:
        The tuple ``(vectorizer, tfidf_matrix, cosine_sim, book_indices,
        df)``, or five ``None`` values when loading fails (the error is
        printed).
    """
    # NOTE(review): joblib files are pickle-based, so loading one can run
    # arbitrary code; only load files from a trusted source.
    component_keys = (
        'tfidf_vectorizer',
        'tfidf_matrix',
        'cosine_sim',
        'book_indices',
        'df',
    )
    try:
        bundle = joblib.load(filepath)
        print(f"Modèle chargé depuis {filepath}")
        return tuple(bundle[key] for key in component_keys)
    except Exception as e:
        print(f"Erreur chargement modèle: {e}")
        return None, None, None, None, None

# ==============================
# 10. Programme principal
# ==============================
def main():
    """Train, persist and smoke-test the recommender end to end.

    Steps: print the ``livres`` schema, load the books, fit the TF-IDF
    model, compute the similarity matrix, save everything under
    ``models/recommendation_model.joblib``, then print sample
    recommendations for the first title and for a fixed description.
    The function returns early when loading or fitting fails.
    """
    # `display` only exists inside IPython/Jupyter sessions. Resolve it once
    # and fall back to print so the __main__ entry point also works when the
    # file is run as a plain script instead of raising NameError.
    try:
        show = display
    except NameError:
        show = print

    print("=== Inspection du schéma de la base ===")
    inspect_database_schema()

    print("\n=== Entraînement du système de recommandation ===")
    df = load_data_from_db()
    if df is None:
        return

    print("Aperçu des données:")
    show(df.head())
    print("\nTypes de colonnes:")
    show(df.dtypes)

    vectorizer, tfidf_matrix = prepare_tfidf_matrix(df)
    if vectorizer is None:
        return
    cosine_sim, book_indices = compute_cosine_similarity_matrix(tfidf_matrix, df)
    if cosine_sim is None:
        return
    save_model('models/recommendation_model.joblib', vectorizer, tfidf_matrix, cosine_sim, book_indices, df)

    print("\n=== Test recommandations par titre ===")
    # Use the first loaded title as the query so the test always has a match.
    if len(df) > 0:
        test_title = df.iloc[0]['titre']
        show(f"Test avec le titre: '{test_title}'")
        recs = get_recommendations_by_title(test_title, df, cosine_sim, book_indices, 3)
        for rec in recs:
            show(f"- {rec['titre']} (Score: {rec['similarity_score']:.3f})")
    else:
        print("Aucun livre trouvé pour tester")

    print("\n=== Test recommandations par description ===")
    recs = get_recommendations_by_description("I want a romantic adventure story", df, vectorizer, tfidf_matrix, 3)
    for rec in recs:
        show(f"- {rec['titre']} (Score: {rec['similarity_score']:.3f})")

# Run the full pipeline only when this file is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

=== Inspection du schéma de la base ===
Colonnes disponibles dans la table 'livres':
     column_name         data_type
0             id            bigint
1          titre              text
2    description              text
3           prix  double precision
4  disponibilite            bigint
5      image_url              text
6           note            bigint

=== Entraînement du système de recommandation ===
Données chargées: 998 livres
Aperçu des données:


Unnamed: 0,id,titre,description,image_url,stock,rating,prix
0,0,A Light in the Attic,It's hard to imagine a world without A Light i...,https://books.toscrape.com/media/cache/2c/da/2...,0,3,51.77
1,1,Tipping the Velvet,"""Erotic and absorbing...Written with starling ...",https://books.toscrape.com/media/cache/26/0c/2...,0,1,53.74
2,2,Soumission,"Dans une France assez proche de la nôtre, un h...",https://books.toscrape.com/media/cache/3e/ef/3...,0,1,50.1
3,3,Sharp Objects,"WICKED above her hipbone, GIRL across her hear...",https://books.toscrape.com/media/cache/32/51/3...,0,4,47.82
4,4,Sapiens: A Brief History of Humankind,From a renowned historian comes a groundbreaki...,https://books.toscrape.com/media/cache/be/a5/b...,0,5,54.23



Types de colonnes:


id               int64
titre           object
description     object
image_url       object
stock            int64
rating           int64
prix           float64
dtype: object

Matrice TF-IDF créée: (998, 5000)
Matrice de similarité cosinus calculée
Modèle sauvegardé dans models/recommendation_model.joblib

=== Test recommandations par titre ===


"Test avec le titre: 'A Light in the Attic'"

'- Quarter Life Poetry: Poems for the Young, Broke and Hangry (Score: 0.171)'

'- Twenty Love Poems and a Song of Despair (Score: 0.159)'

'- salt. (Score: 0.142)'


=== Test recommandations par description ===


'- The Course of Love (Score: 0.175)'

"- Charity's Cross (Charles Towne Belles #4) (Score: 0.168)"

'- Reasons to Stay Alive (Score: 0.142)'