In [1]:
import sys
import os

# Make the parent directory (the one containing IA and Database) importable
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

In [2]:
from Database.db import *
import time
from tqdm import tqdm
import torch
import seaborn

import pandas as pd
import numpy as np
import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.impute import SimpleImputer 

from sentence_transformers import SentenceTransformer,util
import hdbscan
from sklearn.metrics import silhouette_score
import warnings

from bertopic import BERTopic

from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Make sure the NLTK 'stopwords' resource is available before it is used.
# Only the English list is probed here: if that lookup raises LookupError,
# the 'stopwords' resource is downloaded once.
try:
    stopwords.words('english')
except LookupError:
    print("Téléchargement des listes de stopwords de NLTK...")
    nltk.download('stopwords')
    print("Téléchargement terminé.")

In [4]:
# Connection settings come from environment variables and are passed
# positionally to connect_to_db in this exact order.
engine = connect_to_db(
    *(os.environ[key] for key in ('DB_NAME', 'DB_USER', 'DB_PASSWORD', 'DB_HOST', 'DB_PORT'))
)

# Read the desired table
df = pd.read_sql_table("githubRepo", engine)

'<redacted: DB_NAME>'
'<redacted: DB_USER>'
'<redacted: DB_PASSWORD>'
'<redacted: DB_HOST>'
Connexion réussie : 1


In [5]:
def group_rare_categories(series, threshold_ratio=0.01, new_category_name='other'):
    """Collapse the infrequent categories of a Series into a single label.

    A category is rare when its number of occurrences is strictly below
    ``threshold_ratio * len(series)``. Missing values count towards that
    length but are never relabelled themselves.

    Args:
        series: pandas Series of category values (may contain missing values).
        threshold_ratio: fraction of the Series length under which a category
            is considered rare.
        new_category_name: label substituted for every rare category.

    Returns:
        The input Series itself when it holds no non-missing value, otherwise
        a new Series with rare categories replaced by ``new_category_name``.
    """
    non_missing = series.dropna()
    if non_missing.empty:
        return series

    frequencies = non_missing.value_counts()
    min_count = threshold_ratio * len(series)
    is_rare = (frequencies < min_count).to_numpy()
    rare_labels = frequencies.index[is_rare]
    return series.replace(rare_labels, new_category_name)

In [None]:
# --- 1. Fonctions de Nettoyage par Étape ---

def clean_text_features(df):
    """Step 1: clean the text columns and build the ``cleaned_text`` column.

    Works on a copy of ``df``; the input frame is left untouched.

    Args:
        df: DataFrame with at least the ``description_translated`` and
            ``topics`` columns. ``topics`` cells may be lists or strings.

    Returns:
        A new DataFrame where missing ``description_translated`` / ``topics``
        values are empty strings, plus three added columns:
        ``cleaned_description``, ``cleaned_topics`` and ``cleaned_text``
        (description and topics joined, whitespace-normalised).
    """
    print("\n[1/5] Nettoyage du texte...")

    # Normalise missing values: None / '' -> NaN, and an empty topic list -> NaN.
    df_processed = df.copy().replace([None, ''], np.nan)
    df_processed['topics'] = df_processed['topics'].apply(
        lambda x: np.nan if (isinstance(x, list) and not x) else x
    )
    # Assign the filled columns back explicitly: a chained
    # `df[col].fillna(..., inplace=True)` operates on a temporary object under
    # pandas Copy-on-Write and would silently leave the NaN values in place.
    df_processed['description_translated'] = df_processed['description_translated'].fillna('')
    df_processed['topics'] = df_processed['topics'].fillna('')

    # Stop words: NLTK English list plus domain words too generic to be informative.
    stop_words = set(stopwords.words('english'))
    custom_stop_words = {
        'repository', 'explore', 'contribute', 'project', 'github', 'app', 'application',
        'tool', 'library', 'framework', 'platform', 'model', 'gui', 'api', 'backend', 'interface'
    }
    stop_words.update(custom_stop_words)

    def keep_token(token):
        # Drop stop words and single-character tokens.
        return token not in stop_words and len(token) > 1

    def clean_text(text):
        """Lower-case, strip URLs and punctuation (hyphens are kept), drop stop words."""
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        text = re.sub(r'[^\w\s-]', '', text)
        return ' '.join(word for word in text.split() if keep_token(word))

    def clean_topics(topics_data):
        """Return the unique, sorted, lower-cased topics without stop words."""
        if isinstance(topics_data, list):
            topics_list = [str(t).lower() for t in topics_data]
        elif isinstance(topics_data, str):
            topics_list = topics_data.lower().split()
        else:
            return ""
        return ' '.join(topic for topic in sorted(set(topics_list)) if keep_token(topic))

    df_processed['cleaned_description'] = df_processed['description_translated'].apply(clean_text)
    df_processed['cleaned_topics'] = df_processed['topics'].apply(clean_topics)

    # Combine the cleaned description and topics, then collapse repeated whitespace.
    combined = df_processed['cleaned_description'] + ' ' + df_processed['cleaned_topics']
    df_processed['cleaned_text'] = combined.str.replace(r'\s+', ' ', regex=True).str.strip()

    return df_processed

In [7]:
def engineer_datetime_features(df, reference_time=None):
    """Step 2: build the date-based features.

    Works on a copy of ``df``. Rows whose ``created_at`` cannot be parsed are
    dropped. An ``updated_at`` that is missing or earlier than ``created_at``
    is replaced by ``created_at``.

    Args:
        df: DataFrame with ``created_at`` and ``updated_at`` columns.
        reference_time: optional timestamp used as "now" when computing ages.
            Defaults to the current time, in the timezone of ``created_at``
            when that column is timezone-aware. Passing a fixed value makes
            the output reproducible.

    Returns:
        A new DataFrame with parsed ``created_at`` / ``updated_at`` and the
        added columns ``repo_age_days``, ``days_since_last_update`` and
        ``created_year``.
    """
    print("\n[2/5] Création des features temporelles...")
    df_processed = df.copy()

    df_processed['created_at'] = pd.to_datetime(df_processed['created_at'], errors='coerce')
    df_processed['updated_at'] = pd.to_datetime(df_processed['updated_at'], errors='coerce')
    df_processed.dropna(subset=['created_at'], inplace=True)

    created = df_processed['created_at']
    updated = df_processed['updated_at']
    # Vectorised repair: keeps the datetime dtype (also on an empty frame)
    # and avoids a slow Python-level row-wise apply.
    has_valid_update = updated.notna() & (updated >= created)
    df_processed['updated_at'] = updated.where(has_valid_update, created)

    if reference_time is None:
        # Match the column's timezone: subtracting a tz-aware column from a
        # naive timestamp raises TypeError. tz is None for naive data.
        current_time = pd.Timestamp.now(tz=created.dt.tz)
    else:
        current_time = pd.Timestamp(reference_time)

    df_processed['repo_age_days'] = (current_time - df_processed['created_at']).dt.days
    df_processed['days_since_last_update'] = (current_time - df_processed['updated_at']).dt.days
    df_processed['created_year'] = df_processed['created_at'].dt.year
    return df_processed

In [8]:
def clean_categorical_features(df):
    """Step 3: clean and group the categorical features.

    Works on a copy of ``df``. Missing ``license`` values become
    ``'No License'`` and rare licenses are grouped as ``'other_license'``.
    When a ``langue`` column exists, missing values become ``'Unknown'`` and
    rare languages are grouped as ``'other_language'``.

    Args:
        df: DataFrame with a ``license`` column and optionally ``langue``.

    Returns:
        A new DataFrame with the cleaned categorical columns.
    """
    print("\n[3/5] Nettoyage des features catégorielles...")
    df_processed = df.copy()

    # Fill then assign back: a chained `df[col].fillna(..., inplace=True)`
    # acts on a temporary object under pandas Copy-on-Write and is a no-op.
    license_filled = df_processed['license'].fillna('No License')
    df_processed['license'] = group_rare_categories(license_filled, new_category_name='other_license')

    if 'langue' in df_processed.columns:
        langue_filled = df_processed['langue'].fillna('Unknown')
        df_processed['langue'] = group_rare_categories(langue_filled, new_category_name='other_language')

    return df_processed

In [9]:
def transform_numeric_features(df):
    """Step 4: add log-transformed versions of the numeric count features.

    For each of ``stargazers_count`` and ``forks_count`` present in ``df``,
    a ``<column>_log`` column holding ``log1p`` of the values (cast to float)
    is added to a copy of the frame.

    Args:
        df: input DataFrame; it is not modified.

    Returns:
        A new DataFrame with the extra ``*_log`` columns.
    """
    print("\n[4/5] Transformation des features numériques...")
    df_processed = df.copy()

    count_columns = ('stargazers_count', 'forks_count')
    available = [name for name in count_columns if name in df_processed.columns]
    for name in available:
        as_float = df_processed[name].astype(float)
        df_processed[f'{name}_log'] = np.log1p(as_float)

    return df_processed

In [10]:
def finalize_dataset(df):
    """Step 5: keep the final set of columns and drop rows without text.

    Any expected column missing from ``df`` is created and filled with NaN.
    Rows whose ``cleaned_text`` equals the empty string are removed.

    Args:
        df: input DataFrame; it is not modified.

    Returns:
        A new DataFrame restricted to the final columns, in a fixed order.
    """
    print("\n[5/5] Sélection finale des colonnes...")

    final_columns = [
        'name', 'description_translated', 'cleaned_text', 'topics', 'license', 'langue',
        'stargazers_count', 'forks_count', 'stargazers_count_log', 'forks_count_log',
        'repo_age_days', 'days_since_last_update', 'created_year', 'html_url'
    ]

    df_processed = df.copy()
    absent = [name for name in final_columns if name not in df_processed.columns]
    for name in absent:
        df_processed[name] = np.nan

    selected = df_processed[final_columns]
    has_text = selected['cleaned_text'] != ''
    return selected[has_text].copy()

In [11]:
def run_full_preparation_pipeline(df_raw):
    """Run every cleaning and preparation step in sequence.

    The stages are applied in this order: text cleaning, datetime features,
    categorical cleaning, numeric transformation, final column selection.

    Args:
        df_raw: raw input DataFrame.

    Returns:
        The DataFrame produced by the last stage.
    """
    print("--- Début du Pipeline de Préparation Complet ---")

    prepared = (
        df_raw
        .pipe(clean_text_features)
        .pipe(engineer_datetime_features)
        .pipe(clean_categorical_features)
        .pipe(transform_numeric_features)
        .pipe(finalize_dataset)
    )

    print("\n--- Pipeline de Préparation Terminé ---")
    return prepared

In [None]:
# Run the complete preparation pipeline on the raw table
df_cleaned = run_full_preparation_pipeline(df)

# Preview of the final DataFrame
print("\n--- Aperçu du DataFrame 'Parfait' ---")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df_cleaned.info()
print("\n")
with pd.option_context('display.max_colwidth', 80):
    print(df_cleaned.head())

--- Début du Pipeline de Préparation Complet ---

[1/5] Nettoyage du texte...

[2/5] Création des features temporelles...

[3/5] Nettoyage des features catégorielles...

[4/5] Transformation des features numériques...

[5/5] Sélection finale des colonnes...

--- Pipeline de Préparation Terminé ---

--- Aperçu du DataFrame 'Parfait' ---
<class 'pandas.core.frame.DataFrame'>
Index: 7191 entries, 0 to 7193
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    7191 non-null   object 
 1   description_translated  7191 non-null   object 
 2   cleaned_text            7191 non-null   object 
 3   topics                  7191 non-null   object 
 4   license                 7191 non-null   object 
 5   langue                  7191 non-null   object 
 6   stargazers_count        7191 non-null   int64  
 7   forks_count             7191 non-null   int64  
 8   stargazers_count_log    7

In [None]:
# --- 2. Benchmark configuration ---

# Embedding models to compare, selected according to the E2 report
models_to_benchmark = [
    "Qwen/Qwen3-Embedding-0.6B",
    "sentence-transformers/all-MiniLM-L6-v2",
    "intfloat/multilingual-e5-large-instruct",
    "sentence-transformers/all-mpnet-base-v2",
    "BAAI/bge-large-en-v1.5"
]

# IMPROVEMENT 1: lemmatization
print("Chargement du modèle Spacy pour la lemmatisation...")
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])


def lemmatize(text):
    """Return ``text`` with each spaCy token replaced by its lemma, space-joined."""
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])


print("Application de la lemmatisation sur le corpus...")
df_cleaned['lemmatized_text'] = df_cleaned['cleaned_text'].apply(lemmatize)
corpus = df_cleaned['lemmatized_text'].tolist()
# Whitespace-tokenized corpus, used for the coherence computation
tokenized_corpus = [doc.split() for doc in corpus]

# The gensim dictionary and bag-of-words corpus depend only on the corpus,
# so they are built once instead of once per (model, min_topic_size) run.
dictionary = Dictionary(tokenized_corpus)
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_corpus]

# IMPROVEMENT 2: try several topic granularities
min_topic_sizes_to_test = [15, 12, 10, 8, 5]

# IMPROVEMENT 3: custom vectorizer (unigrams and bigrams)
vectorizer_model = CountVectorizer(ngram_range=(1, 2), min_df=5, stop_words="english")

benchmark_results = []

# --- 3. Benchmark execution ---
# NOTE(review): no random seed is set anywhere in this cell; if the topic
# model's default pipeline is stochastic, results will differ between runs.
print("\n--- DÉBUT DU BENCHMARK ---")

for model in models_to_benchmark:
    print(f"\n{'='*80}")
    print(f"ÉVALUATION DU MODÈLE : {model}")
    print(f"{'='*80}")

    embedding_model = SentenceTransformer(model)

    for min_size in min_topic_sizes_to_test:
        print(f"\n--- Test avec min_topic_size = {min_size} ---")

        topic_model = BERTopic(
            embedding_model=embedding_model,
            min_topic_size=min_size,
            vectorizer_model=vectorizer_model,  # use the custom vectorizer
            verbose=False  # keep the output short
        )

        start_time = time.perf_counter()
        topics, _ = topic_model.fit_transform(corpus)
        duration = time.perf_counter() - start_time

        topic_info = topic_model.get_topic_info()
        # Topic -1 is the outlier bucket. Count real topics explicitly rather
        # than assuming an outlier row is always present (len - 1 would be
        # off by one when every document is assigned to a topic).
        is_outlier_row = topic_info['Topic'] == -1
        num_topics_found = int((~is_outlier_row).sum())
        num_outliers = int(topic_info.loc[is_outlier_row, 'Count'].sum())
        noise_percentage = (num_outliers / len(corpus)) * 100

        # Keep only topic terms present in the gensim dictionary: the
        # vectorizer also emits bigrams, which do not exist in the
        # unigram-tokenized corpus the dictionary was built from.
        topic_words = []
        for topic_id in range(num_topics_found):
            in_vocab = [
                word for word, _score in (topic_model.get_topic(topic_id) or [])
                if word in dictionary.token2id
            ]
            if in_vocab:
                topic_words.append(in_vocab)

        coherence_score = 0.0
        if topic_words:
            coherence_model = CoherenceModel(
                topics=topic_words,
                texts=tokenized_corpus,
                corpus=bow_corpus,
                dictionary=dictionary,
                coherence='c_v'
            )
            coherence_score = coherence_model.get_coherence()

        print("Résultats qualitatifs (Top 5 des thèmes) :")
        for topic_id in range(min(5, num_topics_found)):
            keywords = topic_model.get_topic(topic_id)
            if keywords:
                print(f"  Thème {topic_id}: {[word for word, score in keywords]}")

        print(f"Résultats : {num_topics_found} thèmes, {noise_percentage:.2f}% de bruit, Cohérence C_v: {coherence_score:.4f}, Temps: {duration:.2f}s")

        benchmark_results.append({
            "model": model,
            "min_topic_size": min_size,
            "Temps (s)": round(duration, 2),
            "Nb Thèmes": num_topics_found,
            "% Bruit": round(noise_percentage, 2),
            "Cohérence (C_v)": round(coherence_score, 4)
        })

# --- 4. Results analysis ---
print(f"\n\n{'='*80}")
print("SYNTHÈSE FINALE DU BENCHMARK")
print(f"{'='*80}")

# Sorted by model then granularity for readability
summary_df = pd.DataFrame(benchmark_results).sort_values(by=["model", "min_topic_size"])
print(summary_df.to_string(index=False))

print("\n--- FIN DU BENCHMARK ---")


Chargement du modèle Spacy pour la lemmatisation...
Application de la lemmatisation sur le corpus...

--- DÉBUT DU BENCHMARK ---

ÉVALUATION DU MODÈLE : Qwen/Qwen3-Embedding-0.6B

--- Test avec min_topic_size = 15 ---
Résultats qualitatifs (Top 5 des thèmes) :
  Thème 0: ['agent', 'ai agent', 'agent ai', 'multi agent', 'ai', 'multi', 'autonomous', 'framework', 'agentic', 'autonomous agent']
  Thème 1: ['chatgpt', 'gpt', 'gpt gpt', 'openai', 'chat', 'chatbot', 'api', 'chatgpt chatgpt', 'free', 'bing']
  Thème 2: ['mcp', 'mcp server', 'server', 'mcp mcp', 'protocol', 'context protocol', 'context', 'protocol mcp', 'mcp client', 'claude']
  Thème 3: ['inference', 'llama', 'llm', 'serve', 'lora', 'llm inference', 'fine', 'model', 'finetune', 'language model']
  Thème 4: ['code', 'copilot', 'vscode', 'plugin', 'github', 'completion', 'developer', 'review', 'extension', 'ide']
Résultats : 88 thèmes, 35.28% de bruit, Cohérence C_v: 0.7023, Temps: 102.91s

--- Test avec min_topic_size = 12 ---
