# Système de Recommandation GitHub - NMF Optimisé

Ce notebook implémente un système de recommandation basé sur la **Factorisation de Matrice Non-négative (NMF)**.
Les hyperparamètres utilisés ici ont été optimisés via **Optuna** pour minimiser l'erreur RMSE.

In [1]:
# 1. Installation des dépendances si nécessaire
%pip install scikit-surprise pymongo seaborn matplotlib requests mlflow

Note: you may need to restart the kernel to use updated packages.


In [2]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from pymongo import MongoClient
from sklearn.metrics import roc_curve, classification_report, confusion_matrix, mean_squared_error
from surprise import Dataset, Reader, NMF
from surprise import dump
import os
import mlflow
from collections import defaultdict
import mlflow.sklearn

# Reproducibility: seed both the stdlib and the NumPy random generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Best hyperparameters found by Optuna (trial 8); unpacked into NMF(**BEST_PARAMS).
BEST_PARAMS = {
    'n_factors': 132,
    'n_epochs': 76,
    'reg_pu': 0.03034349689970731,
    'reg_qi': 0.12323194796302143,
    'biased': False,
    'random_state': SEED,
}

# SECURITY: a connection string with a clear-text password must not live in a
# notebook. Set the MONGO_URI environment variable to override it. The literal
# below is kept only so existing runs keep working: rotate this password and
# delete the fallback.
MONGO_URI = os.environ.get(
    "MONGO_URI",
    "mongodb+srv://toto_db_user:Peche4412@bigdata.0wqiq5r.mongodb.net/?retryWrites=true&w=majority",
)

  from .autonotebook import tqdm as notebook_tqdm


## 2. Chargement des données
Récupération des interactions 'stars' depuis la base de données MongoDB.

In [3]:
def load_data_from_mongo(uri, limit=250000):
    """Fetch star interactions from MongoDB as implicit-feedback records.

    Reads the ``stars`` collection of the ``githubAPI`` database.

    Args:
        uri: MongoDB connection string.
        limit: Maximum number of documents read from the collection.

    Returns:
        A list of dicts with keys ``"user"``, ``"item"`` and ``"rating"``
        (always ``1.0``). Documents whose ``user_login`` or
        ``repo_full_name`` is missing or empty are skipped.
    """
    print("Connexion √† MongoDB...")
    # NOTE(review): the 250000 ms server-selection timeout equals the default
    # row limit; confirm that waiting ~4 minutes is intended.
    # The context manager closes the client even when the query raises.
    with MongoClient(uri, serverSelectionTimeoutMS=250000) as client:
        collection = client["githubAPI"]["stars"]

        # Project only the two fields we need to keep memory usage down.
        cursor = collection.find(
            {}, {'user_login': 1, 'repo_full_name': 1, '_id': 0}
        ).limit(limit)

        data = []
        for doc in cursor:
            user = doc.get('user_login')
            repo = doc.get('repo_full_name')
            if user and repo:
                data.append({"user": user, "item": repo, "rating": 1.0})

    print(f"‚úÖ {len(data)} enregistrements r√©cup√©r√©s.")
    return data

# Load the positive (user, repo) star interactions using the default row limit.
raw_data = load_data_from_mongo(MONGO_URI)

Connexion √† MongoDB...
‚úÖ 250000 enregistrements r√©cup√©r√©s.


## 3. Préparation du Dataset
Création des ensembles d'entraînement et de test, et génération d'échantillons négatifs.

In [4]:
# Shuffle in place, then hold out the last 20% of positive interactions.
random.shuffle(raw_data)
split_idx = int(len(raw_data) * 0.8)
train_pos = raw_data[:split_idx]
test_pos = raw_data[split_idx:]

df_train_pos = pd.DataFrame(train_pos)
all_items = set(df_train_pos['item'].unique())
# Sorted once so every candidate list has a stable order. Iterating a set of
# strings changes between interpreter runs (hash randomization), which made
# random.sample() pick different negatives despite the fixed seed.
item_pool = sorted(all_items)

print("G√©n√©ration des exemples n√©gatifs (√©chantillonnage)...")
train_negatives = []
users_in_train = df_train_pos['user'].unique()

# A single groupby pass replaces filtering the whole frame once per user.
# sort=False keeps users in order of first appearance, as unique() does.
for u, user_items in df_train_pos.groupby('user', sort=False)['item']:
    seen = set(user_items)
    candidates = [item for item in item_pool if item not in seen]
    # Draw 5 negatives per positive, capped by the number of candidates.
    num_to_take = len(seen) * 5
    if candidates:
        negs = random.sample(candidates, min(len(candidates), num_to_take))
        train_negatives.extend(
            {"user": u, "item": item, "rating": 0.0} for item in negs
        )

# ignore_index avoids duplicated row labels in the combined frame.
df_train_final = pd.concat(
    [df_train_pos, pd.DataFrame(train_negatives)], ignore_index=True
)

# Surprise-compatible format: ratings are 0 (negative) or 1 (starred).
reader = Reader(rating_scale=(0, 1))
train_data_surprise = Dataset.load_from_df(df_train_final[['user', 'item', 'rating']], reader)
trainset = train_data_surprise.build_full_trainset()

G√©n√©ration des exemples n√©gatifs (√©chantillonnage)...


## 4. Entraînement avec Paramètres Optimisés
Application du modèle NMF avec les résultats issus d'Optuna.

In [None]:
# Clean-up and tracking-path configuration.
# shutil is not imported by the notebook's import cell; without this import the
# clean-up below raises NameError whenever an "mlruns" directory exists.
import shutil

# Drop the local file-based MLflow store left by previous runs, if any.
if os.path.exists("mlruns"):
    shutil.rmtree("mlruns")

# Runs are tracked in a SQLite database in the current working directory
# (os.getcwd() already returns an absolute path).
base_dir = os.getcwd()
tracking_db = os.path.join(base_dir, "mlflow_recsys.db")
tracking_uri = f"sqlite:///{tracking_db}"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("GitHub_Recommender_Final")

# Fonctions de métriques métier
def precision_recall_at_k(predictions, k=10, threshold=0.16):
    """Return precision@k and recall@k averaged over users.

    Args:
        predictions: Iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        k: Number of top-scored items considered for each user.
        threshold: Minimum estimated score for an item to count as
            recommended.

    Returns:
        A ``(precision, recall)`` pair averaged over all users. An item is
        relevant when its true rating is ``>= 1.0``. A user with no
        recommended item in the top k contributes 0 to precision; a user with
        no relevant item contributes 0 to recall. Returns ``(0.0, 0.0)`` when
        ``predictions`` is empty.
    """
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    if not user_est_true:
        # np.mean([]) would return nan and emit a RuntimeWarning.
        return 0.0, 0.0

    precisions, recalls = [], []
    for user_ratings in user_est_true.values():
        # Highest estimated score first; the stable sort keeps ties in input order.
        user_ratings.sort(key=lambda pair: pair[0], reverse=True)
        top_k = user_ratings[:k]

        n_rel = sum(true_r >= 1.0 for _, true_r in user_ratings)
        n_rec_k = sum(est >= threshold for est, _ in top_k)
        n_rel_and_rec_k = sum(
            (true_r >= 1.0) and (est >= threshold) for est, true_r in top_k
        )

        precisions.append(n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0)
        recalls.append(n_rel_and_rec_k / n_rel if n_rel != 0 else 0)

    return np.mean(precisions), np.mean(recalls)

# Test-set preparation (kept in this cell so the names exist before training).
print("üß™ Pr√©paration du jeu de test (√©chantillonnage n√©gatif)...")
all_data_df = pd.DataFrame(raw_data)
test_users = {x['user'] for x in test_pos}
test_negatives = []

# Everything each user has starred (train and test), built in one groupby pass
# instead of filtering the whole frame once per user.
seen_by_user = all_data_df.groupby('user')['item'].apply(set)
# A sorted pool and sorted users make the draws reproducible under the seed:
# iterating a set of strings changes between interpreter runs.
item_pool = sorted(all_items)

for u in sorted(test_users):
    # Never offer as a "negative" something the user has already starred.
    seen = seen_by_user[u]
    candidates = [item for item in item_pool if item not in seen]
    if candidates:
        # Up to 50 negative examples per test user.
        negs = random.sample(candidates, min(len(candidates), 50))
        test_negatives.extend(
            {'user': u, 'item': item, 'rating': 0.0} for item in negs
        )

# Final (user, item, rating) list in the format expected by Surprise.
test_set_surprise = [(x['user'], x['item'], x['rating']) for x in test_pos] + \
                    [(x['user'], x['item'], x['rating']) for x in test_negatives]

# Train the model inside an MLflow run so parameters, metrics and the
# serialized model are all recorded together.
with mlflow.start_run(run_name="NMF_Optimized_Final"):
    mlflow.log_params(BEST_PARAMS)
    mlflow.log_param("dataset_limit", 250000)

    algo = NMF(**BEST_PARAMS)
    print("üöÄ Entra√Ænement du mod√®le NMF...")
    algo.fit(trainset)

    # Score the held-out positives and the sampled negatives.
    predictions = algo.test(test_set_surprise)

    true_ratings = [p.r_ui for p in predictions]
    estimated_ratings = [p.est for p in predictions]
    rmse_val = np.sqrt(mean_squared_error(true_ratings, estimated_ratings))
    prec, rec = precision_recall_at_k(predictions, k=10)

    for metric_name, metric_value in (
        ("rmse", rmse_val),
        ("precision_at_10", prec),
        ("recall_at_10", rec),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Serialize the fitted model locally, then attach the file to the run.
    model_filename = "github_nmf_model.pkl"
    dump.dump(model_filename, algo=algo)
    mlflow.log_artifact(model_filename)

    print(f"‚úÖ Termin√© ! RMSE: {rmse_val:.4f} | Prec@10: {prec:.4f}")

# Tell the user how to open the MLflow UI against the tracking database.
print("\nüëâ Pour voir les r√©sultats :")
print(f"mlflow ui --backend-store-uri {tracking_uri} --port 5001")

üß™ Pr√©paration du jeu de test (√©chantillonnage n√©gatif)...
üöÄ Entra√Ænement du mod√®le NMF...
‚úÖ Termin√© ! RMSE: 0.3522 | Prec@10: 0.5068

üëâ Pour voir les r√©sultats :
mlflow ui --backend-store-uri sqlite:////home/lumen/Documents/Universit√©/M2/mlflow_recsys.db --port 5001


In [None]:
### RELOAD A MODEL

from surprise import dump

# Load the serialized model from disk.
# NOTE(review): this filename differs from "github_nmf_model.pkl" written by
# the training cell, and the later cells visible in this notebook use `algo`,
# not `loaded_algo` - confirm which model is meant to be evaluated.
# NOTE(review): surprise.dump presumably relies on pickle; only load files
# from a trusted source.
file_name = "github_nmf_model_150000.pkl"
_, loaded_algo = dump.load(file_name)

print("‚úÖ Mod√®le recharg√© et pr√™t pour les pr√©dictions.")

## 5. Évaluation des Performances
Calcul du seuil optimal et affichage des métriques de classification.

In [None]:
print("Pr√©paration du test set...")
all_data_df = pd.DataFrame(raw_data)
test_users = {x['user'] for x in test_pos}
test_negatives = []

# Everything each user has starred (train and test), built in one groupby pass
# instead of filtering the whole frame once per user.
seen_by_user = all_data_df.groupby('user')['item'].apply(set)
# A sorted pool and sorted users make the draws reproducible under the seed:
# iterating a set of strings changes between interpreter runs.
item_pool = sorted(all_items)

for u in sorted(test_users):
    seen = seen_by_user[u]
    candidates = [item for item in item_pool if item not in seen]
    if candidates:
        # Up to 50 negative (user, item, 0.0) tuples per test user.
        negs = random.sample(candidates, min(len(candidates), 50))
        test_negatives.extend((u, item, 0.0) for item in negs)

# Held-out positives followed by the sampled negatives.
test_set_surprise = [(x['user'], x['item'], x['rating']) for x in test_pos] + test_negatives

# Score every test pair with the trained model.
predictions = algo.test(test_set_surprise)
y_true = [int(pred.r_ui) for pred in predictions]
y_scores = [pred.est for pred in predictions]

# Pick the ROC threshold that maximizes (true positive rate - false positive rate).
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
best_idx = np.argmax(tpr - fpr)
best_thresh = thresholds[best_idx]

# Binarize the scores at that threshold.
y_pred = [int(score >= best_thresh) for score in y_scores]

print(f"\nüéØ Seuil Optimal : {best_thresh:.4f}")
print(classification_report(y_true, y_pred, target_names=['Non-Star', 'Star']))

# Draw the confusion matrix as an annotated heatmap.
plt.figure(figsize=(6, 5))
heatmap_ax = sns.heatmap(
    confusion_matrix(y_true, y_pred),
    annot=True,
    fmt='d',
    cmap='Oranges',
)
heatmap_ax.set_title("Matrice de Confusion - NMF Optimis√©")
heatmap_ax.set_ylabel("R√©el")
heatmap_ax.set_xlabel("Pr√©dit")
plt.show()

## 6. Fonction de Recommandation Réelle
Prédire des dépôts pour un utilisateur GitHub spécifique.

In [None]:
def get_recommendations(github_username, model, df_ref, token=None):
    """Print the top-10 repository recommendations for a GitHub user.

    The user's public stars are fetched from the GitHub API. The user of
    ``df_ref`` whose starred set has the highest Jaccard similarity with them
    is used as a proxy, and the model scores every repository of ``df_ref``
    that the GitHub user has not starred.

    Only one request is made (``per_page=100``), so at most the first 100
    stars of the user are taken into account.

    Args:
        github_username: GitHub login of the user to recommend for.
        model: Object exposing ``predict(uid=..., iid=...)`` whose result has
            an ``est`` attribute.
        df_ref: DataFrame with ``'user'`` and ``'item'`` columns.
        token: Optional GitHub token, sent in the ``Authorization`` header.

    Returns:
        None. Recommendations and errors are printed.
    """
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    # Fetch the user's real stars from the GitHub API.
    url = f"https://api.github.com/users/{github_username}/starred?per_page=100"
    try:
        # Without a timeout, requests can block forever on a stalled connection.
        resp = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"‚ùå Erreur API : {exc}")
        return

    if resp.status_code != 200:
        print(f"‚ùå Erreur API : {resp.status_code}")
        return

    user_stars = [repo['full_name'] for repo in resp.json()]
    if not user_stars:
        print("L'utilisateur n'a pas de stars publiques.")
        return

    # Find the closest known profile (Jaccard similarity of starred sets).
    user_likes_set = set(user_stars)
    db_users_data = df_ref.groupby('user')['item'].apply(set)

    def jaccard(items):
        # The union is never empty because user_likes_set is non-empty here.
        return len(user_likes_set & items) / len(user_likes_set | items)

    # max() keeps the first user on ties, like a strict ">" comparison loop.
    best_match, _ = max(
        db_users_data.items(),
        key=lambda entry: jaccard(entry[1]),
        default=(None, None),
    )

    # Score the repositories the user has not starred yet.
    all_repos = set(df_ref['item'].unique())
    candidates = list(all_repos - user_likes_set)

    predictions = [
        (repo, model.predict(uid=best_match, iid=repo).est)
        for repo in candidates
    ]
    predictions.sort(key=lambda x: x[1], reverse=True)

    print(f"\nüåü Recommandations pour {github_username} (bas√©es sur {best_match}) :")
    for i, (repo, score) in enumerate(predictions[:10], 1):
        print(f"#{i}: {repo} (Score: {score:.4f})")

# Example run: recommend repositories for one GitHub user, using the trained
# model and the full interaction table as the reference data.
get_recommendations('MathildeBoo', algo, all_data_df)