In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import mlflow
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import gc
from collections import Counter

def load_and_prepare_data(csv_path='data/Prepared_Data_with_Embeddings.csv', min_occurrences=100):
    """Load the prepared dataset and keep only sufficiently frequent tags.

    The CSV must contain a ``Title`` column and a ``Tags`` column whose cells
    are Python-literal lists of tag strings (e.g. ``"['python', 'pandas']"``).

    Args:
        csv_path: Path of the CSV file to read.
        min_occurrences: Minimum number of rows a tag must appear in to be kept.

    Returns:
        A shuffled DataFrame (fixed seed) without the ``Tags`` column and with
        a ``Filtered_Tags`` column holding, per row, the tags that reached
        ``min_occurrences``. Rows left without any tag are removed.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        ValueError, SyntaxError: If a ``Tags`` cell is not a valid literal.
    """
    import ast  # local import: only needed to parse the serialized tag lists

    # Drop incomplete rows BEFORE any conversion: astype(str) would turn a
    # missing title into the literal string "nan" (hiding it from dropna), and
    # parsing a NaN tag cell raises.
    df = pd.read_csv(csv_path).sample(frac=1, random_state=1).dropna()

    # literal_eval only accepts Python literals, so a crafted CSV cell cannot
    # execute arbitrary code the way eval() would.
    df = df.assign(
        Title=df['Title'].astype(str),
        Tags=df['Tags'].apply(ast.literal_eval),
    )

    tag_counts = Counter(tag for tags in df['Tags'] for tag in tags)
    filtered_tags = {tag for tag, count in tag_counts.items() if count >= min_occurrences}

    df = df.assign(
        Filtered_Tags=df['Tags'].apply(lambda tags: [tag for tag in tags if tag in filtered_tags])
    )
    # Keep only rows that still carry at least one tag, then drop the raw column.
    df = df[df['Filtered_Tags'].apply(len) > 0].drop(columns='Tags')
    gc.collect()

    return df

def create_use_embeddings(df):
    """Encode every title of ``df`` with the Universal Sentence Encoder (v4).

    The encoder is loaded from TF-Hub and run on the first GPU when TensorFlow
    reports one, otherwise on the CPU. Titles are encoded in batches of 32.

    Args:
        df: DataFrame with a ``Title`` column of strings.

    Returns:
        A NumPy array with one embedding per title, in row order.
    """
    gpu_available = bool(tf.config.list_physical_devices('GPU'))
    target_device = '/GPU:0' if gpu_available else '/CPU:0'

    with tf.device(target_device):
        encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    titles = df['Title'].tolist()
    chunk_size = 32
    vectors = []
    for offset in tqdm(range(0, len(titles), chunk_size), desc="Creating USE embeddings"):
        # List slicing clamps at the end, so the last chunk may be shorter.
        chunk = titles[offset:offset + chunk_size]
        with tf.device(target_device):
            chunk_vectors = encoder(chunk).numpy()
        vectors.extend(chunk_vectors)

    return np.array(vectors)

def resample_data(X_dense, y, min_count=2, max_percentile=90, n_copies=2):
    """Keep and replicate the samples that carry a moderately frequent label.

    A label is "moderately frequent" when its number of occurrences lies in
    ``[min_count, P]``, where ``P`` is the ``max_percentile``-th percentile of
    all label counts (truncated to an int). Every sample having at least one
    such label is emitted ``n_copies`` times in a row; all other samples are
    discarded. The relative order of the kept samples is preserved.

    Args:
        X_dense: Array of features, one row per sample.
        y: Binary indicator array of shape (n_samples, n_labels).
        min_count: Smallest label count considered for resampling.
        max_percentile: Percentile of label counts used as the upper bound.
        n_copies: How many times each selected sample is emitted (>= 1).

    Returns:
        ``(X_resampled, y_resampled)`` with matching row order.

    Raises:
        ValueError: If the row counts of ``X_dense`` and ``y`` differ, if
            ``n_copies`` is smaller than 1, or if no sample is selected.
    """
    if X_dense.shape[0] != y.shape[0]:
        raise ValueError(
            f"X_dense and y must have the same number of rows "
            f"(got {X_dense.shape[0]} and {y.shape[0]})."
        )
    if n_copies < 1:
        raise ValueError("n_copies must be at least 1.")

    y_counts = np.sum(y, axis=0)
    max_count = int(np.percentile(y_counts, max_percentile))
    indices_to_sample = np.where((y_counts >= min_count) & (y_counts <= max_count))[0]

    # One vectorized pass instead of a Python loop over rows: a row is selected
    # when it has at least one of the target labels.
    row_mask = np.any(y[:, indices_to_sample], axis=1)
    if not row_mask.any():
        raise ValueError("Aucune donnée n'a été sélectionnée pour le rééchantillonnage.")

    # np.repeat along axis 0 yields row_a, row_a, row_b, row_b, ... which is the
    # same layout as appending each selected row n_copies times.
    X_resampled = np.repeat(X_dense[row_mask], n_copies, axis=0)
    y_resampled = np.repeat(y[row_mask], n_copies, axis=0)

    return X_resampled, y_resampled

def train_model(X_train, y_train, X_val, y_val, input_dim, output_dim,
                epochs=10, batch_size=32, lr=0.001):
    """Train a one-hidden-layer MLP for multi-label tagging and log it to MLflow.

    The network outputs per-label sigmoid probabilities and is optimized with
    binary cross-entropy using Adam. The average training loss is logged per
    epoch; after training, weighted precision / recall / F1 are computed on the
    validation set with a 0.5 decision threshold, logged, and the model is
    saved to the MLflow run under ``"USE_MLP_model"``.

    Args:
        X_train, y_train: Training features and binary label indicators.
        X_val, y_val: Validation features and binary label indicators.
        input_dim: Number of input features.
        output_dim: Number of labels.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size of the training loader.
        lr: Adam learning rate.

    Returns:
        The trained model, in eval mode, on the device used for training
        (CUDA when available, otherwise CPU).
    """
    class MLPModel(nn.Module):
        """Linear(input_dim, 128) -> ReLU -> Linear(128, output_dim) -> sigmoid."""

        def __init__(self, input_dim, output_dim):
            super().__init__()
            self.fc1 = nn.Linear(input_dim, 128)
            self.fc2 = nn.Linear(128, output_dim)

        def forward(self, x):
            x = torch.relu(self.fc1(x))
            # Probabilities (not logits): required by nn.BCELoss below.
            return torch.sigmoid(self.fc2(x))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MLPModel(input_dim=input_dim, output_dim=output_dim).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # The whole training set is placed on the target device up front.
    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32).to(device),
        torch.tensor(y_train, dtype=torch.float32).to(device),
    )
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    with mlflow.start_run(run_name="USE + MLP"):
        for epoch in range(epochs):
            model.train()
            running_loss = 0.0
            for inputs, labels in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}"):
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

            avg_loss = running_loss / len(train_loader)
            print(f"Average loss for epoch {epoch+1}: {avg_loss}")
            mlflow.log_metric("loss", avg_loss, step=epoch)

        model.eval()
        with torch.no_grad():
            y_pred_tensor = model(torch.tensor(X_val, dtype=torch.float32).to(device))
            y_pred_bin = (y_pred_tensor.cpu().numpy() > 0.5).astype(int)

        precision = precision_score(y_val, y_pred_bin, average='weighted', zero_division=0)
        recall = recall_score(y_val, y_pred_bin, average='weighted', zero_division=0)
        f1 = f1_score(y_val, y_pred_bin, average='weighted', zero_division=0)

        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.pytorch.log_model(model, "USE_MLP_model")

    # Hand the trained model back so callers can run inference with it.
    return model

if __name__ == "__main__":
    import os  # local import: only needed to create the output directory

    df = load_and_prepare_data()
    X_dense = create_use_embeddings(df)

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df['Filtered_Tags'])
    # np.save does not create missing parent directories.
    os.makedirs("models", exist_ok=True)
    np.save("models/mlb_classes.npy", mlb.classes_)

    # Split BEFORE resampling. resample_data emits every selected row more than
    # once; resampling first would place exact copies of training rows in the
    # validation set and inflate the reported metrics. The validation split is
    # therefore left untouched.
    X_train, X_val, y_train, y_val = train_test_split(X_dense, y, test_size=0.2, random_state=42)
    X_train, y_train = resample_data(X_train, y_train)

    train_model(X_train, y_train, X_val, y_val, input_dim=X_train.shape[1], output_dim=y_train.shape[1])


FileNotFoundError: [Errno 2] No such file or directory: 'data/Prepared_Data_with_Embeddings.csv'

In [None]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import mlflow
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import gc
from collections import Counter

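# This cell relies on the functions defined in the training cell above
# (load_and_prepare_data, create_use_embeddings, resample_data, train_model).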

def predict_tags(model, text, use_model, mlb_classes):
    """Score every known tag for a single piece of text.

    Args:
        model: Trained ``torch.nn.Module`` mapping an embedding batch to one
            score per tag.
        text: The text (e.g. a question title) to tag.
        use_model: Callable sentence encoder; ``use_model([text])`` must return
            an object exposing ``.numpy()``.
        mlb_classes: Sequence of tag names, aligned with the model outputs.

    Returns:
        A dict ``{tag: score}`` ordered by decreasing score.

    Raises:
        ValueError: If the model emits a different number of scores than there
            are entries in ``mlb_classes``.
    """
    # Embed the text with the sentence encoder.
    title_embedding = use_model([text]).numpy()

    # Build the input on the same device as the model: the model may live on
    # the GPU, and feeding it a CPU tensor raises a device-mismatch error.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    title_tensor = torch.tensor(title_embedding, dtype=torch.float32, device=device)

    # Run inference in eval mode, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # .cpu() is required before .numpy() when the output is on the GPU.
            y_pred_scores = model(title_tensor).cpu().numpy().flatten()
    finally:
        model.train(was_training)

    if len(y_pred_scores) != len(mlb_classes):
        raise ValueError(
            f"Model produced {len(y_pred_scores)} scores but {len(mlb_classes)} classes were given."
        )

    # Pair each tag with its score.
    predicted_tags_with_scores = {mlb_classes[i]: y_pred_scores[i] for i in range(len(mlb_classes))}

    # Sort by decreasing score for readability.
    predicted_tags_with_scores = dict(
        sorted(predicted_tags_with_scores.items(), key=lambda item: item[1], reverse=True)
    )

    return predicted_tags_with_scores

if __name__ == "__main__":
    import os  # local import: only needed to create the output directory

    # Training pipeline (same steps as the training cell).
    df = load_and_prepare_data()
    X_dense = create_use_embeddings(df)

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df['Filtered_Tags'])
    # np.save does not create missing parent directories.
    os.makedirs("models", exist_ok=True)
    np.save("models/mlb_classes.npy", mlb.classes_)

    # Split BEFORE resampling. resample_data emits every selected row more than
    # once; resampling first would place exact copies of training rows in the
    # validation set and inflate the reported metrics.
    X_train, X_val, y_train, y_val = train_test_split(X_dense, y, test_size=0.2, random_state=42)
    X_train, y_train = resample_data(X_train, y_train)

    model = train_model(X_train, y_train, X_val, y_val, input_dim=X_train.shape[1], output_dim=y_train.shape[1])
    if model is None:
        # Fail with a clear message instead of a TypeError deep inside predict_tags.
        raise RuntimeError("train_model() returned None; a trained model is required for prediction.")

    # Try the prediction on an example sentence.
    example_text = "How to implement a neural network in Python?"

    # Load the USE encoder.
    use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    # Load the MultiLabelBinarizer classes saved above.
    mlb_classes = np.load("models/mlb_classes.npy", allow_pickle=True)

    # Run the prediction.
    predicted_tags = predict_tags(model, example_text, use_model, mlb_classes)

    print("Predicted tags with scores:")
    for tag, score in predicted_tags.items():
        print(f"{tag}: {score:.4f}")
