In [7]:
import os

import pandas as pd
import torch
from google.colab import drive
from sentence_transformers import SentenceTransformer


# Mount Google Drive under /content/drive so the project files are reachable.
drive.mount('/content/drive')

# Switch the working directory to the SteamAnalyse project folder, so that
# relative paths used afterwards resolve inside it.
project_dir = '/content/drive/MyDrive/SteamAnalyse'
os.chdir(project_dir)

# Sanity check: show the current directory and list its contents.
!pwd
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/SteamAnalyse
description.pkl  error_data.jsonl.zip  test.ipynb  valide_data.jsonl.zip


In [9]:
from sentence_transformers import SentenceTransformer

In [18]:
# Load the pickled dataset (path is relative to the current working directory).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
path = "description.pkl"
dfall = pd.read_pickle(path)

# Work on the short-description column alone, held as a one-column DataFrame.
df = dfall['description_courte']
if isinstance(df, pd.Series):
    df = df.to_frame(name='description_courte')

# Preview the first rows.
df.head()

Unnamed: 0,description_courte
0,Shatter is a retro-inspired brick-breaking gam...
1,"""Salomé, Salomé, dance for me. I pray thee dan..."
2,A piece of Steam history - THE FIRST EVER NON ...
3,Harvest: Massive Encounter is an award-winning...
4,The award-winning Shatter Official Videogame S...


In [19]:
# --- A. Load the model ---
print("Chargement du modèle...")
model = SentenceTransformer('intfloat/e5-base-v2')

# Prefer the GPU when the runtime has one; otherwise fall back to the CPU
# instead of crashing on a hard-coded 'cuda' device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Every text is prefixed with "passage: " before encoding. Missing
# descriptions are replaced by an empty string first: "passage: " + NaN stays
# NaN, and a non-string entry cannot be encoded.
df['text_prepared'] = "passage: " + df['description_courte'].fillna('')

# --- B. Encode the texts into embeddings ---
# The status message names the device that is actually used.
print(f"Encodage en cours sur le {'GPU' if device == 'cuda' else 'CPU'}...")
embeddings = model.encode(
    df['text_prepared'].tolist(),  # the column is handed over as a plain list
    batch_size=32,                 # 32 texts per batch
    device=device,
    show_progress_bar=True,
    normalize_embeddings=True,     # original author's note: important for clustering
)

Chargement du modèle...
Encodage en cours sur le GPU...


Batches:   0%|          | 0/2537 [00:00<?, ?it/s]

In [24]:
# `embeddings` comes from the encoding cell; per the original author's note it
# is a NumPy matrix of shape (number_of_rows, 768).

# --- C. Attach the embeddings to the DataFrame as features ---
# One column per embedding dimension (emb_0, emb_1, ...), indexed like `df`
# so the rows line up with the source data.
emb_columns = [f'emb_{k}' for k in range(embeddings.shape[1])]
embedding_df = pd.DataFrame(embeddings, index=df.index, columns=emb_columns)

# Place the original columns and the embedding columns side by side, then
# drop the raw description text so only the features remain next to the
# other original columns.
df_final = pd.concat([dfall, embedding_df], axis=1).drop(
    columns=['description_courte']
)

print(f"Taille du DataFrame final : {df_final.shape}")
print(df_final.head())

Taille du DataFrame final : (81157, 769)
   app_id     emb_0     emb_1     emb_2     emb_3     emb_4     emb_5  \
0   20820 -0.015515 -0.040634 -0.039410 -0.024553  0.060766 -0.016784   
1   27050  0.010040 -0.048632 -0.039975 -0.014079  0.054651 -0.001169   
2    1002 -0.059601 -0.053385 -0.044010  0.031389  0.063745 -0.049218   
3   15400 -0.021737 -0.057124 -0.024447 -0.019569  0.066276 -0.078666   
4   20828 -0.001495 -0.003197 -0.013747 -0.019420  0.031681 -0.051849   

      emb_6     emb_7     emb_8  ...   emb_758   emb_759   emb_760   emb_761  \
0  0.003307  0.029646 -0.020718  ...  0.001481 -0.006138  0.037936 -0.041931   
1  0.083036  0.043449 -0.053732  ...  0.045610 -0.057862  0.041564 -0.037886   
2 -0.008127  0.061023 -0.029774  ...  0.021553 -0.021234  0.050738 -0.035896   
3  0.018553  0.081685 -0.038769  ... -0.000309 -0.028253  0.047141 -0.054390   
4  0.025695  0.007324 -0.011575  ...  0.000799 -0.005049  0.044561 -0.058593   

    emb_762   emb_763   emb_764   emb_7

In [26]:
# Persist the embedding features in the mounted Drive project folder.
# Save df_final (the frame that carries the emb_* columns): `df` only holds
# the raw and prefixed description text, so pickling it would write an
# "embeddings" file that contains no embeddings.
df_final.to_pickle("/content/drive/MyDrive/SteamAnalyse/embeddings.pkl")