In [7]:
import pandas as pd
import numpy as np
import networkx as nx
import umap
import random
from sklearn.model_selection import train_test_split

In [39]:
df_emb=pd.read_csv('embedings.csv')
df_edges=pd.read_csv('graphe_complet_final.csv')
df_nodes=pd.read_csv('Nodes.csv')

In [43]:
G = nx.from_pandas_edgelist(
    df_edges,
    source="user_id",
    target="original_author"
)
labeled=list(df_nodes['Id'])
unlabeled=set(G.nodes)-set(labeled)
len(unlabeled)

143684

In [46]:

# On redéfinit tous les noms de colonnes : la 1ère est 'Id', les autres sont des numéros
df_emb.columns = ['Id'] + [str(i) for i in range(len(df_emb.columns) - 1)]

# On vérifie le résultat
df_emb.head()

Unnamed: 0,Id,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
0,2986,-0.796943,-0.366361,1.193189,-0.407961,0.361447,-1.005791,-0.391032,-0.29459,0.322338,...,0.24315,0.045082,-0.9909,0.335912,-0.821075,0.59485,-0.106388,-0.008981,0.512644,0.407136
1,963157886,-0.727243,-0.415978,1.02628,-0.539274,0.148927,-0.833597,-0.34968,-0.424079,0.198511,...,0.088982,0.284921,-1.145151,-0.092441,-0.616216,0.652329,-0.331621,-0.131574,0.29591,0.472644
2,3634,0.332749,0.082989,2.345172,-0.429265,0.424216,0.028194,-0.622177,-0.591872,0.214264,...,1.062456,-0.121421,-0.920816,0.374637,-1.346095,-0.274321,0.19882,-0.515055,-0.522131,-0.418714
3,366208117,0.126969,0.073324,2.165089,-0.116837,0.148732,-0.208062,-0.799889,-0.523327,0.08223,...,1.274916,0.152203,-0.758296,0.515065,-1.673265,-0.130704,0.319054,-0.451017,-0.897874,-0.630579
4,13621,-0.274961,-0.265757,0.863488,0.505986,-0.232664,-0.539835,-0.996888,-1.416056,-0.316091,...,0.293771,1.588904,0.139663,-0.403322,-1.080889,-0.856452,-1.244141,-0.295905,-1.585301,-1.470422


In [30]:
opinion_dict = (
    df_nodes
    .set_index("Id")["modularity_class"]
    .map({4: 1, 0: 0})
    .fillna(-1)
    .to_dict()
)

In [36]:
nodes_labeled = [node for node in G.nodes() if opinion_dict.get(node) != -1]
nodes_unlabeled = [node for node in G.nodes() if opinion_dict.get(node) == -1]

print(len(nodes_labeled))

144222


In [34]:
len(df_nodes)

612

In [48]:
df_emb_indexed = df_emb.set_index('Id')
nodes_to_use = [node for node in G.nodes() if node in df_emb_indexed.index]
X_all = df_emb_indexed.loc[nodes_to_use].values
y_all = np.array([opinion_dict.get(node, -1) for node in nodes_to_use])

In [52]:
print(f"Analyse lancée sur {len(X_all)} individus.")
print(f"Nombre de labels connus (0 ou 1) : {np.sum(y_all != -1)}")

Analyse lancée sur 144222 individus.
Nombre de labels connus (0 ou 1) : 538


In [54]:
X_train, X_test, y_train, y_test = train_test_split(
    X_all, 
    y_all, 
    test_size=0.20, 

)

In [56]:
reducer = umap.UMAP(
    n_neighbors=15, 
    min_dist=0.1, 
    n_components=2, 

)

In [57]:
X_train_umap = reducer.fit_transform(X_train, y=y_train)

In [58]:
X_test_2D = reducer.transform(X_test)

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [62]:
mask_train = (y_train != -1)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_umap[mask_train], y_train[mask_train])


X_test_umap = reducer.transform(X_test)


y_pred_test = knn.predict(X_test_umap)


mask_test_known = (y_test != -1)

if np.any(mask_test_known):
    acc = accuracy_score(y_test[mask_test_known], y_pred_test[mask_test_known])
    print(f"\n✅ Fiabilité du modèle : {acc:.2%}")
    print("\nRapport détaillé sur les labels connus :")
    print(classification_report(y_test[mask_test_known], y_pred_test[mask_test_known]))
else:
    print("\n⚠️ Aucun label connu dans le groupe de test pour valider la précision.")


✅ Fiabilité du modèle : 99.12%

Rapport détaillé sur les labels connus :
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        48
           1       1.00      0.98      0.99        66

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114



In [63]:
predictions_test = knn.predict(X_test_2D)

# --- ÉTAPE C : Vérification ---
# On compare les prédictions avec la réalité (uniquement pour les connus)
mask_connus = (y_test != -1)
score = accuracy_score(y_test[mask_connus], predictions_test[mask_connus])

print(f"Le KNN a deviné juste à {score:.2%} sur les 20% de test.")

Le KNN a deviné juste à 99.12% sur les 20% de test.
