# In [None]:  (notebook cell prompt; this cell had not been executed successfully)


# ===== Imports =====
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# ===== Load & visualize graph =====
# The training edge list is comma-delimited; node labels are parsed as ints
# and the edges are loaded into an undirected nx.Graph.
path = "./../assignment2_files_2025/edges_train.edgelist"
G = nx.read_edgelist(
    path,
    create_using=nx.Graph(),
    nodetype=int,
    delimiter=',',
)

print(f"Graph loaded with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# Spring layout with a fixed seed so the drawing is reproducible across runs.
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos=pos, with_labels=False, node_size=20)
plt.show()

# ===== (optional) write the edges back out unchanged =====
# One "u,v" row per edge, no header and no index column, written to the
# current working directory (not the directory the input was read from).
edges = [*G.edges()]
pd.DataFrame(edges).to_csv("edges_train.edgelist", header=False, index=False)

# ===== Feature engineering =====

# Dense preferential-attachment lookup table: pa[u, v] = deg(u) * deg(v).
#
# This is the same score nx.preferential_attachment yields for a pair, but
# computed as a single vectorised outer product of the degree vector instead
# of materialising N*N Python tuples and filling the matrix one cell at a time.
#
# The table is indexed by raw node id and covers ids 0..N-1 only. Ids in that
# range that are not nodes of G get degree 0 (the per-pair networkx call would
# raise for them instead).
N = G.number_of_nodes()
_degree_by_node = dict(G.degree())
_degree_vector = np.array(
    [_degree_by_node.get(node_id, 0) for node_id in range(N)],
    dtype=float,
)
pa = np.outer(_degree_vector, _degree_vector)

def getFeature(G, i, j):
    """Build the feature vector describing the candidate link (i, j).

    Args:
        G: networkx graph the features are measured on.
        i: first endpoint (a node of G).
        j: second endpoint (a node of G).

    Returns:
        A float ndarray of length 9, in this order:
        [deg_i, deg_j, cc_i, cc_j, pa_ij, cn_ij, aa_ij, jc_ij, ra_ij]
        i.e. the two degrees, the two clustering coefficients, preferential
        attachment, common-neighbour count, Adamic-Adar, Jaccard and
        resource-allocation scores.
    """
    # Node-level features
    deg_i = G.degree(i)
    deg_j = G.degree(j)
    cc_i = nx.clustering(G, i)
    cc_j = nx.clustering(G, j)

    # Preferential attachment is by definition deg(i) * deg(j). Computing it
    # here (rather than reading a module-level N x N table indexed by node id)
    # keeps the function self-contained and valid for any node label.
    pa_ij = deg_i * deg_j

    # Neighbourhood overlap
    common = list(nx.common_neighbors(G, i, j))
    cn_ij = len(common)

    # Adamic-Adar and Resource Allocation both sum over the common neighbours,
    # so accumulate them in a single pass.
    aa_ij = 0.0
    ra_ij = 0.0
    for z in common:
        dz = G.degree(z)
        if dz > 0:
            ra_ij += 1.0 / dz
        # log(1) == 0 would divide by zero, hence the stricter guard for AA.
        if dz > 1:
            aa_ij += 1.0 / np.log(dz)

    # Jaccard: common neighbours over the size of the neighbourhood union.
    neigh_i = set(G.neighbors(i))
    neigh_j = set(G.neighbors(j))
    union_sz = len(neigh_i | neigh_j)
    jc_ij = (cn_ij / union_sz) if union_sz > 0 else 0.0

    return np.array(
        [deg_i, deg_j, cc_i, cc_j, pa_ij, cn_ij, aa_ij, jc_ij, ra_ij],
        dtype=float,
    )

# ===== Build dataset (positive = existing edge; negative = non-edge) =====
X, Y = [], []

# Positive examples: every edge currently in the graph.
# NOTE(review): features for a positive pair are extracted while the edge
# (i, j) is still present in G, so its degree-based features include that very
# edge, whereas negative pairs (and pairs to be predicted) have no such edge.
# Consider removing the edge before extracting its features -- left unchanged
# here because it alters the trained model.
for (i, j) in G.edges:
    X.append(getFeature(G, i, j))
    Y.append(1)

# Negative examples: as many as positives (BALANCED), sampled uniformly from
# distinct unordered non-adjacent node pairs.
#  - Endpoints are drawn from the actual node list, so ids need not be exactly
#    0..N-1 (sorted, so that for ids 0..N-1 the draws map to the same nodes as
#    sampling the id range directly).
#  - Each unordered pair is used at most once; (i, j) and (j, i) count as the
#    same pair.
#  - The count is capped at the number of non-edges that exist, so the
#    rejection loop always terminates.
node_ids = sorted(G.nodes)
n_nodes = len(node_ids)
n_distinct_edges = sum(1 for u, v in G.edges if u != v)  # ignore self-loops
max_negatives = n_nodes * (n_nodes - 1) // 2 - n_distinct_edges
neg_count = min(len(G.edges), max(max_negatives, 0))

rng = np.random.default_rng(42)
seen_negatives = set()
while len(seen_negatives) < neg_count:
    i = node_ids[int(rng.integers(0, n_nodes))]
    j = node_ids[int(rng.integers(0, n_nodes))]
    pair = (i, j) if i < j else (j, i)
    # Reject self-pairs, existing edges and pairs already sampled.
    if i == j or G.has_edge(i, j) or pair in seen_negatives:
        continue
    seen_negatives.add(pair)
    X.append(getFeature(G, i, j))
    Y.append(0)

# Stratified split (both classes proportionally represented in each part)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

# (optional) check the class distribution
print("Train class balance:", pd.Series(y_train).value_counts(normalize=True).to_dict())
print("Test  class balance:", pd.Series(y_test).value_counts(normalize=True).to_dict())

# ===== Model: SVM with scaling =====
# Features are standardised before the RBF-kernel SVM; probability=True is
# needed so predict_proba is available for the ROC-AUC below.
clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svm", SVC(C=1.0, kernel="rbf", gamma="scale", probability=True, random_state=42)),
])
clf.fit(X_train, y_train)


# ===== Evaluation on hold-out =====
# Second predict_proba column is taken as the score for the positive class
# (label 1).
y_prob_test = clf.predict_proba(X_test)[:, 1]
y_pred_test = clf.predict(X_test)

holdout_accuracy = accuracy_score(y_test, y_pred_test)
holdout_auc = roc_auc_score(y_test, y_prob_test)
print("Accuracy (hold-out):", round(holdout_accuracy, 4))
print("ROC-AUC (hold-out):", round(holdout_auc, 4))
print(classification_report(y_test, y_pred_test, digits=3))

# ===== Kaggle submission (exact: ID,prediction) =====
# Expected: solutionInput.csv with an ID column (used as the index) followed
# by the two endpoint columns of each pair to score.
inpTest = pd.read_csv('solutionInput.csv', sep=',', index_col='ID')

# Features for each (u, v) pair. The first two non-index columns are read by
# position explicitly: `row[0]` on a label-indexed Series relies on the
# deprecated integer-key positional fallback.
inp = np.array([
    getFeature(G, int(u), int(v))
    for u, v in inpTest.iloc[:, :2].itertuples(index=False, name=None)
])

# Hard 0/1 predictions from the SVM pipeline
predictionsSVM = clf.predict(inp)

# Write in the exact Kaggle format: ID,prediction (no extra index column)
sub = pd.DataFrame({'ID': inpTest.index, 'prediction': predictionsSVM})
sub.to_csv('predictionSVM.csv', index=False)

print("✅ Klaar: predictionSVM.csv aangemaakt in Kaggle-formaat (ID,prediction).")


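# Recorded cell output:
#   ModuleNotFoundError: No module named 'sklearn'
# (the scikit-learn package was not installed in the environment that ran this cell)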