In [3]:
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

print("="*80)
print("LINK PREDICTION - RANDOM FOREST")
print("="*80)

# ===== STAP 1: DATA LADEN =====
print("\n1. Loading data...")

# Graph
pathedgelist = "./../assignment2_files_2025/edges_train.edgelist"
G = nx.read_edgelist(pathedgelist, delimiter=',', nodetype=int, create_using=nx.Graph())
print(f"   ✓ Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

# Attributes
pathattributes = "./../assignment2_files_2025/attributes.csv"
attributes_df = pd.read_csv(pathattributes)
node_id_col = attributes_df.columns[0]
attribute_cols = [col for col in attributes_df.columns if col != node_id_col]

# Encode categorisch naar numeriek
print(attributes_df)
for col in attribute_cols:
    if attributes_df[col].dtype == 'object':
        le = LabelEncoder()
        attributes_df[col] = le.fit_transform(attributes_df[col].astype(str))
        print(f"   ✓ Encoded '{col}': {list(le.classes_)}")
print(attributes_df)

LINK PREDICTION - RANDOM FOREST

1. Loading data...
   ✓ Graph: 1500 nodes, 6600 edges
        ID attribute
0        0         l
1        1         x
2        2         x
3        3         x
4        4         x
...    ...       ...
1495  1495         l
1496  1496         l
1497  1497         l
1498  1498         f
1499  1499         l

[1500 rows x 2 columns]
   ✓ Encoded 'attribute': ['d', 'f', 'l', 'm', 'x', 'y']
        ID  attribute
0        0          2
1        1          4
2        2          4
3        3          4
4        4          4
...    ...        ...
1495  1495          2
1496  1496          2
1497  1497          2
1498  1498          1
1499  1499          2

[1500 rows x 2 columns]


In [6]:
attributes_dict = attributes_df.set_index(node_id_col).to_dict('index')
print(attributes_dict)

{0: {'attribute': 2}, 1: {'attribute': 4}, 2: {'attribute': 4}, 3: {'attribute': 4}, 4: {'attribute': 4}, 5: {'attribute': 4}, 6: {'attribute': 4}, 7: {'attribute': 4}, 8: {'attribute': 4}, 9: {'attribute': 4}, 10: {'attribute': 4}, 11: {'attribute': 4}, 12: {'attribute': 4}, 13: {'attribute': 4}, 14: {'attribute': 4}, 15: {'attribute': 4}, 16: {'attribute': 4}, 17: {'attribute': 4}, 18: {'attribute': 4}, 19: {'attribute': 4}, 20: {'attribute': 0}, 21: {'attribute': 4}, 22: {'attribute': 4}, 23: {'attribute': 4}, 24: {'attribute': 4}, 25: {'attribute': 4}, 26: {'attribute': 4}, 27: {'attribute': 4}, 28: {'attribute': 4}, 29: {'attribute': 4}, 30: {'attribute': 4}, 31: {'attribute': 4}, 32: {'attribute': 4}, 33: {'attribute': 4}, 34: {'attribute': 4}, 35: {'attribute': 3}, 36: {'attribute': 4}, 37: {'attribute': 4}, 38: {'attribute': 5}, 39: {'attribute': 4}, 40: {'attribute': 4}, 41: {'attribute': 4}, 42: {'attribute': 4}, 43: {'attribute': 4}, 44: {'attribute': 1}, 45: {'attribute': 4

In [None]:
# Test data
pathsolution = "./../assignment2_files_2025/solutionInput.csv"
solution_input = pd.read_csv(pathsolution, sep=',', index_col='ID')
print(f"   ✓ Test set: {len(solution_input)} pairs")

# ===== STAP 2: PRE-COMPUTE METRICS =====
print("\n2. Pre-computing graph metrics...")

N = G.number_of_nodes()

# Preferential Attachment
pa = np.zeros((N, N))
for u, v, p in nx.preferential_attachment(G, [(i, j) for i in range(N) for j in range(N)]):
    pa[u, v] = p

# PageRank
pagerank = nx.pagerank(G)

# Clustering
clustering = nx.clustering(G)

print("   ✓ Metrics computed")

# ===== STAP 3: FEATURE ENGINEERING =====
print("\n3. Feature engineering...")

def getFeature(G, i, j):
    """
    Features voor node pair (i, j)
    Totaal: 4 attribute + 20 graph features = 24 features
    """
    features = []
    
    # --- Attribute features (4) ---
    attrs_i = attributes_dict.get(i, {})
    attrs_j = attributes_dict.get(j, {})
    
    for col in attribute_cols:
        val_i = attrs_i.get(col, 0)
        val_j = attrs_j.get(col, 0)
        features.append(val_i)
        features.append(val_j)
        features.append(int(val_i != val_j))        # verschil
        features.append(int(val_i == val_j))       # zelfde waarde?
    
    # --- Basic node features (6) ---
    deg_i = G.degree(i)
    deg_j = G.degree(j)
    cc_i = clustering.get(i, 0)
    cc_j = clustering.get(j, 0)
    pr_i = pagerank.get(i, 0)
    pr_j = pagerank.get(j, 0)
    
    # --- Neighborhood features (7) ---
    common = list(nx.common_neighbors(G, i, j))
    cn_ij = len(common)
    
    neigh_i = set(G.neighbors(i))
    neigh_j = set(G.neighbors(j))
    union_sz = len(neigh_i | neigh_j)
    
    # Jaccard
    jc_ij = (cn_ij / union_sz) if union_sz > 0 else 0.0
    
    # Adamic-Adar
    aa_ij = sum(1.0 / np.log(G.degree(z)) for z in common if G.degree(z) > 1)
    
    # Resource Allocation
    ra_ij = sum(1.0 / G.degree(z) for z in common if G.degree(z) > 0)
    
    # Preferential Attachment
    pa_ij = pa[i, j]
    
    # Salton
    salton = cn_ij / np.sqrt(deg_i * deg_j) if (deg_i * deg_j) > 0 else 0
    
    # Sorensen
    sorensen = (2 * cn_ij) / (deg_i + deg_j) if (deg_i + deg_j) > 0 else 0
    
    # --- Distance feature (1) ---
    # Aantal gemeenschappelijke neighbors op distance 2
    dist2_count = nx.single_source_shortest_path_length(..., cutoff=2)
    
    # --- Derived features (6) ---
    deg_sum = deg_i + deg_j
    deg_diff = abs(deg_i - deg_j)
    deg_product = deg_i * deg_j
    deg_ratio = min(deg_i, deg_j) / max(deg_i, deg_j) if max(deg_i, deg_j) > 0 else 0
    cc_avg = (cc_i + cc_j) / 2
    pr_avg = (pr_i + pr_j) / 2
    
    # Combine all features
    features.extend([
        deg_i, deg_j, cc_i, cc_j, pr_i, pr_j,
        cn_ij, aa_ij, jc_ij, ra_ij, pa_ij, salton, sorensen,
        dist2_count,
        deg_sum, deg_diff, deg_product, deg_ratio, cc_avg, pr_avg
    ])
    
    return np.array(features, dtype=float)

print(f"   ✓ Features: {len(attribute_cols) * 4} attribute + 20 graph = {len(attribute_cols) * 4 + 20} total")

# ===== STAP 4: CREATE DATASET =====
print("\n4. Creating dataset...")

X, Y = [], []

# Positive samples (existing edges)
for i, j in G.edges():
    X.append(getFeature(G, i, j))
    Y.append(1)

# Negative samples (random non-edges, 1:1 ratio)
pos_count = len(G.edges())
rng = np.random.default_rng(42)

neg_added = 0
while neg_added < pos_count:
    i = int(rng.integers(0, N))
    j = int(rng.integers(0, N))
    if not G.has_edge(i, j) and i != j:
        X.append(getFeature(G, i, j))
        Y.append(0)
        neg_added += 1

print(f"   ✓ Dataset: {len(X)} samples ({pos_count} positive, {neg_added} negative)")

# ===== STAP 5: TRAIN MODEL MET 5-FOLD CROSS-VALIDATION =====
print("\n5. Training Random Forest met 5-Fold Cross-Validation...")

# Model configuratie
clf = RandomForestClassifier(
    n_estimators=200,       # Aantal trees
    max_depth=10,           # Minder diep (was 12)
    min_samples_split=12,   # Meer samples nodig (was 8)
    min_samples_leaf=5,     # Grotere leafs (was 4)
    max_features='sqrt',    # Features per split
    random_state=42,
    n_jobs=-1              # Gebruik alle CPU cores
)

# 5-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

print("\n   Cross-Validation Results:")
print("   " + "-" * 50)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, Y), 1):
    # Split data
    X_train_fold = [X[i] for i in train_idx]
    y_train_fold = [Y[i] for i in train_idx]
    X_val_fold = [X[i] for i in val_idx]
    y_val_fold = [Y[i] for i in val_idx]
    
    # Train
    clf.fit(X_train_fold, y_train_fold)
    
    # Evaluate
    y_pred_val = clf.predict(X_val_fold)
    fold_acc = accuracy_score(y_val_fold, y_pred_val)
    cv_scores.append(fold_acc)
    
    print(f"   Fold {fold}: {fold_acc:.4f}")

cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)

print("   " + "-" * 50)
print(f"   Mean Accuracy:  {cv_mean:.4f}")
print(f"   Std Dev:        {cv_std:.4f}")
print(f"   95% CI:         [{cv_mean - 1.96*cv_std:.4f}, {cv_mean + 1.96*cv_std:.4f}]")

# Train final model op ALLE data (voor beste predictions)
print("\n   Training final model on all data...")
clf.fit(X, Y)
print("   ✓ Training complete")

# ===== STAP 6: FINAL EVALUATION =====
print("\n6. Final Evaluation...")

# Evalueer op volledige dataset (om feature importance te krijgen)
y_pred_all = clf.predict(X)
train_acc = accuracy_score(Y, y_pred_all)

print(f"\n{'='*80}")
print("RESULTS:")
print(f"{'='*80}")
print(f"Cross-Validation (5-Fold):")
print(f"  Mean Accuracy:   {cv_mean:.4f}")
print(f"  Std Dev:         {cv_std:.4f}")
print(f"  Expected Range:  [{cv_mean - cv_std:.4f}, {cv_mean + cv_std:.4f}]")
print(f"\nFull Training Set:")
print(f"  Accuracy:        {train_acc:.4f}")
print(f"\n💡 Verwachte Kaggle score: ~{cv_mean:.4f} (±{cv_std:.4f})")
print(f"{'='*80}")

# ===== STAP 7: FEATURE IMPORTANCE =====
print("\n7. Feature Importance Analysis...")
print("\nTop 10 Most Important Features:")
print("-" * 50)

feature_names = []
for col in attribute_cols:
    feature_names.extend([f'{col}_i', f'{col}_j', f'{col}_diff', f'{col}_same'])
feature_names.extend([
    'deg_i', 'deg_j', 'cc_i', 'cc_j', 'pr_i', 'pr_j',
    'cn', 'aa', 'jc', 'ra', 'pa', 'salton', 'sorensen',
    'dist2_count',
    'deg_sum', 'deg_diff', 'deg_prod', 'deg_ratio', 'cc_avg', 'pr_avg'
])

importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

for i in range(min(10, len(feature_names))):
    print(f"  {i+1:2d}. {feature_names[indices[i]]:25s} {importances[indices[i]]:.4f}")

# ===== STAP 8: KAGGLE SUBMISSION =====
print(f"\n{'='*80}")
print("8. Generating Kaggle submission...")

# Compute features for test set
test_features = np.array([
    getFeature(G, int(row[0]), int(row[1])) 
    for _, row in solution_input.iterrows()
])

# Predictions
predictions = clf.predict(test_features)

# Save
submission = pd.DataFrame({'ID': solution_input.index, 'prediction': predictions})
submission.to_csv('prediction_RF.csv', index=False)

print(f"   ✓ Saved: prediction_RF.csv")
print(f"   ✓ Predictions: {sum(predictions)} positive, {len(predictions) - sum(predictions)} negative")
print(f"\n{'='*80}")
print(f"✅ DONE! Upload 'prediction_RF.csv' to Kaggle")
print(f"💡 Verwachte Kaggle score: ~{cv_mean:.4f} (±{cv_std:.4f})")
print(f"{'='*80}")

LINK PREDICTION - RANDOM FOREST

1. Loading data...
   ✓ Graph: 1500 nodes, 6600 edges
   ✓ Encoded 'attribute': ['d', 'f', 'l', 'm', 'x', 'y']
   ✓ Test set: 1466 pairs

2. Pre-computing graph metrics...
   ✓ Metrics computed

3. Feature engineering...
   ✓ Features: 4 attribute + 20 graph = 24 total

4. Creating dataset...
   ✓ Dataset: 13200 samples (6600 positive, 6600 negative)

5. Training Random Forest met 5-Fold Cross-Validation...

   Cross-Validation Results:
   --------------------------------------------------
   Fold 1: 0.8625
   Fold 2: 0.8633
   Fold 3: 0.8557
   Fold 4: 0.8644
   Fold 5: 0.8530
   --------------------------------------------------
   Mean Accuracy:  0.8598
   Std Dev:        0.0045
   95% CI:         [0.8509, 0.8687]

   Training final model on all data...
   ✓ Training complete

6. Final Evaluation...

RESULTS:
Cross-Validation (5-Fold):
  Mean Accuracy:   0.8598
  Std Dev:         0.0045
  Expected Range:  [0.8552, 0.8643]

Full Training Set:
  Accura

  getFeature(G, int(row[0]), int(row[1]))


   ✓ Saved: prediction_RF.csv
   ✓ Predictions: 609 positive, 857 negative

✅ DONE! Upload 'prediction_RF.csv' to Kaggle
💡 Verwachte Kaggle score: ~0.8598 (±0.0045)
