In [None]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")

Install necessary libraries

In [None]:
pip install igraph

In [None]:
pip install -U gensim

In [None]:
pip install codecarbon

In [None]:
from google.colab import drive
# Mount Google Drive; every data file below is read from /content/drive/MyDrive/llms (Colab only).
drive.mount('/content/drive')

If you want to track emissions with codecarbon, uncomment the code below

In [None]:
# from codecarbon import EmissionsTracker
# tracker = EmissionsTracker()
# tracker.start()

In [None]:
import re
import nltk
import torch
import random
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import spacy
import igraph as ig
import pickle
from tqdm import tqdm
from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import remove_stopwords
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from multiprocessing import Pool


# Download the NLTK data packages used by the preprocessing cells.
nltk.download('wordnet')    # WordNet corpus (backs WordNetLemmatizer)
nltk.download('omw-1.4')    # Open Multilingual WordNet data
nltk.download('punkt_tab')  # tokenizer tables; presumably required by word_tokenize on recent NLTK versions
nltk.download('punkt')      # Punkt tokenizer models used by word_tokenize
nltk.download('stopwords')  # NLTK stop-word lists

**Loading Data Phase**

Load Every Data That We Have

In [None]:
# Load abstracts: one "<paper_id>|--|<abstract text>" record per line.
abstracts = {}
with open("/content/drive/MyDrive/llms/abstracts.txt", "r", encoding="utf-8") as f:
    content = f.read()
    abstract_entries = content.split("\n")

    for entry in abstract_entries:
        # Skip empty lines and lines without the field separator.
        if entry and "|--|" in entry:
            parts = entry.split("|--|")
            # Only records with exactly one separator are kept.
            if len(parts) == 2:
                paper_id = int(parts[0])
                abstract_text = parts[1]
                abstracts[paper_id] = abstract_text
    print(f"Loaded {len(abstracts)} abstracts.")



# Load citation network: one "<source>,<target>" pair per line.
edges = []
with open("/content/drive/MyDrive/llms/edgelist.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of crashing inside int('').
            continue
        source, target = map(int, line.split(","))
        edges.append((source, target))

# Create a directed graph (an edge source -> target means "source cites target"
# presumably; verify against the dataset description).
G = nx.DiGraph()
G.add_edges_from(edges)
# print(G.number_of_nodes())
# print(G.number_of_edges())


# Load authors: one "<paper_id>|--|<author1>,<author2>,..." record per line.
authors = {}
with open("/content/drive/MyDrive/llms/authors.txt", "r", encoding="utf-8") as f:
    for line in f:
        if "|--|" in line:
            parts = line.strip().split("|--|")
            paper_id = int(parts[0])
            # Author names are kept exactly as written (no whitespace trimming).
            author_list = parts[1].split(",")
            authors[paper_id] = author_list
    print(f"Loaded {len(authors)} authors.")



Test Pairs

In [None]:
# Load the Kaggle test pairs: one "<source>,<target>" candidate edge per line.
test_pairs_kaggle = []
with open("/content/drive/MyDrive/llms/test.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. trailing newline) rather than failing in int('').
            continue
        source, target = map(int, line.split(","))
        test_pairs_kaggle.append((source, target))

Node2Vec Embeddings

In [None]:
# Define the filename where the embeddings were saved
embeddings_filename = "/content/drive/MyDrive/llms/node_embeddings.pkl"

# Load the dictionary from the file
# NOTE(review): pickle.load executes arbitrary code from the file; only load files you created yourself.
with open(embeddings_filename, 'rb') as f:
    node_embeddings = pickle.load(f)

all-MiniLM-L6-v2 Embedding(Sentence Transformer)

In [None]:
# np.load(..., allow_pickle=True).item() unwraps the pickled Python object stored in the .npy file
# (presumably a dict keyed by paper id; verify against the script that produced the file).
# NOTE(review): the all-mpnet-base-v2 cell below assigns the same variable name, so running both
# cells in order leaves only the mpnet embeddings loaded.
sentence_bert_embeddings = np.load('/content/drive/MyDrive/llms/all-MiniLM-L6-v2.npy', allow_pickle=True).item()

all-mpnet-base-v2 Embedding(Sentence Transformer)

In [None]:
# Loads the all-mpnet-base-v2 sentence embeddings into the same variable used by the all-MiniLM cell,
# so whichever of the two cells ran last decides which model's embeddings are used downstream.
sentence_bert_embeddings = np.load('/content/drive/MyDrive/llms/all-mpnet-base-v2.npy', allow_pickle=True).item()

SciBERT Embeddings

In [None]:
# SciBERT embeddings; this is the object passed to get_bert_similarity in generate_training_data.
# allow_pickle=True is required because the file stores a pickled Python object rather than a plain array.
sci_bert = np.load('/content/drive/MyDrive/llms/scibert_embeddings.npy', allow_pickle=True).item()

Specter Embeddings

In [None]:
# Specter embeddings, stored as a pickled Python object inside the .npy file (hence allow_pickle + .item()).
specter_bert = np.load('/content/drive/MyDrive/llms/specter_embeddings.npy', allow_pickle=True).item()

DistilBERT Embeddings

In [None]:
# DistilBERT embeddings, stored as a pickled Python object inside the .npy file (hence allow_pickle + .item()).
distill_bert = np.load('/content/drive/MyDrive/llms/distilbert_embeddings.npy', allow_pickle=True).item()

Here we split our data into train, validation and test sets in order to have a proper split and avoid data leakage

In [None]:
# Fix all random sources used by the split so it is reproducible.
random_state = 42
random.seed(random_state)
np.random.seed(random_state)

# Target share of the positive edges per split: 80% train / 10% validation / 10% test.
# (The first value used to be assigned to `test_ratio` by mistake and was
# immediately overwritten; it is the training share.)
train_ratio = 0.80
val_ratio = 0.10
test_ratio = 0.10

# Get all edges from original graph
all_edges = list(G.edges())
all_nodes = list(G.nodes())

print(f"Original graph - Nodes: {len(all_nodes)}, Edges: {len(all_edges)}")

# First carve out the test edges, leaving train + validation together.
train_val_edges, test_edges = train_test_split(
    all_edges,
    test_size=test_ratio,
    random_state=random_state
)

# The second split operates on the remaining (1 - test_ratio) share, so the
# validation size has to be rescaled to end up at val_ratio of ALL edges.
val_size_relative_to_train_val = val_ratio / (1.0 - test_ratio)

train_edges, val_edges = train_test_split(
    train_val_edges,
    test_size=val_size_relative_to_train_val,
    random_state=random_state
)

print(f"Train edges: {len(train_edges)} ({len(train_edges)/len(all_edges)*100:.1f}%)")
print(f"Val edges: {len(val_edges)} ({len(val_edges)/len(all_edges)*100:.1f}%)")
print(f"Test edges: {len(test_edges)} ({len(test_edges)/len(all_edges)*100:.1f}%)")

# Training graph: contains only the training edges, but every node of the
# original graph, so node-level lookups never fail for val/test endpoints.
G_train = nx.DiGraph() if G.is_directed() else nx.Graph()
G_train.add_edges_from(train_edges)
G_train.add_nodes_from(all_nodes)

print(f"Training graph - Nodes: {G_train.number_of_nodes()}, Edges: {G_train.number_of_edges()}")

Here we generate negative samples for each of the train, validation and test sets.
The method is simple: the function repeatedly picks a random ordered pair of nodes and rejects it if it is an existing edge of the training graph, a positive edge of the original graph, or a pair that has already been generated as a negative. Otherwise the pair is added to the list of negative samples. This continues until the desired number is reached or a maximum number of attempts is exceeded.

In [None]:
def generate_negative_samples(graph, positive_edges_all, existing_negatives, n_samples, seed=42):
    """Draw random ordered node pairs that are not known edges.

    Args:
        graph: graph whose nodes are sampled and whose edges are excluded.
        positive_edges_all: iterable of (u, v) positive edges to exclude.
        existing_negatives: iterable of (u, v) negatives generated earlier,
            excluded so the returned pairs do not overlap with them.
        n_samples: number of negative pairs requested.
        seed: seed of the private random generator used for sampling.

    Returns:
        A list of at most ``n_samples`` distinct (u, v) tuples. Fewer are
        returned (with a printed notice) when the attempt budget of
        ``20 * n_samples`` draws is exhausted or the graph has fewer than
        two nodes.
    """
    print(f"Generating {n_samples} negative samples for directed graph...")
    # Private generator: yields the same draws as random.seed(seed) would, but
    # does not reset the global `random` state as a side effect.
    rng = random.Random(seed)
    nodes = list(graph.nodes())

    # One combined exclusion set gives O(1) lookups even if the caller passed
    # lists instead of sets.
    excluded = set(graph.edges())
    excluded.update(positive_edges_all)
    excluded.update(existing_negatives)

    negative_pairs = set()
    max_attempts = n_samples * 20
    attempts = 0
    # random.sample needs at least two nodes to draw a pair.
    can_sample = len(nodes) >= 2

    with tqdm(total=n_samples) as pbar:
        while can_sample and len(negative_pairs) < n_samples and attempts < max_attempts:
            u, v = rng.sample(nodes, 2)
            edge = (u, v)

            if edge not in excluded and edge not in negative_pairs:
                negative_pairs.add(edge)
                pbar.update(1)
            attempts += 1

    if len(negative_pairs) < n_samples:
        print(f"Only generated {len(negative_pairs)} negative samples out of requested {n_samples}.")

    return list(negative_pairs)

In [None]:
# Negatives are drawn split by split; each call is told about the negatives
# produced so far, which keeps the three negative sets disjoint.
positive_edges_all = set(train_edges) | set(val_edges) | set(test_edges)

# Training negatives: nothing to avoid yet besides the positives.
train_negative = generate_negative_samples(
    graph=G_train,
    positive_edges_all=positive_edges_all,
    existing_negatives=set(),
    n_samples=len(train_edges),
    seed=42,
)

# Validation negatives: must not repeat a training negative.
val_negative = generate_negative_samples(
    graph=G_train,
    positive_edges_all=positive_edges_all,
    existing_negatives=set(train_negative),
    n_samples=len(val_edges),
    seed=43,
)

# Test negatives: must not repeat a training or validation negative.
test_negative = generate_negative_samples(
    graph=G_train,
    positive_edges_all=positive_edges_all,
    existing_negatives=set(train_negative) | set(val_negative),
    n_samples=len(test_edges),
    seed=44,
)

Here we have properly created our data into train, val and test

In [None]:
# For each split: positives first (label 1), then negatives (label 0).
train_pairs = [*train_edges, *train_negative]
train_labels = [1 for _ in train_edges] + [0 for _ in train_negative]

val_pairs = [*val_edges, *val_negative]
val_labels = [1 for _ in val_edges] + [0 for _ in val_negative]

test_pairs = [*test_edges, *test_negative]
test_labels = [1 for _ in test_edges] + [0 for _ in test_negative]

**Preprocessing Phase**

Here we do basic NLP techniques to preprocess our abstracts

In [None]:
# Preprocess abstracts
# Lemmatizer instance kept available for a lemmatization variant; the stemming
# pipeline below does not use it.
lemmatizer = WordNetLemmatizer()

# Built once instead of on every call: preprocess_text runs once per abstract.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
_STEMMER = nltk.stem.PorterStemmer()


def preprocess_text(text):
    """Normalise one abstract for TF-IDF.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, remove stop words (gensim), tokenize (NLTK) and Porter-stem
    each token.

    Args:
        text: raw abstract string.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # Remove special chars & lowercase
    text = _NON_LETTER_RE.sub('', text.lower())

    # Remove stopwords
    text = remove_stopwords(text)

    tokens = word_tokenize(text)

    # Stem each token (stemming is used here instead of lemmatization).
    return ' '.join(_STEMMER.stem(token) for token in tokens)



def preprocess_batch(batch):
    """Preprocess a batch of (paper_id, abstract) pairs.

    Returns a dict mapping each paper_id to its preprocessed abstract; this is
    the unit of work handed to each worker process.
    """
    return {paper_id: preprocess_text(abstract) for paper_id, abstract in batch}



# Preprocess all abstracts using batches and multiprocessing
batch_size = 10000
num_processes = 12

# Materialise the items once; rebuilding list(abstracts.items()) for every
# batch copied the whole dictionary view once per batch.
abstract_items = list(abstracts.items())
batches = [abstract_items[i:i + batch_size] for i in range(0, len(abstract_items), batch_size)]

with Pool(processes=num_processes) as pool:
    # pool.map preserves batch order, so results line up with `batches`.
    results = pool.map(preprocess_batch, batches)


# Combine the results from all batches
preprocessed_abstracts = {}
for batch_result in results:
    preprocessed_abstracts.update(batch_result)

# Print the 5 first rows to see how the Preprocess abstracts are
df = pd.DataFrame(list(preprocessed_abstracts.items()), columns=['paper_id', 'abstract'])

print(df.head())




**Features Engineering**

**TF-IDF**: Tells you how important a single word is in a corpus
by assigning it a weight, and at the same time down-weight common words like “a”, “and”, and “the”

In [None]:
# Fit TF-IDF on every preprocessed abstract. The vocabulary is capped at the
# 20000 highest-ranked unigrams/bigrams (ngram_range=(1, 2)).
corpus = list(preprocessed_abstracts.values())
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(corpus)

# Row i of the matrix belongs to the i-th key of preprocessed_abstracts,
# because the corpus was built from the same dict in the same order.
_tfidf_paper_ids = list(preprocessed_abstracts.keys())

# paper id -> sparse TF-IDF row
paper_to_tfidf = {
    pid: tfidf_matrix[row]
    for row, pid in enumerate(_tfidf_paper_ids)
}

# paper id -> dense 1-D TF-IDF vector, converted once up front.
# NOTE(review): this stores one dense vocabulary-sized vector per paper —
# confirm it fits in memory for the full corpus.
paper_to_dense_tfidf = {
    pid: tfidf_matrix[row].toarray().flatten()
    for row, pid in enumerate(_tfidf_paper_ids)
}

# paper id -> Euclidean norm of its dense vector, precomputed for the cosine similarity.
paper_to_norm = {}
for pid, dense_vec in paper_to_dense_tfidf.items():
    paper_to_norm[pid] = np.linalg.norm(dense_vec)




**get_text_similarity**: Quantifies how similar the abstracts of two papers are based on the weighted importance of the words they contain

In [None]:
def get_text_similarity(paper1, paper2):
    """Cosine similarity between the TF-IDF vectors of two papers.

    Returns 0.0 when either paper has no TF-IDF vector. Otherwise returns
    dot(v1, v2) / (|v1| * |v2| + 1e-8).
    """
    if paper1 not in paper_to_dense_tfidf or paper2 not in paper_to_dense_tfidf:
        return 0.0
    # The epsilon keeps the denominator strictly positive, so an all-zero
    # vector yields a similarity of 0 without a separate zero check
    # (the former `denominator == 0` branch could never be taken).
    denominator = paper_to_norm[paper1] * paper_to_norm[paper2] + 1e-8
    return np.dot(paper_to_dense_tfidf[paper1], paper_to_dense_tfidf[paper2]) / denominator

**get_author_overlap** : It measures the similarity between the authors of two papers

In [None]:
def get_author_overlap(paper1, paper2):
    """Jaccard overlap of the two papers' author sets.

    Returns 0.0 if either paper has no author record or the union is empty.
    """
    if not (paper1 in authors and paper2 in authors):
        return 0.0
    first_set, second_set = set(authors[paper1]), set(authors[paper2])
    shared = len(first_set & second_set)
    total = len(first_set | second_set)
    if total > 0:
        return shared / total
    return 0.0

**get_common_neighbors**: This function calculates the number of common "out-neighbors" between two papers in the training graph

In [None]:
def get_common_neighbors(paper1, paper2):
    """Number of nodes that are successors of both papers in G_train.

    Returns 0 if either paper is not a node of the training graph.
    """
    if not (paper1 in G_train.nodes and paper2 in G_train.nodes):
        return 0
    successors_first = set(G_train.successors(paper1))
    successors_second = set(G_train.successors(paper2))
    return len(successors_first & successors_second)

**get_jaccard_coefficient**: It measures the similarity between the sets of papers cited by paper1 and paper2

In [None]:
def get_jaccard_coefficient(paper1, paper2):
    """Jaccard coefficient of the two papers' successor sets in G_train.

    Returns 0.0 if either paper is missing from the training graph or if
    neither paper has any successor.
    """
    if not (paper1 in G_train.nodes and paper2 in G_train.nodes):
        return 0.0
    successors_first = set(G_train.successors(paper1))
    successors_second = set(G_train.successors(paper2))
    shared = len(successors_first & successors_second)
    combined = len(successors_first | successors_second)
    if combined > 0:
        return shared / combined
    return 0.0

**get_preferential_attachment**: This function calculates the preferential attachment score between two papers based on their out-degrees in the training graph

In [None]:
def get_preferential_attachment(paper1, paper2):
    """Product of the two papers' out-degrees in G_train (0 if either is not a node)."""
    known_nodes = G_train.nodes
    if paper1 in known_nodes and paper2 in known_nodes:
        return G_train.out_degree(paper1) * G_train.out_degree(paper2)
    return 0

**get_embedding_similarity** : This function calculates the cosine similarity between the vector representations (embeddings) of two nodes using **Node2Vec**

In [None]:
def get_embedding_similarity(source, target, node_embeddings):
    """Cosine similarity of two node embeddings.

    Returns 0.0 when either node has no embedding or when either embedding
    has zero norm.
    """
    if not (source in node_embeddings and target in node_embeddings):
        return 0.0

    vec_source = node_embeddings[source]
    vec_target = node_embeddings[target]

    # Product of the two Euclidean norms; zero means at least one zero vector.
    scale = np.linalg.norm(vec_source) * np.linalg.norm(vec_target)
    if scale == 0:
        return 0.0
    return np.dot(vec_source, vec_target) / scale

**calculate_adamic_adar** : This function calculates the Adamic-Adar index between two nodes (source and target) based on their common neighbors in the training graph (G_train)

In [None]:
import numpy as np
import networkx as nx
from collections import defaultdict


# (source, target) -> Adamic-Adar score, so repeated pairs are computed once.
adamic_adar_cache = {}

def calculate_adamic_adar(source, target):
    """Adamic-Adar index of (source, target) on G_train.

    Sums 1 / log(1 + degree(n)) over every node n returned by
    G_train.neighbors() for both endpoints. Returns 0 when an endpoint is not
    a node of G_train. Results are memoised in ``adamic_adar_cache``.
    """
    if (source, target) in adamic_adar_cache:
        return adamic_adar_cache[(source, target)]

    adamic_adar = 0
    try:
        common_neighbors_set = set(G_train.neighbors(source)).intersection(set(G_train.neighbors(target)))
        adamic_adar = sum(1/np.log(1 + G_train.degree(n)) for n in common_neighbors_set)
    except nx.NetworkXError:
        # Raised by G_train.neighbors() when an endpoint is not in the graph:
        # treat the pair as having no common neighbours. Catching only this
        # error (instead of a bare except) lets real bugs and KeyboardInterrupt
        # surface.
        pass
    adamic_adar_cache[(source, target)] = adamic_adar
    return adamic_adar


**calculate_shortest_path** : This function calculates a feature related to the shortest path length between two nodes (source and target) in the training graph

In [None]:
# (source, target) -> shortest-path feature, so repeated pairs are computed once.
shortest_path_cache = {}

def calculate_shortest_path(source, target):
    """Inverse shortest-path feature between two nodes of G_train.

    Returns 1 / (path_length + 1), where path_length is the shortest-path
    length from source to target in G_train, or 0 when no path exists or an
    endpoint is not in the graph. Results are memoised in
    ``shortest_path_cache``.
    """
    if (source, target) in shortest_path_cache:
        return shortest_path_cache[(source, target)]

    try:
        path_length = nx.shortest_path_length(G_train, source=source, target=target)
        shortest_path = 1.0 / (path_length + 1)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Unreachable target or unknown node: encode as 0. Other exceptions
        # (and KeyboardInterrupt) are no longer swallowed.
        shortest_path = 0

    shortest_path_cache[(source, target)] = shortest_path
    return shortest_path

**get_sentence_bert_features** : With the help of the sentence-transformer models we calculate the maximum similarity score (indicating the most similar sentence pair), the average similarity across all sentence pairs (providing an overall semantic relatedness measure), and the count of sentence pairs exceeding a specified similarity threshold (capturing the volume of highly similar content).

In [None]:
def get_sentence_bert_features(paper1, paper2, sentence_bert_embeddings, threshold=0.8):
    """Pairwise similarity statistics between two papers' sentence embeddings.

    Every vector of paper1 is compared with every vector of paper2 using
    cosine similarity (with a 1e-8 term in the denominator).

    Args:
        paper1, paper2: keys into ``sentence_bert_embeddings``.
        sentence_bert_embeddings: mapping from paper id to an iterable of
            embedding vectors (assumed one vector per sentence — TODO confirm).
        threshold: similarity above which a pair is counted.

    Returns:
        Tuple ``(max_sim, avg_sim, count_above)``. ``(0.0, 0.0, 0)`` is
        returned when a paper is missing or has no vectors.
    """
    if paper1 not in sentence_bert_embeddings or paper2 not in sentence_bert_embeddings:
        return 0.0, 0.0, 0

    emb1 = sentence_bert_embeddings[paper1]
    emb2 = sentence_bert_embeddings[paper2]

    # Norms of the second paper's vectors do not depend on the outer loop.
    norms2 = [np.linalg.norm(vec2) for vec2 in emb2]

    sims = []
    for vec1 in emb1:
        norm1 = np.linalg.norm(vec1)
        for vec2, norm2 in zip(emb2, norms2):
            sims.append(np.dot(vec1, vec2) / (norm1 * norm2 + 1e-8))

    if not sims:
        # Same tuple shape as every other return path (this used to be a list).
        return 0.0, 0.0, 0

    avg_sim = np.mean(sims)
    max_sim = np.max(sims)
    count_above = int(sum(s > threshold for s in sims))

    return max_sim, avg_sim, count_above


**get_bert_similarity** :  This function calculates the cosine similarity between the embedding vectors generated by a BERT-based model (like SciBERT, Specter, or DistilBERT)

In [None]:
def get_bert_similarity(source, target, bert_embeddings):
    """Cosine similarity between two papers' BERT-style embeddings.

    Args:
        source, target: paper ids used as keys into ``bert_embeddings``.
        bert_embeddings: mapping from paper id to an embedding vector.

    Returns:
        The cosine similarity, or 0.0 when an embedding is missing or is an
        all-zero vector.
    """
    # Look the embeddings up first: they were previously read before ever
    # being assigned, which raised UnboundLocalError on every call.
    source_emb = bert_embeddings.get(source)
    target_emb = bert_embeddings.get(target)

    # Handle cases where embeddings are not found
    if source_emb is None or target_emb is None:
        return 0.0

    # Ensure embeddings are NumPy arrays
    source_emb = np.asarray(source_emb)
    target_emb = np.asarray(target_emb)

    # Handle cases where embeddings are zero vectors
    if np.all(source_emb == 0) or np.all(target_emb == 0):
        return 0.0

    norm_product = np.linalg.norm(source_emb) * np.linalg.norm(target_emb)
    if norm_product == 0:
        return 0.0
    return np.dot(source_emb, target_emb) / norm_product

We calculate graph attributes such as the clustering coefficient, PageRank, HITS (hub and authority scores), betweenness centrality and Katz centrality

In [None]:
# igraph copy of the training graph, used only for the (much faster)
# betweenness computation.
ig_G = ig.Graph.from_networkx(G_train)

# Node-level scores on the training graph; each is a dict keyed by node id.
cluster = nx.clustering(G_train)

rank = nx.pagerank(G_train)

# h = hub scores, a = authority scores.
h, a = nx.hits(G_train)

# igraph returns betweenness as a list ordered by igraph vertex index, which
# follows G_train's node insertion order, not the paper id. Re-key it by the
# original networkx node id (stored by from_networkx in the "_nx_name" vertex
# attribute) so that bet[paper_id] returns that paper's own score, like the
# other dictionaries above. cutoff=5 limits the path length considered.
bet = dict(zip(ig_G.vs["_nx_name"], ig_G.betweenness(directed=True, cutoff=5)))

katz_centrality = nx.katz_centrality(G_train)

Here we generate the feature matrix for the negative and positive examples. The user can comment out or uncomment individual features in the list to choose which ones the models are trained on

In [None]:
import numpy as np
import networkx as nx
from collections import defaultdict
from tqdm import tqdm
import time

def _pair_features(source, target):
    """Build the feature vector for one candidate (source, target) pair.

    To enable the optional features, uncomment the corresponding lines; the
    sentence-BERT ones additionally need the get_sentence_bert_features call.
    """
    # max_sim, avg_sim, count_above = get_sentence_bert_features(source, target, sentence_bert_embeddings)

    return [
        # Text / metadata features
        get_text_similarity(source, target),
        get_author_overlap(source, target),
        # Pairwise graph features on the training graph
        get_common_neighbors(source, target),
        get_jaccard_coefficient(source, target),
        get_preferential_attachment(source, target),
        # Embedding similarities
        get_embedding_similarity(source, target, node_embeddings),
        get_bert_similarity(source, target, sci_bert),
        # Node-level scores, combined as sum and absolute difference
        cluster[source] + cluster[target],
        abs(cluster[source] - cluster[target]),
        rank[source] + rank[target],
        abs(rank[source] - rank[target]),
        h[source] + h[target],
        abs(h[source] - h[target]),
        a[source] + a[target],
        abs(a[source] - a[target]),
        bet[source] + bet[target],
        abs(bet[source] - bet[target]),
        katz_centrality[source] + katz_centrality[target],
        abs(katz_centrality[source] - katz_centrality[target]),
        calculate_adamic_adar(source, target),
        # calculate_shortest_path(source, target)
        # max_sim,
        # avg_sim,
        # count_above,
    ]


def generate_training_data(negatives, positives):
    """Turn labelled node pairs into a feature matrix and a label vector.

    Args:
        negatives: iterable of (source, target) pairs labelled 0.
        positives: iterable of (source, target) pairs labelled 1.

    Returns:
        ``(X, y)`` as NumPy arrays. Rows for the negatives come first,
        followed by the rows for the positives.
    """
    X_data = []
    y_data = []

    # Both classes share exactly the same feature computation; only the label
    # and the progress-bar caption differ.
    labelled_groups = (
        (0, negatives, "Processing negative examples"),
        (1, positives, "Processing positive examples"),
    )
    for label, pairs, description in labelled_groups:
        for source, target in tqdm(pairs, desc=description):
            X_data.append(_pair_features(source, target))
            y_data.append(label)

    return np.array(X_data), np.array(y_data)

**X_train, y_train sets**

In [None]:
# Feature matrix and labels for the training split (rows for negatives first, then positives).
X_train, y_train = generate_training_data(train_negative, train_edges)

**X_val, y_val sets**

In [None]:
# Feature matrix and labels for the validation split (rows for negatives first, then positives).
X_val, y_val = generate_training_data(val_negative, val_edges)

**X_test, y_test sets**

In [None]:
# Feature matrix and labels for the held-out test split (rows for negatives first, then positives).
X_test, y_test = generate_training_data(test_negative, test_edges)

In [None]:
# Show (n_pairs, n_features) for the train, validation and test matrices, in that order.
for _feature_matrix in (X_train, X_val, X_test):
    print(_feature_matrix.shape)

**Scale the Features to normalize the range, distribution, and magnitude of features, reducing potential biases and inconsistencies that may arise from variations in their values**

In [None]:
# Standardise the features. The scaler's statistics are estimated on the
# training matrix only and then reused, unchanged, for validation and test.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(X_train),
    scaler.transform(X_val),
    scaler.transform(X_test),
)

**MODELS**

Here we have our different models and architectures and some statistics for every model like **accuracy_score, precision_score, recall_score, f1_score**

More specifically, we start with XGBoost, for which we tested many parameter settings

In [None]:
!pip install xgboost

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import log_loss,accuracy_score, precision_score, recall_score, f1_score

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import (log_loss, roc_curve, auc, precision_recall_curve,
                           average_precision_score, confusion_matrix,
                           ConfusionMatrixDisplay, classification_report,
                           accuracy_score, precision_score, recall_score, f1_score)

# ------------------------------------------------------------------
# XGBoost
# ------------------------------------------------------------------
# Active configuration: shallow trees, slow learning rate, row/column
# subsampling and L1/L2 penalties.
#
# Alternative settings that were also tried (kept for reference):
#   (a) n_estimators=80, max_depth=5, learning_rate=0.08, min_child_weight=2,
#       gamma=0.05, subsample=0.85, colsample_bytree=0.85, reg_alpha=0.05,
#       reg_lambda=0.5, random_state=42, eval_metric='logloss'
#   (b) objective='binary:logistic', eval_metric='logloss', learning_rate=0.01,
#       max_depth=4, min_child_weight=10, subsample=0.8, colsample_bytree=0.8,
#       reg_alpha=1.0, reg_lambda=5.0, scale_pos_weight=1, random_state=42
bst = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    n_estimators=100,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=3,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
)

print("=== XGBoost Link Prediction Model ===\n")

# 10-fold stratified cross-validation on the training split, scored by
# negative log loss (higher is better, hence the sign flip when reporting).
print("Performing 10-fold cross-validation...")
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    bst,
    X_train_scaled,
    y_train,
    scoring='neg_log_loss',
    cv=kf,
    verbose=1,
)

print(f"Cross-validation scores (neg_log_loss): {cv_scores}")
print(f"Mean CV Log Loss: {-cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Fit on the full training split.
print("\nTraining XGBoost model...")
bst.fit(X_train_scaled, y_train)

# Validation: probabilities feed the log loss, hard labels feed the rest.
y_val_pred_proba = bst.predict_proba(X_val_scaled)[:, 1]
y_val_pred = bst.predict(X_val_scaled)

val_loss = log_loss(y_val, y_val_pred_proba)
val_accuracy, val_precision, val_recall, val_f1 = (
    metric(y_val, y_val_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"\n=== Validation Results ===")
for _metric_name, _metric_value in (
    ("Log Loss", val_loss),
    ("Accuracy", val_accuracy),
    ("Precision", val_precision),
    ("Recall", val_recall),
    ("F1-Score", val_f1),
):
    print(f"Validation {_metric_name}: {_metric_value:.4f}")

# Section banner only: this cell does not draw any plot.
print(f"\n=== Generating Visualizations ===")

# Per-class precision / recall / F1 on the validation split.
print(f"\n=== Detailed Classification Report ===")
print(classification_report(y_val, y_val_pred, target_names=['No Link', 'Link']))

# Held-out test split.
print(f"\n=== FINAL TEST EVALUATION ===")

y_test_pred_proba = bst.predict_proba(X_test_scaled)[:, 1]
y_test_pred = bst.predict(X_test_scaled)

test_loss = log_loss(y_test, y_test_pred_proba)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Final Test Log Loss: {test_loss:.4f}")
print(f"Final Test Accuracy: {test_accuracy:.4f}")

print(f"\nModel training and validation complete!")

Also here we print the feature importance for our **XGB Models**

In [None]:
# Display the fitted XGBoost model's importance scores (one value per input feature).
bst.feature_importances_

In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt

# Bar chart of the fitted XGBoost model's feature importances.
# (A bare `bst.feature_importances_` expression used to sit here; in the
# middle of a cell it is evaluated and discarded, so it was removed.)
plot_importance(bst)
plt.show()

Same for the **LogisticRegression Models**. This was our baseline model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



# Baseline: logistic regression on the standardised features.
model = LogisticRegression(random_state=42, max_iter=20000)

# 10-fold stratified cross-validation on the training split, scored by negative log loss.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    model,
    X_train_scaled,
    y_train,
    scoring='neg_log_loss',
    cv=kf,
)
print(cv_scores)

# Fit on the full training split, then score the validation split.
model.fit(X_train_scaled, y_train)
y_pred_proba = model.predict_proba(X_val_scaled)[:, 1]
y_pred = model.predict(X_val_scaled)
print("K was 10 folds\n")

# Log loss uses the probabilities; the other metrics use the hard labels.
val_loss = log_loss(y_val, y_pred_proba)
accuracy, precision, recall, f1 = (
    metric(y_val, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Logistic Regression - Validation Log Loss: {val_loss}")
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")

# Held-out test split.
y_test_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
y_test_pred = model.predict(X_test_scaled)

test_loss = log_loss(y_test, y_test_pred_proba)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Final Test Log Loss: {test_loss:.4f}")
print(f"Final Test Accuracy: {test_accuracy:.4f}")

print(f"\nModel training and validation complete!")

Our **LGB Models**

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve, average_precision_score


# LightGBM configured to mirror the XGBoost settings used earlier.
lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=4,              # Reduced complexity
    learning_rate=0.05,       # Slower learning
    min_child_weight=3,       # More regularization
    # LightGBM's name for the minimum loss reduction required to split;
    # XGBoost's `gamma` is not a LightGBM parameter and was ignored.
    min_split_gain=0.1,
    subsample=0.8,
    # Row subsampling is only performed when subsample_freq > 0; with the
    # default of 0 the subsample=0.8 setting had no effect.
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.1,            # L1 regularization
    reg_lambda=1.0,           # L2 regularization
    verbose=-1,
    random_state=42)


# K is 10 folds
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation, focusing on log loss
cv_scores = cross_val_score(lgb_model, X_train_scaled, y_train, cv=kf, scoring='neg_log_loss')

print(cv_scores)


# Fit on the full training split, then score the validation split.
lgb_model.fit(X_train_scaled, y_train)
y_pred_proba = lgb_model.predict_proba(X_val_scaled)[:, 1]
y_pred = lgb_model.predict(X_val_scaled)

val_loss = log_loss(y_val, y_pred_proba)
print("K was 10 folds\n")
print(f"LIGHTGBM Validation Log Loss: {val_loss}")

# Threshold-based metrics on the hard validation predictions.
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)


print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")


# ROC Curve
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Precision-Recall Curve
# Separate names for the curve arrays, so the scalar `precision` / `recall`
# computed above are not overwritten.
pr_precision, pr_recall, _ = precision_recall_curve(y_val, y_pred_proba)
average_precision = average_precision_score(y_val, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.step(pr_recall, pr_precision, where='post', label='Precision-Recall curve (AP = %0.2f)' % average_precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.show()



Again here we print the feature importance for our **LGB Model**

In [None]:
# Print the fitted LightGBM model's importance scores (one value per input feature).
print(lgb_model.feature_importances_)

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt

# Bar chart of LightGBM feature importance; importance_type='split' ranks features
# by how many times they are used to split.
lgb.plot_importance(lgb_model, importance_type='split')
plt.title('Feature Importance')
plt.show()

**A simple MLP model with 2 hidden layers**

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler


# Small feed-forward network: two hidden layers (10 and 5 units), L2 penalty,
# and early stopping on an internal 10% hold-out of the training split.
mlp = MLPClassifier(
    random_state=42,
    hidden_layer_sizes=(10, 5),
    activation='relu',
    alpha=0.01,
    learning_rate_init=0.001,
    max_iter=500,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20,
)
mlp.fit(X_train_scaled, y_train)

# Validation: probabilities feed the log loss, hard labels feed the rest.
y_val_pred_proba = mlp.predict_proba(X_val_scaled)[:, 1]
y_val_pred = mlp.predict(X_val_scaled)

val_loss = log_loss(y_val, y_val_pred_proba)
val_accuracy, val_precision, val_recall, val_f1 = (
    metric(y_val, y_val_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"\n=== Validation Results ===")
for _metric_name, _metric_value in (
    ("Log Loss", val_loss),
    ("Accuracy", val_accuracy),
    ("Precision", val_precision),
    ("Recall", val_recall),
    ("F1-Score", val_f1),
):
    print(f"Validation {_metric_name}: {_metric_value:.4f}")

# Section banner only: this cell does not draw any plot.
print(f"\n=== Generating Visualizations ===")

# Per-class precision / recall / F1 on the validation split.
print(f"\n=== Detailed Classification Report ===")
print(classification_report(y_val, y_val_pred, target_names=['No Link', 'Link']))

# Held-out test split.
print(f"\n=== FINAL TEST EVALUATION ===")

y_test_pred_proba = mlp.predict_proba(X_test_scaled)[:, 1]
y_test_pred = mlp.predict(X_test_scaled)

test_loss = log_loss(y_test, y_test_pred_proba)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Final Test Log Loss: {test_loss:.4f}")
print(f"Final Test Accuracy: {test_accuracy:.4f}")

In the next two blocks we try to find the best params using the hyperopt library for our **XGB and LGB Models**

In [None]:
############################################################################
##########TRY TO FIND THE BEST PARAMS FOR OUR XGB MODEL#####################
############################################################################

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import numpy as np



# Define objective function
def objective(space):
    """Hyperopt objective for the XGBoost classifier.

    Builds an XGBClassifier from one sampled point of the search space and
    scores it with 3-fold cross-validation on the scaled training data.

    Args:
        space: dict of sampled hyperparameters with the keys n_estimators,
            max_depth, learning_rate, subsample, colsample_bytree, gamma and
            min_child_weight.

    Returns:
        dict with 'loss' (mean cross-validated log loss, lower is better) and
        'status' (STATUS_OK), as expected by hyperopt's fmin.
    """
    tuned_keys = (
        'n_estimators',
        'max_depth',
        'learning_rate',
        'subsample',
        'colsample_bytree',
        'gamma',
        'min_child_weight',
    )
    # reg_alpha / reg_lambda are intentionally not tuned here.
    tuned_params = {key: space[key] for key in tuned_keys}

    model = XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        eval_metric='logloss',
        tree_method='gpu_hist',
        gpu_id=0,
        **tuned_params,
    )

    fold_scores = cross_val_score(
        model,
        X_train_scaled,
        y_train,
        cv=3,
        scoring='neg_log_loss',
        n_jobs=-1,
    )

    # The scorer yields negated log loss; flip the sign so hyperopt minimises it.
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

# Option lists for the hp.choice dimensions. They are kept in variables because
# fmin reports the *index* of the chosen option, so the same lists are needed
# again below to decode the result.
n_estimators_options = list(range(20, 205, 5))
max_depth_options = list(range(5, 30))

# Hyperopt search space for the XGB objective
search_space = {
    'n_estimators': hp.choice('n_estimators', n_estimators_options),
    'max_depth': hp.choice('max_depth', max_depth_options),
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'subsample': hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
    'gamma': hp.quniform('gamma', 0, 0.50, 0.01),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
}

# TPE search: 100 evaluations, seeded so the sampled sequence is repeatable
trials = Trials()
best_params = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    rstate=np.random.default_rng(42),
)

print("Best hyperparameters from Hyperopt:", best_params)

# Decode the raw result: look up hp.choice indices, copy the continuous
# parameters unchanged and cast min_child_weight to int.
best_params_actual = {
    'n_estimators': n_estimators_options[best_params['n_estimators']],
    'max_depth': max_depth_options[best_params['max_depth']],
}
best_params_actual.update(
    (name, best_params[name])
    for name in ('learning_rate', 'subsample', 'colsample_bytree', 'gamma')
)
best_params_actual['min_child_weight'] = int(best_params['min_child_weight'])

print("Best hyperparameters (actual values):", best_params_actual)

In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss
import numpy as np
import warnings
warnings.filterwarnings("ignore")

def objective(space):
    """Hyperopt objective for the LightGBM classifier.

    Builds an LGBMClassifier from one sampled point of the search space and
    scores it with 3-fold cross-validation on the scaled training data.

    Args:
        space: dict of sampled hyperparameters with the keys n_estimators,
            max_depth, learning_rate, min_child_weight, num_leaves,
            min_child_samples, subsample, colsample_bytree, reg_alpha and
            reg_lambda.

    Returns:
        dict with 'loss' (mean cross-validated log loss, lower is better) and
        'status' (STATUS_OK), as expected by hyperopt's fmin.
    """
    model = LGBMClassifier(
        # hp.quniform / numpy choices arrive as floats or numpy scalars; the
        # integer-valued parameters are cast explicitly.
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        learning_rate=space['learning_rate'],
        min_child_weight=space['min_child_weight'],
        num_leaves=int(space['num_leaves']),
        min_child_samples=int(space['min_child_samples']),
        subsample=space['subsample'],
        # LightGBM ignores `subsample` (bagging_fraction) unless bagging is
        # enabled with a non-zero frequency. Without this the `subsample`
        # dimension of the search has no effect on the model. Use the same
        # subsample_freq on any final model that reuses the tuned subsample.
        subsample_freq=1,
        colsample_bytree=space['colsample_bytree'],
        reg_alpha=space['reg_alpha'],
        reg_lambda=space['reg_lambda'],
        random_state=42,
        n_jobs=-1,
        device='gpu',
        gpu_platform_id=0,
        gpu_device_id=0,
        verbose=-1
    )

    # Folds run sequentially (n_jobs=1) because every fit targets the same GPU device.
    cv_scores = cross_val_score(
        model, X_train_scaled, y_train,
        cv=3,  # 3-fold CV
        scoring='neg_log_loss',
        n_jobs=1
    )

    # The scorer yields negated log loss; return it as a positive value (lower is better).
    loss = -cv_scores.mean()
    return {'loss': loss, 'status': STATUS_OK}


from hyperopt import space_eval

# Hyperopt search space for the LGB objective
search_space = {
    'n_estimators': hp.choice('n_estimators', [100, 200, 500, 1000]),
    'max_depth': hp.choice('max_depth', list(np.arange(5, 16, 1))),
    'learning_rate': hp.choice('learning_rate', list(np.arange(0.05, 0.31, 0.05))),
    'num_leaves': hp.quniform('num_leaves', 20, 50, 1),
    'min_child_samples': hp.quniform('min_child_samples', 5, 100, 1),
    'min_child_weight': hp.choice('min_child_weight', list(np.arange(1, 8, 1))),
    'subsample': hp.uniform('subsample', 0.8, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 0.8),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0)
}

trials = Trials()

# TPE search: 50 evaluations, seeded so the sampled sequence is repeatable
best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42)
)

# For hp.choice dimensions fmin reports the *index* of the selected option
# (e.g. n_estimators=2 means 500), so the raw dict must not be read as
# parameter values.
print("Best hyperparameters:", best)

# Decode indices back to real values and normalise numeric types so the
# result can be passed straight to LGBMClassifier.
best_actual = dict(space_eval(search_space, best))
for int_param in ('n_estimators', 'max_depth', 'num_leaves',
                  'min_child_samples', 'min_child_weight'):
    best_actual[int_param] = int(best_actual[int_param])
best_actual['learning_rate'] = float(best_actual['learning_rate'])

print("Best hyperparameters (actual values):", best_actual)


Here we also do a **SHAP Analysis** for our top models from **XGB and LGB**

In [None]:
!pip install shap

In [None]:
import shap

In [None]:
# Column names for the SHAP plot, in the same order as the 21-value feature
# vector built in predict_for_test_set (which is what `scaler` and `bst`
# consume). The earlier list named only the 12 centrality features, so the
# plot was mislabelled or failed on the remaining columns.
# NOTE(review): the first seven and last two names are descriptive labels chosen
# here; confirm the order against the cell that builds X_train.
shap_feature_names = [
    'text_similarity', 'author_overlap',
    'common_neighbors', 'jaccard_coefficient', 'preferential_attachment',
    'node_embedding_similarity', 'specter_bert_similarity',
    'cluster_sum', 'cluster_diff',
    'rank_sum', 'rank_diff',
    'h_sum', 'h_diff',
    'a_sum', 'a_diff',
    'bet_sum', 'bet_diff',
    'katz_sum', 'katz_diff',
    'adamic_adar', 'shortest_path',
]
assert len(shap_feature_names) == X_val_scaled.shape[1], (
    f"feature_names has {len(shap_feature_names)} entries but X_val_scaled "
    f"has {X_val_scaled.shape[1]} columns"
)

# Create TreeExplainer object
explainer_xgb = shap.TreeExplainer(bst)

# Calculate SHAP values for validation data
shap_values_xgb = explainer_xgb.shap_values(X_val_scaled)

# Summarize the effects of all the features
shap.summary_plot(shap_values_xgb, X_val_scaled, feature_names=shap_feature_names)

In [None]:
# Column names for the SHAP plot, in the same order as the 21-value feature
# vector built in predict_for_test_set (which is what `scaler` and `lgb_model`
# consume). The earlier list named only the 12 centrality features, so the
# plot was mislabelled or failed on the remaining columns.
# NOTE(review): the first seven and last two names are descriptive labels chosen
# here; confirm the order against the cell that builds X_train.
shap_feature_names = [
    'text_similarity', 'author_overlap',
    'common_neighbors', 'jaccard_coefficient', 'preferential_attachment',
    'node_embedding_similarity', 'specter_bert_similarity',
    'cluster_sum', 'cluster_diff',
    'rank_sum', 'rank_diff',
    'h_sum', 'h_diff',
    'a_sum', 'a_diff',
    'bet_sum', 'bet_diff',
    'katz_sum', 'katz_diff',
    'adamic_adar', 'shortest_path',
]
assert len(shap_feature_names) == X_val_scaled.shape[1], (
    f"feature_names has {len(shap_feature_names)} entries but X_val_scaled "
    f"has {X_val_scaled.shape[1]} columns"
)

# Create TreeExplainer object
explainer_lgb = shap.TreeExplainer(lgb_model)

# Calculate SHAP values for validation data
shap_values_lgb = explainer_lgb.shap_values(X_val_scaled)

# Summarize the effects of all the features
shap.summary_plot(shap_values_lgb, X_val_scaled, feature_names=shap_feature_names)

Function that generates predictions for our test pairs **(Kaggle test pairs)**

In [None]:
def _pair_features(source, target):
    """Build the 21-value feature vector for one candidate (source, target) pair.

    Reads the notebook-level lookups (cluster, rank, h, a, bet, katz_centrality,
    node_embeddings, specter_bert_embeddings) and the feature helper functions.
    The value order is what the scaler and models receive, so it must not be
    reordered independently of the training features.
    """
    # The sentence-BERT features (max_sim, avg_sim, count_above) are currently disabled.
    return [
        get_text_similarity(source, target),
        get_author_overlap(source, target),
        get_common_neighbors(source, target),
        get_jaccard_coefficient(source, target),
        get_preferential_attachment(source, target),
        get_embedding_similarity(source, target, node_embeddings),
        get_bert_similarity(source, target, specter_bert_embeddings),
        cluster[source] + cluster[target],
        abs(cluster[source] - cluster[target]),
        rank[source] + rank[target],
        abs(rank[source] - rank[target]),
        h[source] + h[target],
        abs(h[source] - h[target]),
        a[source] + a[target],
        abs(a[source] - a[target]),
        bet[source] + bet[target],
        abs(bet[source] - bet[target]),
        katz_centrality[source] + katz_centrality[target],
        abs(katz_centrality[source] - katz_centrality[target]),
        calculate_adamic_adar(source, target),
        calculate_shortest_path(source, target),
    ]


def predict_for_test_set(test_pairs, model, scaler):
    """Predict the link probability for every candidate pair.

    Args:
        test_pairs: iterable of (source, target) node-id pairs.
        model: fitted classifier exposing predict_proba; column 1 is taken as
            the probability of the positive ("link") class.
        scaler: fitted transformer exposing transform, applied to the raw
            feature rows before prediction.

    Returns:
        List of (pair_id, probability) tuples, where pair_id is the 0-based
        position of the pair in test_pairs and probability is a Python float.
        An empty input yields an empty list.
    """
    pairs = list(test_pairs)
    if not pairs:
        return []

    # Wrapping the list (not enumerate) lets tqdm know the total and show real progress.
    feature_rows = [
        _pair_features(source, target)
        for source, target in tqdm(pairs, desc="Test predictions")
    ]

    # Scale and score all rows in one call each instead of once per pair:
    # both operations are row-wise, so the results are the same, without the
    # per-call overhead repeated for every pair.
    features_scaled = scaler.transform(feature_rows)
    probabilities = np.asarray(model.predict_proba(features_scaled))[:, 1]

    return [(pair_id, float(probability)) for pair_id, probability in enumerate(probabilities)]


Predictions For **XGB**

In [None]:
# Score every Kaggle test pair with the XGB model (`bst`)
predictions_bst = predict_for_test_set(test_pairs_kaggle, bst, scaler)

# Write the (ID, Label) submission file for this model
bst_submission_path = "/content/drive/MyDrive/llms/bst_new_feat.csv"
submission = pd.DataFrame(predictions_bst, columns=["ID", "Label"])
submission.to_csv(bst_submission_path, index=False)

# If the codecarbon tracker was started at the top of the notebook, stop it here:
#tracker.stop()

Prediction For **LGB**

In [None]:
# Score every Kaggle test pair with the LGB model (`lgb_model`)
predictions_lgb = predict_for_test_set(test_pairs_kaggle, lgb_model, scaler)

# Write the (ID, Label) submission file for this model
lgb_submission_path = "/content/drive/MyDrive/llms/lgb_new_feat.csv"
submission = pd.DataFrame(predictions_lgb, columns=["ID", "Label"])
submission.to_csv(lgb_submission_path, index=False)

**Combine The Two Models Predictions**

In [None]:
import pandas as pd


# One frame per model, each with its own probability column so they can be joined on ID
predictions_bst_df, predictions_lgb_df = (
    pd.DataFrame(model_predictions, columns=["ID", probability_column])
    for model_predictions, probability_column in (
        (predictions_bst, "Label_bst"),
        (predictions_lgb, "Label_lgb"),
    )
)

# Align the two prediction sets row by row on the pair ID
combined_predictions = predictions_bst_df.merge(predictions_lgb_df, on="ID")

# Equal-weight blend of the two models' probabilities
combined_predictions["Label"] = (
    0.5 * combined_predictions["Label_bst"]
    + 0.5 * combined_predictions["Label_lgb"]
)

# Final predictions as a list of (ID, blended probability) tuples
final_preds = list(
    zip(combined_predictions["ID"], combined_predictions["Label"])
)

# Write the blended submission file
combined_submission_path = "/content/drive/MyDrive/llms/combine_new_feats.csv"
submission = pd.DataFrame(final_preds, columns=["ID", "Label"])
submission.to_csv(combined_submission_path, index=False)

Predictions For the **Stacked Model (Meta Model)**

In [None]:
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

# Stacking ensemble: XGBoost, LightGBM and logistic regression as base learners
# (all with library-default hyperparameters), combined by a logistic-regression
# meta-model.
stacked_model = StackingClassifier(
    estimators=[
        ('xgb', XGBClassifier()),
        ('lgb', LGBMClassifier()),
        ('logistic', LogisticRegression())
    ],
    final_estimator=LogisticRegression()
)

stacked_model.fit(X_train_scaled, y_train)

# predict_for_test_set takes exactly (test_pairs, model, scaler); the node
# embeddings are read from the notebook namespace, so passing them as a fourth
# argument raised a TypeError.
predictions_stack_model = predict_for_test_set(test_pairs_kaggle, stacked_model, scaler)

# Create submission file from the stacked model's own predictions
# (previously the LGB predictions were written to this file by mistake).
submission = pd.DataFrame(predictions_stack_model, columns=["ID", "Label"])
submission.to_csv("/content/drive/MyDrive/llms/stacked_model_new_feats.csv", index=False)
