In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_curve,auc,classification_report,make_scorer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


In [None]:
from link_prediction import *

In [None]:
def _unpickle(path):
    """Open `path` in binary mode and return the object pickle.load reads from it."""
    # pickle.load can execute arbitrary code while deserializing; only use it
    # on files that come from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Entity embeddings (described as a dict in the original notebook comment).
entity_embeddings = _unpickle("fb15k237_transe_entity_embeddings.pkl")
# Predicate embeddings (likewise described as a dict).
predicate_embeddings = _unpickle("fb15k237_transe_predicate_embeddings.pkl")

In [None]:
# Read the tab-separated, header-less triple files. Every field is kept as a
# string (dtype=str) and the three columns are named head / relation / tail.
triple_columns = ['head', 'relation', 'tail']
triple_csv_options = dict(dtype=str, sep='\t', header=None, names=triple_columns)

train_triples = pd.read_csv('fb15k237_train.txt', **triple_csv_options)
test_triples = pd.read_csv('fb15k237_test.txt', **triple_csv_options)
# The validation split is not loaded; the original call is kept for reference:
#valid_triples = pd.read_csv('fb15k237_valid.txt', dtype=str, sep='\t', header=None, names=['head', 'relation', 'tail'])

In [None]:
# Build the evaluator from the embeddings and the train/test triples.
# The recorded cell output shows a "Precomputing top-k predictions" progress bar
# that ran for about 33 minutes, so expect this cell to be slow.
tripleEvaluator = TripleEvaluator(
    entity_embeddings,
    predicate_embeddings,
    train_triples,
    test_triples,
    model='TransE',
    k=50,
)

Precomputing top-k predictions: 100%|██████████| 20466/20466 [33:19<00:00, 10.24it/s]


In [None]:
def _load_or_create_negatives(csv_path, creating_for, n=1):
    """Return the sampled training frame for one side, generating its CSV if missing.

    Args:
        csv_path: CSV file to read. When it does not exist it is created first.
        creating_for: "head" or "tail"; forwarded to
            tripleEvaluator.create_training_data_filtered.
        n: forwarded as `n` to create_training_data_filtered (1 in this
            notebook; presumably the number of negatives per triple — confirm).

    Returns:
        The DataFrame produced by pd.read_csv(csv_path).
    """
    try:
        return pd.read_csv(csv_path)
    except FileNotFoundError:
        # Cache miss: build the samples once and persist them next to the notebook.
        generated = tripleEvaluator.create_training_data_filtered(n=n, creating_for=creating_for)
        generated.to_csv(csv_path, index=False)
        # Re-read from disk so the result has the same dtypes as a cache hit.
        return pd.read_csv(csv_path)


# Create or acquire the training data (negative samples).
# Previously the generation code was commented out and saved to a Google Drive
# path that differs from the file read here, so a fresh checkout without the
# CSVs could not run this cell. Generation now happens only when the CSV is absent.
tail_df = _load_or_create_negatives('nguyen_fb15k237_transe_tail_df_1_ns.csv', creating_for="tail")
head_df = _load_or_create_negatives('nguyen_fb15k237_transe_head_df_1_ns.csv', creating_for="head")

In [None]:
# Give every training triple the label 1 (the sampled frames loaded above
# supply the rows that end up with label 0 in the later value counts).
# NOTE(review): this adds the column in place to the same DataFrame object that
# was passed to TripleEvaluator earlier — confirm the evaluator is not affected
# by the extra column.
train_triples['label']=1

In [None]:
# Stack the head-side samples, the tail-side samples and the labelled training
# triples into one frame with a fresh 0..n-1 index, then discard rows that are
# exact duplicates. The index is renumbered before de-duplication, so it can
# contain gaps afterwards.
train_df = pd.concat([head_df, tail_df, train_triples], axis=0, ignore_index=True)
train_df = train_df.drop_duplicates()

In [None]:
# Show how many rows carry each label value.
train_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,272115
0,146168


In [None]:
# Attach an embedding to every row by calling tripleEvaluator.get_embedding on
# it, then stack those embeddings vertically into a feature matrix and pull out
# the label vector for the whole frame.
# NOTE(review): the train/test split cell that follows rebinds X_train / y_train.
train_df['embedding'] = train_df.apply(tripleEvaluator.get_embedding, axis=1)
embedding_column = train_df['embedding'].values
X_train = np.vstack(embedding_column)
y_train = train_df['label'].values

In [None]:
# Hold out 10% of train_df as an evaluation set, stratified on the label so
# both parts keep the same class proportions. random_state pins the shuffle.
all_features = np.vstack(train_df['embedding'].values)
all_labels = train_df['label'].values

X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    all_labels,
    test_size=0.1,
    random_state=42,
    stratify=all_labels,
)

In [None]:
# Configure the MLP; the settings are gathered in one dict so they are easy to scan.
mlp_params = dict(
    hidden_layer_sizes=(128, 128),  # two hidden layers of 128 units each
    activation='relu',
    solver='adam',
    alpha=0.0001,                   # L2 regularisation strength
    learning_rate_init=0.001,
    early_stopping=True,            # hold out part of the training data; stop when its score stalls
    max_iter=25,                    # with the adam solver this caps the number of epochs
    batch_size=32,
    random_state=42,
)
mlp_model = MLPClassifier(**mlp_params)
mlp_model.fit(X_train, y_train)

# Score the fitted network on the held-out split.
y_pred = mlp_model.predict(X_test)
mlp_accuracy = accuracy_score(y_test, y_pred)
mlp_report = classification_report(y_test, y_pred)

print("Accuracy on test set:", mlp_accuracy)
print("Classification Report:\n", mlp_report)

In [None]:
# LightGBM classifier: 100 boosting rounds, tree depth capped at 15, fixed seed.
# Options left commented out in the original cell (not active):
#   class_weight={0: 2, 1: 1}, num_leaves=100, min_child_samples=25, subsample=1
lgbm_params = {
    "n_estimators": 100,
    "max_depth": 15,
    "random_state": 42,
}
lgbm_model = LGBMClassifier(**lgbm_params)
lgbm_model.fit(X_train, y_train)

# Predict the held-out split and print the per-class report.
y_pred_lgbm = lgbm_model.predict(X_test)
lgbm_report = classification_report(y_test, y_pred_lgbm)
print("LightGBM Classification Report:\n", lgbm_report)

In [None]:
# XGBoost classifier: 100 boosting rounds, tree depth capped at 10, fixed seed.
# Options left commented out in the original cell (not active):
#   scale_pos_weight=0.5, min_child_weight, subsample, colsample_bytree
xgb_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "random_state": 42,
}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Predict the held-out split and print the per-class report.
y_pred_xgb = xgb_model.predict(X_test)
xgb_report = classification_report(y_test, y_pred_xgb)
print("XGBoost Classification Report:\n", xgb_report)

In [None]:
# Soft-voting ensemble over the three classifiers built in the cells above.
# VotingClassifier.fit trains its own clones of the listed estimators, so the
# fit call below is required even though the individual models are already fitted.
base_estimators = [
    ('xgb', xgb_model),
    ('lgbm', lgbm_model),
    ('mlp', mlp_model),
]
ensemble_model = VotingClassifier(
    estimators=base_estimators,
    voting='soft',  # average the predicted class probabilities
    n_jobs=-1,      # fit the members in parallel on all cores
)
ensemble_model.fit(X_train, y_train)

# Score the ensemble on the held-out split.
y_pred_ensemble = ensemble_model.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
ensemble_report = classification_report(y_test, y_pred_ensemble)

print("Ensemble Accuracy:", ensemble_accuracy)
print("Classification Report for Ensemble:\n", ensemble_report)

In [None]:
# NOTE(review): this cell used to be a verbatim copy of the previous one: it
# rebuilt the same VotingClassifier (same estimators, voting='soft', n_jobs=-1)
# and fitted it again on the same X_train / y_train, retraining all three member
# models a second time. The ensemble fitted in the previous cell is reused here
# and only re-scored. Consider deleting this cell entirely.
y_pred_ensemble = ensemble_model.predict(X_test)

print("Ensemble Accuracy:", accuracy_score(y_test, y_pred_ensemble))
print("Classification Report for Ensemble:\n", classification_report(y_test, y_pred_ensemble))

**ORIGINAL**

In [None]:
def _pct(hits, total):
    """Return hits/total as a percentage, or NaN when nothing was evaluated."""
    return hits / total * 100 if total else float("nan")


results = {}

# Evaluate the un-reranked predictions at each cut-off k listed below and keep
# the head, tail and combined counts per k.
# NOTE(review): the evaluator in this notebook was constructed with k=50;
# confirm that k=100 is meaningful for evaluate_hitk_original_2_on_patterns.
for k in [1, 3, 5, 10, 20, 30, 40, 50, 100]:
    total_count_head, hit_count_head = tripleEvaluator.evaluate_hitk_original_2_on_patterns(k=k, evaluate_for="head")
    total_count_tail, hit_count_tail = tripleEvaluator.evaluate_hitk_original_2_on_patterns(k=k, evaluate_for="tail")

    overall_hit_count = hit_count_head + hit_count_tail
    overall_total_count = total_count_head + total_count_tail

    # Each "total_count" is the denominator actually used for its "percentage".
    # The head/tail entries previously stored len(precomputed_top_k_*) instead,
    # which did not have to match the evaluated total.
    results[k] = {
        "head": {
            "total_count": total_count_head,
            "hit_count": hit_count_head,
            "percentage": _pct(hit_count_head, total_count_head)
        },
        "tail": {
            "total_count": total_count_tail,
            "hit_count": hit_count_tail,
            "percentage": _pct(hit_count_tail, total_count_tail)
        },
        "overall": {
            "hit_count": overall_hit_count,
            "total_count": overall_total_count,
            "percentage": _pct(overall_hit_count, overall_total_count)
        }
    }

# One summary line per k.
for k, data in results.items():
    print(
        f"k={k}: "
        f"Head: {data['head']['percentage']:.2f}% | "
        f"Tail: {data['tail']['percentage']:.2f}% | "
        f"Overall: {data['overall']['percentage']:.2f}%"
    )

**MLP**

In [None]:
# Re-ranking sweep with the MLP classifier: for every (k, threshold) pair, call
# tripleEvaluator.rerank for the head side and then the tail side, and record
# the hit percentages.
results = []

# Each k is paired with every candidate threshold that is at least k.
candidate_thresholds = [1, 3, 5, 10, 20, 30, 40, 50]
k_threshold_mapping = {
    k: [th for th in candidate_thresholds if th >= k]
    for k in (1, 3, 5, 10)
}

for k, threshold_values in k_threshold_mapping.items():
    for threshold in threshold_values:
        side_pct = {}
        hits_sum = 0
        total_sum = 0
        # Head is evaluated before tail, matching the original call order.
        for side in ("head", "tail"):
            side_total, side_hits = tripleEvaluator.rerank(
                k=k, model=mlp_model, threshold=threshold, evaluate_for=side
            )
            side_pct[side] = side_hits / side_total * 100
            hits_sum += side_hits
            total_sum += side_total

        # Combined head + tail hit rate.
        overall_percentage = (hits_sum / total_sum) * 100

        # Percentages are stored pre-formatted as strings with two decimals.
        results.append({
            "k": k,
            "threshold": threshold,
            "head": f"{side_pct['head']:.2f}%",
            "tail": f"{side_pct['tail']:.2f}%",
            "overall": f"{overall_percentage:.2f}%"
        })

# One line per (k, threshold) combination.
for result in results:
    summary_line = (
        f"k={result['k']} | th={result['threshold']} | "
        f"head: {result['head']} | tail: {result['tail']} | overall: {result['overall']}"
    )
    print(summary_line)

**XGB**

In [None]:
# Re-ranking sweep with the XGBoost classifier: for every (k, threshold) pair,
# call tripleEvaluator.rerank for the head side and then the tail side, and
# record the hit percentages.
results = []

# Each k is paired with every candidate threshold that is at least k.
candidate_thresholds = [1, 3, 5, 10, 20, 30, 40, 50]
k_threshold_mapping = {
    k: [th for th in candidate_thresholds if th >= k]
    for k in (1, 3, 5, 10)
}

for k, threshold_values in k_threshold_mapping.items():
    for threshold in threshold_values:
        side_pct = {}
        hits_sum = 0
        total_sum = 0
        # Head is evaluated before tail, matching the original call order.
        for side in ("head", "tail"):
            side_total, side_hits = tripleEvaluator.rerank(
                k=k, model=xgb_model, threshold=threshold, evaluate_for=side
            )
            side_pct[side] = side_hits / side_total * 100
            hits_sum += side_hits
            total_sum += side_total

        # Combined head + tail hit rate.
        overall_percentage = (hits_sum / total_sum) * 100

        # Percentages are stored pre-formatted as strings with two decimals.
        results.append({
            "k": k,
            "threshold": threshold,
            "head": f"{side_pct['head']:.2f}%",
            "tail": f"{side_pct['tail']:.2f}%",
            "overall": f"{overall_percentage:.2f}%"
        })

# One line per (k, threshold) combination.
for result in results:
    summary_line = (
        f"k={result['k']} | th={result['threshold']} | "
        f"head: {result['head']} | tail: {result['tail']} | overall: {result['overall']}"
    )
    print(summary_line)

**LGBM**

In [None]:
# Re-ranking sweep with the LightGBM classifier: for every (k, threshold) pair,
# call tripleEvaluator.rerank for the head side and then the tail side, and
# record the hit percentages.
results = []

# Each k is paired with every candidate threshold that is at least k.
candidate_thresholds = [1, 3, 5, 10, 20, 30, 40, 50]
k_threshold_mapping = {
    k: [th for th in candidate_thresholds if th >= k]
    for k in (1, 3, 5, 10)
}

for k, threshold_values in k_threshold_mapping.items():
    for threshold in threshold_values:
        side_pct = {}
        hits_sum = 0
        total_sum = 0
        # Head is evaluated before tail, matching the original call order.
        for side in ("head", "tail"):
            side_total, side_hits = tripleEvaluator.rerank(
                k=k, model=lgbm_model, threshold=threshold, evaluate_for=side
            )
            side_pct[side] = side_hits / side_total * 100
            hits_sum += side_hits
            total_sum += side_total

        # Combined head + tail hit rate.
        overall_percentage = (hits_sum / total_sum) * 100

        # Percentages are stored pre-formatted as strings with two decimals.
        results.append({
            "k": k,
            "threshold": threshold,
            "head": f"{side_pct['head']:.2f}%",
            "tail": f"{side_pct['tail']:.2f}%",
            "overall": f"{overall_percentage:.2f}%"
        })

# One line per (k, threshold) combination.
for result in results:
    summary_line = (
        f"k={result['k']} | th={result['threshold']} | "
        f"head: {result['head']} | tail: {result['tail']} | overall: {result['overall']}"
    )
    print(summary_line)

**SCIKIT-LEARN ENSEMBLE**

In [None]:
# Re-ranking sweep with the scikit-learn voting ensemble: for every
# (k, threshold) pair, call tripleEvaluator.rerank for the head side and then
# the tail side, and record the hit percentages.
results = []

# Each k is paired with every candidate threshold that is at least k.
candidate_thresholds = [1, 3, 5, 10, 20, 30, 40, 50]
k_threshold_mapping = {
    k: [th for th in candidate_thresholds if th >= k]
    for k in (1, 3, 5, 10)
}

for k, threshold_values in k_threshold_mapping.items():
    for threshold in threshold_values:
        side_pct = {}
        hits_sum = 0
        total_sum = 0
        # Head is evaluated before tail, matching the original call order.
        for side in ("head", "tail"):
            side_total, side_hits = tripleEvaluator.rerank(
                k=k, model=ensemble_model, threshold=threshold, evaluate_for=side
            )
            side_pct[side] = side_hits / side_total * 100
            hits_sum += side_hits
            total_sum += side_total

        # Combined head + tail hit rate.
        overall_percentage = (hits_sum / total_sum) * 100

        # Percentages are stored pre-formatted as strings with two decimals.
        results.append({
            "k": k,
            "threshold": threshold,
            "head": f"{side_pct['head']:.2f}%",
            "tail": f"{side_pct['tail']:.2f}%",
            "overall": f"{overall_percentage:.2f}%"
        })

# One line per (k, threshold) combination.
for result in results:
    summary_line = (
        f"k={result['k']} | th={result['threshold']} | "
        f"head: {result['head']} | tail: {result['tail']} | overall: {result['overall']}"
    )
    print(summary_line)