In [None]:
import pandas as pd
import time
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [None]:
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

In [None]:
# Optional seed constant for reproducible runs. It is disabled here, so
# nothing in this cell defines RANDOM_STATE.
# RANDOM_STATE = 22

# Read the test and training splits; both files are tab-separated.
test_data = pd.read_csv("../mutag/testSet.tsv", sep="\t")
train_data = pd.read_csv("../mutag/trainingSet.tsv", sep="\t")

# From each split, take the "bond" column as the entity list and the
# "label_mutagenic" column as the label list.
train_entities = list(train_data["bond"])
train_labels = list(train_data["label_mutagenic"])

test_entities = list(test_data["bond"])
test_labels = list(test_data["label_mutagenic"])

# Training items come first, followed by test items. Code that slices the
# combined lists by len(train_entities) relies on this ordering.
entities = [*train_entities, *test_entities]
labels = [*train_labels, *test_labels]


In [None]:
# Embedding dimensionalities to compare.
vector_sizes = [100, 200, 300, 400, 500]

# Build the knowledge graph once, outside the loop. It does not depend on the
# vector size, so constructing it per iteration only repeated the same work
# of loading the OWL file.
kg = KG(
    "../mutag/carcinogenesis.owl",  # Path to the knowledge graph
    # Predicate excluded from the graph. NOTE(review): presumably this is the
    # target label and is skipped to avoid leaking it into the walks -- confirm.
    skip_predicates={"http://dl-learner.org/carcinogenesis#isMutagenic"},
)

# `entities` is train_entities followed by test_entities, so this index is
# where the test rows start in the embedding list.
n_train = len(train_entities)

# Train embeddings for each vector size and score them with an SVM.
for vector_size in vector_sizes:
    # A fresh transformer (and Word2Vec model) per vector size.
    transformer = RDF2VecTransformer(
        Word2Vec(
            vector_size=vector_size,  # The size of the word vectors
            window=5,                 # The maximum distance between the current and predicted word
            min_count=0,              # Ignores all words with total frequency lower than this
            workers=1,                # Number of worker threads to train the model
            sg=1,                     # Use skip-gram
            hs=1,                     # Use hierarchical softmax
            negative=0,               # Number of "noise words" to draw
            alpha=0.025,              # Initial learning rate
            min_alpha=0.0001,         # Minimum learning rate after training
            epochs=5,                 # Number of training iterations
            seed=42                   # Random seed for reproducibility
        ),
        walkers=[RandomWalker(max_depth=2)],  # Use RandomWalker with max depth of 2
    )

    # `embeddings` keeps the full return value of fit_transform, as before;
    # element 0 is what gets sliced into the train/test embeddings below.
    embeddings = transformer.fit_transform(kg, entities)

    # Split the embeddings into training and test sets
    train_embeddings = embeddings[0][:n_train]
    test_embeddings = embeddings[0][n_train:]

    # Grid search over the SVM C parameter (10^-3 .. 10^3) with 5-fold CV,
    # fitted on the training embeddings only.
    clf = GridSearchCV(
        SVC(), {"C": [10**i for i in range(-3, 4)]}, cv=5
    )
    clf.fit(train_embeddings, train_labels)

    # Predict on the held-out test embeddings and report accuracy.
    predictions = clf.predict(test_embeddings)
    print(
        f"Vector Size '{vector_size}' , accuracy : "
        f"{accuracy_score(test_labels, predictions) * 100:.4f}%"
    )