In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from lazypredict.Supervised import LazyClassifier
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec, FastText
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import WideSampler, UniformSampler
from pyrdf2vec.walkers import HALKWalker, NGramWalker, CommunityWalker, RandomWalker, WalkletWalker, Walker, WLWalker

# Seed handed to each walker's `random_state` argument in the cells below.
RANDOM_STATE = 22
# Tab-separated file holding the entity URIs and their labels
# (relative to the notebook's working directory).
PATH = "bias_lens_data/URI_label_pairs.tsv"

In [2]:
# Load the URI/label pairs and give the two columns meaningful names
# (pandas reads them as "Unnamed: 0" and "0").
df = pd.read_csv(PATH, sep="\t").rename(columns={"Unnamed: 0": "uri", "0": "label"})

# `rename` silently ignores missing columns, so fail loudly here rather than
# with an opaque KeyError further down.
missing_columns = {"uri", "label"} - set(df.columns)
assert not missing_columns, f"expected columns missing from {PATH}: {missing_columns}"

# The first 20% of the rows are held out for testing, the remaining 80% are
# used for training.
# NOTE(review): the split is positional (no shuffling / stratification) --
# confirm the TSV is not ordered by label.
test_size = int(len(df) * 0.2)
# Legacy name kept so any cell relying on it keeps working; despite the name
# it is the number of *test* rows (the split index).
train_size = test_size
test_data = df.iloc[:test_size]
train_data = df.iloc[test_size:]
assert len(train_data) + len(test_data) == len(df)

train_entities = train_data["uri"].tolist()
train_labels = list(train_data["label"])

test_entities = test_data["uri"].tolist()
test_labels = list(test_data["label"])

# Train entities come first: later cells slice the embedding matrix at
# len(train_entities) to recover the train/test parts.
entities = train_entities + test_entities
labels = train_labels + test_labels

In [3]:
# Build the knowledge graph from the Turtle file.
# The location is relative to the working directory (same convention as PATH)
# instead of an absolute, machine-specific path, so the notebook can run on
# any checkout of the project.
kg = KG(
    location="bias_lens_data/bias_lens_graph.ttl",
    # Walks never follow this predicate.
    # NOTE(review): presumably it carries the bias label being predicted, so
    # skipping it avoids label leakage -- confirm.
    skip_predicates={"http://biaslens.com/bias"},
    skip_verify=True,
)


In [4]:
# HALK walks: train Word2Vec on HALK walks and embed every entity
# (train entities first, then test entities).
embedder = Word2Vec(workers=2, epochs=20)
walker = HALKWalker(
    max_depth=2,
    max_walks=None,
    n_jobs=4,
    sampler=UniformSampler(),
    random_state=RANDOM_STATE,
    md5_bytes=None,
)
rdf2vec = RDF2VecTransformer(embedder, walkers=[walker], verbose=1)
embeddings, literals = rdf2vec.fit_transform(kg, np.array(entities))

100%|██████████| 37604/37604 [00:09<00:00, 4055.46it/s]


Extracted 354155 walks for 37604 entities (136.1311s)
Fitted 354155 walks (78.0584s)


In [5]:
# `entities` was built as train_entities + test_entities, so the first
# len(train_entities) embedding rows belong to the training set.
n_train = len(train_entities)
train_embeddings = np.array(embeddings[:n_train])
test_embeddings = np.array(embeddings[n_train:])

# Fit LazyClassifier's collection of models on the embeddings and display the
# resulting score table.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(
    train_embeddings,
    test_embeddings,
    train_labels,
    test_labels,
)
models

 97%|█████████▋| 28/29 [12:34<00:13, 13.78s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 30084, number of used features: 100
[LightGBM] [Info] Start training from score -1.064884
[LightGBM] [Info] Start training from score -1.242166
[LightGBM] [Info] Start training from score -1.008374
[LightGBM] [Info] Start training from score -6.399726


100%|██████████| 29/29 [12:47<00:00, 26.48s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNeighborsClassifier,0.96,0.96,,0.96,16.74
SVC,0.96,0.96,,0.96,57.68
LGBMClassifier,0.95,0.95,,0.95,13.83
ExtraTreesClassifier,0.95,0.95,,0.95,15.05
RandomForestClassifier,0.94,0.94,,0.94,88.23
QuadraticDiscriminantAnalysis,0.94,0.94,,0.94,1.39
LogisticRegression,0.94,0.94,,0.94,4.22
LinearSVC,0.93,0.93,,0.93,43.6
CalibratedClassifierCV,0.93,0.93,,0.93,165.74
SGDClassifier,0.93,0.93,,0.93,2.71


In [6]:
# N-gram walks: same pipeline as the other walker cells, with NGramWalker
# as the walking strategy.
embedder = Word2Vec(workers=2, epochs=20)
walker = NGramWalker(
    max_depth=2,
    max_walks=None,
    n_jobs=4,
    sampler=UniformSampler(),
    random_state=RANDOM_STATE,
    md5_bytes=None,
)
rdf2vec = RDF2VecTransformer(embedder, walkers=[walker], verbose=1)
embeddings, literals = rdf2vec.fit_transform(kg, np.array(entities))

100%|██████████| 37604/37604 [00:22<00:00, 1639.01it/s]


Extracted 354155 walks for 37604 entities (148.4089s)
Fitted 354155 walks (71.4007s)


In [7]:
# Rows [0, n_train) of `embeddings` are the train entities, the rest are the
# test entities (same order as `entities`).
n_train = len(train_entities)
train_embeddings, test_embeddings = (
    np.array(embeddings[:n_train]),
    np.array(embeddings[n_train:]),
)

# Evaluate LazyClassifier's models on the N-gram embeddings; the last
# expression renders the score table.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(
    train_embeddings, test_embeddings, train_labels, test_labels
)
models

 97%|█████████▋| 28/29 [16:48<00:26, 26.96s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 30084, number of used features: 100
[LightGBM] [Info] Start training from score -1.064884
[LightGBM] [Info] Start training from score -1.242166
[LightGBM] [Info] Start training from score -1.008374
[LightGBM] [Info] Start training from score -6.399726


100%|██████████| 29/29 [17:03<00:00, 35.30s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SVC,0.92,0.92,,0.92,134.76
LGBMClassifier,0.91,0.91,,0.91,15.58
KNeighborsClassifier,0.91,0.91,,0.91,17.56
QuadraticDiscriminantAnalysis,0.9,0.91,,0.9,1.58
ExtraTreesClassifier,0.91,0.91,,0.91,16.0
RandomForestClassifier,0.9,0.9,,0.9,96.26
CalibratedClassifierCV,0.89,0.88,,0.89,236.31
LinearSVC,0.88,0.88,,0.89,74.86
LogisticRegression,0.88,0.88,,0.88,4.83
SGDClassifier,0.88,0.88,,0.88,4.41


In [8]:
# Community walks.
# NOTE: in the saved run this cell was interrupted by hand after ~15 minutes
# (about 1% of the entities processed), so `embeddings` was not reassigned.
embedder = Word2Vec(workers=2, epochs=20)
walker = CommunityWalker(
    max_depth=2,
    max_walks=None,
    n_jobs=4,
    sampler=UniformSampler(),
    random_state=RANDOM_STATE,
    md5_bytes=None,
)
rdf2vec = RDF2VecTransformer(embedder, walkers=[walker], verbose=1)
embeddings, literals = rdf2vec.fit_transform(kg, np.array(entities))

  1%|          | 310/37604 [14:54<29:53:52,  2.89s/it]


KeyboardInterrupt: 

In [None]:
# CAUTION: this cell reads whatever `embeddings` currently holds. If the
# Community-walk cell did not run to completion, that is still the result of
# the previously executed walker, not the community walks.
n_train = len(train_entities)
train_embeddings = np.array(embeddings[:n_train])
test_embeddings = np.array(embeddings[n_train:])

# Fit LazyClassifier's models and display their scores.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(
    train_embeddings,
    test_embeddings,
    train_labels,
    test_labels,
)
models

In [None]:
# Random walks: Word2Vec embeddings learned from RandomWalker walks over the
# knowledge graph, for all train + test entities.
embedder = Word2Vec(workers=2, epochs=20)
walker = RandomWalker(
    max_depth=2,
    max_walks=None,
    n_jobs=4,
    sampler=UniformSampler(),
    random_state=RANDOM_STATE,
    md5_bytes=None,
)
rdf2vec = RDF2VecTransformer(embedder, walkers=[walker], verbose=1)
embeddings, literals = rdf2vec.fit_transform(kg, np.array(entities))

In [None]:
# Recover the train/test parts of the embedding matrix: train entities were
# placed first when `entities` was assembled.
n_train = len(train_entities)
train_embeddings, test_embeddings = (
    np.array(embeddings[:n_train]),
    np.array(embeddings[n_train:]),
)

# Score LazyClassifier's models on the random-walk embeddings and show the
# resulting table.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(
    train_embeddings, test_embeddings, train_labels, test_labels
)
models

In [None]:
# WL walks: same embedding pipeline, with WLWalker producing the walks.
embedder = Word2Vec(workers=2, epochs=20)
walker = WLWalker(
    max_depth=2,
    max_walks=None,
    n_jobs=4,
    sampler=UniformSampler(),
    random_state=RANDOM_STATE,
    md5_bytes=None,
)
rdf2vec = RDF2VecTransformer(embedder, walkers=[walker], verbose=1)
embeddings, literals = rdf2vec.fit_transform(kg, np.array(entities))

In [None]:
# The embedding rows follow the order of `entities` (train first, then test),
# so slicing at len(train_entities) separates the two sets.
n_train = len(train_entities)
train_embeddings = np.array(embeddings[:n_train])
test_embeddings = np.array(embeddings[n_train:])

# Run LazyClassifier's models on the WL embeddings; `models` is the score
# table displayed as the cell output.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(
    train_embeddings,
    test_embeddings,
    train_labels,
    test_labels,
)
models

In [None]:
# Walklet walks: same embedding pipeline, with WalkletWalker producing the
# walks.
embedder = Word2Vec(workers=2, epochs=20)
walker = WalkletWalker(
    max_depth=2,
    max_walks=None,
    n_jobs=4,
    sampler=UniformSampler(),
    random_state=RANDOM_STATE,
    md5_bytes=None,
)
rdf2vec = RDF2VecTransformer(embedder, walkers=[walker], verbose=1)
embeddings, literals = rdf2vec.fit_transform(kg, np.array(entities))

In [None]:
# Train entities occupy the first len(train_entities) rows of `embeddings`;
# the remaining rows are the test entities.
n_train = len(train_entities)
train_embeddings, test_embeddings = (
    np.array(embeddings[:n_train]),
    np.array(embeddings[n_train:]),
)

# Fit LazyClassifier's models on the Walklet embeddings and display the
# score table.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(
    train_embeddings, test_embeddings, train_labels, test_labels
)
models