In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import spacy
from spacy.lang.en import English

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report

# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers

from ast import literal_eval


In [23]:
data_train = pd.read_csv("../../data/data_train.csv").drop(["Unnamed: 0"], axis=1)
data_test = pd.read_csv("../../data/data_test.csv").drop(["Unnamed: 0"], axis=1)

nlp = spacy.load('en_core_web_sm')

In [24]:
# Funktion zur Tokenisierung, Lemmatisierung, Stop-Word-Removal der Texte
def preprocess_text(text):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    num_sentences = len(sentences)
    
    words_without_stopwords = []
    words_with_stopwords = []
    lemmas = []
    stops = []

    for token in doc:
        if not token.is_stop and not token.is_punct:
            words_without_stopwords.append(token.text)
            lemmas.append(token.lemma_)
        elif not token.is_punct:
            words_with_stopwords.append(token)
            stops.append(token)

    num_words_without_stopwords = len(words_without_stopwords)
    num_words_with_stopwords = len(words_with_stopwords)
    num_stops = len(stops)

    return sentences, num_sentences, words_with_stopwords, num_words_with_stopwords, words_without_stopwords, num_words_without_stopwords, lemmas, stops, num_stops

data_train['sentences'], data_train['num_sentences'], data_train['words_with_stopwords'], data_train['num_words_with_stopwords'], data_train['words_without_stopwords'], data_train['num_words_without_stopwords'], data_train['lemmas'], data_train['stops'], data_train['num_stops'] = zip(*data_train['text'].apply(preprocess_text))
data_test['sentences'], data_test['num_sentences'], data_test['words_with_stopwords'], data_test['num_words_with_stopwords'], data_test['words_without_stopwords'], data_test['num_words_without_stopwords'], data_test['lemmas'], data_test['stops'], data_test['num_stops'] = zip(*data_test['text'].apply(preprocess_text))
data_train.to_csv("../../data/data_with_features/data_train_with_features.csv")
data_test.to_csv("../../data/data_with_features/data_test_with_features.csv")

# etwa 10 Minuten

KeyboardInterrupt: 

In [None]:
data_train = pd.read_csv("../../data/data_with_features/data_train_with_features.csv").drop(["Unnamed: 0"], axis=1)
data_test = pd.read_csv("../../data/data_with_features/data_test_with_features.csv").drop(["Unnamed: 0"], axis=1)

data_train = data_train.sample(frac=1).reset_index(drop=True)
data_test = data_test.sample(frac=1).reset_index(drop=True)

In [None]:
x = data_train["sentences"]
literal_eval(x[3])[0]

'Officials of India and Bangladesh on Thursday agreed to construct gates along the border to allow free and safe passage for wild elephants.'

In [None]:
data_train

Unnamed: 0,classification,text,sentences,num_sentences,words_with_stopwords,num_words_with_stopwords,words_without_stopwords,num_words_without_stopwords,lemmas,stops,num_stops
0,Scientific,the gravitational astronomy is now becoming a ...,['the gravitational astronomy is now becoming ...,148,"[the, is, now, becoming, a, the, such, as, and...",2045,"['gravitational', 'astronomy', 'reality', '.in...",2877,"['gravitational', 'astronomy', 'reality', '.in...","[the, is, now, becoming, a, the, such, as, and...",2045
1,Scientific,"a geometry lies in the foundation of physics ,...",['a geometry lies in the foundation of physics...,212,"[a, in, the, of, and, a, of, is, very, for, th...",3052,"['geometry', 'lies', 'foundation', 'physics', ...",3836,"['geometry', 'lie', 'foundation', 'physics', '...","[a, in, the, of, and, a, of, is, very, for, th...",3052
2,news,Sports Direct's Mike Ashley has won a lawsuit ...,"[""Sports Direct's Mike Ashley has won a lawsui...",3,"['s, has, a, over, a, made, in, a, in, In, the...",29,"['Sports', 'Direct', 'Mike', 'Ashley', 'won', ...",35,"['Sports', 'Direct', 'Mike', 'Ashley', 'win', ...","['s, has, a, over, a, made, in, a, in, In, the...",29
3,news,Officials of India and Bangladesh on Thursday ...,['Officials of India and Bangladesh on Thursda...,3,"[of, and, on, to, along, the, to, and, for, ha...",25,"['Officials', 'India', 'Bangladesh', 'Thursday...",32,"['official', 'India', 'Bangladesh', 'Thursday'...","[of, and, on, to, along, the, to, and, for, ha...",25
4,story,Every one knows through what adventure King Fr...,['Every one knows through what adventure King ...,60,"[Every, one, through, what, the, first, of, th...",1208,"['knows', 'adventure', 'King', 'Francis', 'tak...",899,"['know', 'adventure', 'King', 'Francis', 'take...","[Every, one, through, what, the, first, of, th...",1208
...,...,...,...,...,...,...,...,...,...,...,...
1995,reviews,this is a great movie its not 40yr old virgin ...,['this is a great movie its not 40yr old virgi...,4,"[this, is, a, its, not, or, up, but, its, a, o...",41,"['great', 'movie', '40yr', 'old', 'virgin', 'f...",39,"['great', 'movie', '40yr', 'old', 'virgin', 'f...","[this, is, a, its, not, or, up, but, its, a, o...",41
1996,reviews,This games makes even amazing games like starc...,['This games makes even amazing games like sta...,3,"[This, even, and, has, it, all, and, does, it,...",14,"['games', 'makes', 'amazing', 'games', 'like',...",22,"['game', 'make', 'amazing', 'game', 'like', 's...","[This, even, and, has, it, all, and, does, it,...",14
1997,Scientific,many quantum information protocols involve non...,['many quantum information protocols involve n...,345,"[many, to, their, beyond, the, by, is, by, a, ...",2380,"['quantum', 'information', 'protocols', 'invol...",4000,"['quantum', 'information', 'protocol', 'involv...","[many, to, their, beyond, the, by, is, by, a, ...",2380
1998,reviews,Many parts of this book are difficult to under...,['Many parts of this book are difficult to und...,3,"[Many, of, this, are, to, for, the, and, It, i...",31,"['parts', 'book', 'difficult', 'understand', '...",23,"['part', 'book', 'difficult', 'understand', 'b...","[Many, of, this, are, to, for, the, and, It, i...",31


In [None]:
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(data_train["text"])

X_train, X_test, y_train, y_test = train_test_split(features, data_train["classification"], test_size=0.2, random_state=42)

model = svm.SVC(probability=True)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

  Scientific       1.00      1.00      1.00       104
        news       0.94      0.99      0.96        97
     reviews       0.96      0.96      0.96       113
       story       1.00      0.94      0.97        86

    accuracy                           0.97       400
   macro avg       0.97      0.97      0.97       400
weighted avg       0.97      0.97      0.97       400



In [None]:
data_train["text"][0]

"the gravitational astronomy is now becoming a reality .in fact , the ground - based laser interferometers such as tama300 @xcite and the first ligo @xcite are beginning to take data at sensitivities where astrophysical events are predicted . for the detectors including geo600 and virgo , core - collapse supernovae especially in our galaxy ,have been supposed to be the most plausible sources of gravitational waves ( see , for example , @xcite for review ) . since the gravitational wave ( plus neutrinos ) is the only tool which gives us the information in the innermost part of evolved massive stars , the detection is important not only for the direct confirmation of gravitational waves but also for the understanding of supernova physics itself .so far , most of the theoretical predictions of gravitational waves from supernovae have focused on the bounce signal in the context of rotational @xcite and magnetorotational @xcite core collapse . in most of the previous studies , the iron core

In [None]:
t = "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.   Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet."

In [None]:
new_text = data_test["text"][432]
new_text = t
new_text_features = vectorizer.transform([new_text])
probabilities = model.predict_proba(new_text_features)
predicted_class = model.predict(new_text_features)

# Wahrscheinlichkeiten und vorhergesagte Klasse ausgeben
for i, probs in enumerate(probabilities):
    class_probabilities = ["{:.2f}%".format(prob * 100) for prob in probs]
    print("Klasse {}: {}".format(i, class_probabilities))
print("Vorhergesagte Klasse:", predicted_class)

Klasse 0: ['0.10%', '97.30%', '2.55%', '0.04%']
Vorhergesagte Klasse: ['news']


In [None]:
data_test["classification"][432]

'Scientific'

In [None]:
data_train = pd.read_csv("../../data/data_with_features/data_train_with_features.csv").drop(["Unnamed: 0"], axis=1)
data_test = pd.read_csv("../../data/data_with_features/data_test_with_features.csv").drop(["Unnamed: 0"], axis=1)

data_train = data_train.sample(frac=1).reset_index(drop=True)
data_test = data_test.sample(frac=1).reset_index(drop=True)

In [None]:
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(data_train["text"])

X_train, X_test, y_train, y_test = train_test_split(features, data_train["classification"], test_size=0.2, random_state=42)

In [None]:
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(features.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(4, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

NameError: name 'keras' is not defined

In [None]:
label_to_int = {label: i for i, label in enumerate(np.unique(data_train["classification"]))}
y_train = np.array([label_to_int[label] for label in y_train])
y_test = np.array([label_to_int[label] for label in y_test])

In [None]:
model.fit(X_train.toarray(), y_train, epochs=10, batch_size=16, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23b90075e50>

In [None]:
loss, accuracy = model.evaluate(X_test.toarray(), y_test, verbose=1)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 0.016573261469602585
Test Accuracy: 0.9975000023841858


In [None]:
t = """I've been thinking e-everyday
I've been thinking 'bout what you say
But words just get in the way, yeah
And I stress 'cause I don't wanna make a mess
When it comes to you
I'll give my best, yeah yeah
I'm trying to impress

Oh oh oh oh oh and everyday
Is like I see you for the first time
Oh oh oh oh oh and over and over I try
But words won't come my way

Baby no oh oh oh oh
This ain't just a love song
Another love song
Just random words
On the same sad chords
It's true, my song is all about you

All my friends say I try too much
They say it's just a little crush
But you took over my heart
And I stress 'cause I always
Tend to make a mess
Even though I try to give my best
Yeah yeah, I'm trying to impress
Yeah yeah! 

Oh oh oh oh oh and everyday
Is like I see you for the first time
Oh oh oh oh oh and over and over I try
But words won't come my way

Baby no oh oh oh oh
This ain't just a love song
Another love song
Just random words
On the same sad chords
It's true, my song is all about
Yo-o-o-ou
Yo-o-o-ou
Just random words
On the same sad chords
It's true, my song is all about

You, the one that I can't escape
The one that can take my breath
The only one that keeps me coming back
And 'cause my words fall short
I'm singing you this song
This song

Baby no oh oh oh oh
This ain't just a love song
Another love song
Just random words
On the same sad chords
It's true, this song is all about youuuu

Baby no oh oh oh oh
This ain't just a love song
Another love song
Just random words
On the same sad chords
It's true, my song is all about youuuu
Yo-o-o-ou
Yo-o-o-ou
Just random words
On the same sad chords
It's true, this song is all about you"""

lorem_ipsum = "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.   Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet."

In [None]:
new_text = data_test["text"][0]
new_text = t
new_text_features = vectorizer.transform([new_text])
predictions = model.predict(new_text_features.toarray())
predicted_class = np.argmax(predictions, axis=1)
predicted_probability = np.max(predictions, axis=1)

int_to_label = {i: label for label, i in label_to_int.items()}

predicted_labels = [int_to_label[prediction] for prediction in predicted_class]
for label, probability in zip(predicted_labels, predicted_probability):
    print(f"Vorhergesagte Klasse: {label}, Wahrscheinlichkeit: {probability}")

Vorhergesagte Klasse: Scientific, Wahrscheinlichkeit: 0.9995403289794922


In [None]:
data_test["classification"][101]

'news'

In [None]:
label_to_int

{'Scientific': 0, 'news': 1, 'reviews': 2, 'story': 3}

In [None]:
data_test["classification"][78]

'story'

In [None]:
data_test["text"][0]

"collective excitations of superfluid helium confined in various porous media have been studied by neutron scattering since early 90 s , and by now a wealth of information about helium in aerogel , vycor and geltech has been collected @xcite-@xcite .aerogel is an open gel structure formed by silica strands ( sio@xmath0 ) .typical pore sizes range from few   to few hundred   , without any characteristic pore size .vycor is a porous glass , where pores form channels of about 70    diameter .geltech resembles aerogel , except that the nominal pore size is 25    @xcite .liquid @xmath1he is adsorbed in these matrices in the form of atomic layers , the first layer is expected to be solid ; on a more strongly binding substrate , such as graphite , one expects two solid layers .energies and lifetimes of phonon  roton excitations for confined @xmath1he are nearly equal to their bulk superfluid @xmath1he values @xcite , but differences appear at partial fillings .the appearance of ripplons is ti

In [None]:
t = """What are graph databases and how can quality
be verified for their data?
Abstract
Loosing customers, missing opportunities and coming to wrong decisions are symptoms of
the lack of accurate data in many enterprises. Every company can achieve big advantages
by processing data correctly and efficiently, and in order to work with a huge amount of
data, the data has to be trustworthy and of high quality. Boehringer Ingelheim is a global,
family-owned researching pharmaceutical company that focuses on researching, developing,
producing and selling prescription drugs for humans and animals. Pharmaceutical companies like Boehringer Ingelheim invest billions of euros and years of work into researching
and developing; and sometimes without even finding a satisfying result. The development of
a drug can cost 1.0 to 1.6 billion US-Dollars and can last for over 13 years; only for one age
class. Because of that, data quality and trustworthy data in general play an important role for
the company.
This project paper simply explains how data quality can be verified using automatic generation of test cases from graph databases and how a framework could look like that ensures high
quality data. It defines necessary vocabulary and explains required concepts, languages and
functionalities like RDF, OWL, SHACL, SPARQL, the Semantic Web, graph databases (like
knowledge graphs), the Astrea-Tool and the RDFUnit Testing Suite. The final result of this
project paper is a software concept, that links the Astrea-Tool and RDFUnit Testing Suite to
enable automatic generation of data shapes, as well as test cases for those data shapes. This
final software concept only needs data stored in RDF triples and the corresponding ontologies to automatically create an inspection report, which clearly depicts errors or irregularities
in any dataset.
III
1 Introduction 1
1 Introduction
1.1 The Importance of Data Quality
“Accurate data is a fundamental requirement of good information systems” (Olson, 2008, p.
6) and the lack of it can have negative consequences for companies, like loosing customers,
missing opportunities and coming to wrong decisions. In order to ensure data quality a company must put effort in form of time and money into data quality assurance programs. Nowadays, the worlds biggest internet companies and web services Microsoft, Apple, Facebook,
Google and Amazon use the data of their users to generate huge profits. Every company can
achieve big advantages by processing data correctly and efficiently. But in order to work
with huge datasets, it has to be trustworthy and of high quality. Data quality experts estimate
that some businesses, including governmental and educational organizations, lose 15-25% of
their profit due to working with poor-quality data (cf. Olson, 2008, p. 9). Data quality issues
lead to a huge waste of time, energy and money for people, companies and their staff. The
problem is, that those issues seem to be accepted or ignored nowadays. They have become
invisible since many people declare those costs as normal and routine (cf. Olson, 2008, p.
14). This shows, that a rethinking is needed to improve the data quality verification processes and that it needs big investments to overcome the current standards. Nevertheless, the
outcome could be very substantial for everyone.
Boehringer Ingelheim and Data Quality in the Pharmaceutical Industry
Boehringer Ingelheim Pharma GmbH & Co.KG (BI) is a global, family-owned researching
pharmaceutical company whose headquarter is located in Ingelheim am Rhein, Germany. BI
focuses on researching, developing, producing and selling prescription drugs for humans and
animals. BI has developed several so-called “Blockbusters” like e.g. “Jardiance” to treat type
2 diabetes or “Spiriva” for chronic obstructive pulmonary diseases. Those “Blockbusters”
pulled in sales amounting to billions. The year 2020 was, despite Covid-19, a good year
for BI. The generated turnover was 19.57 billion euros; 3% more than in the preceding
year. The goal of Boehringer Ingelheim is to become the number one in animal health
and biopharmaceutical contract manufacture. (cf. Boehringer Ingelheim Pharma GmbH &
Co. KG, 2020). An ambitious goal, since medicine, pharmacy and health are critical and
important areas of research, that can be very expensive, long-lasting and risky. Companies
like Boehringer Ingelheim invest billions of euros and years of work into researching and
developing; and sometimes without even finding a satisfying result. The goal of everyone
working for BI is to improve the steps behind drug development. This includes searching
for a possible active ingredient, synthesizing it, testing it on animals and people (includes
1.2 Objective of this Paper and Project 2
finding volunteers), adjusting the drug, finding the correct dosage, waiting for the clinical
studies to finish, gaining approval by the authorities, patenting the drug, manufacturing and
selling it. This process can last for over 13 years; only for one age class. On average, 1.0
to 1.6 billion US-Dollars have to be invested into the development of one drug. (cf. Die
forschenden Pharma-Unternehmen, 2018). Ensuring data quality at Boehringer Ingelheim
or other pharmaceutical companies can help saving time and money in every area that was
just mentioned. Wrong data leads to wrong information, which leads to wrong knowledge
and to wrong conclusions, which lead to wrong results in research, to wrong decisions and
a lot of wasted time, money and effort. And in addition, especially in pharmacy, wrong data
can lead to dangerous situations for humans.
1.2 Objective of this Paper and Project
The objective of this project is to automate generation and execution of data quality checkings. How could a framework look like that checks high quality data and helps bringing
buried data to the fore for BI or other pharmaceutical companies? A framework that allows
evaluating the quality of already existing or completely new datasets? This paper documents
the approach to this project. It defines necessary vocabulary and explains required concepts,
languages and functionalities. It contains reviews of the singular software tools and an explanation of how they could be connected.
1.3 Proposed Approach to the Project
In order to understand the necessity of the final software concept and how it works, it is
inevitable to clearly define technical terms to fully understand the basics that lie behind
Knowledge Graphs (KGs) or the semantic web, and to get to know standardized formats
and languages supporting those basics. This is why chapter 2 will focus on explaining the
background information and context of the project. Chapter 3 contains the practical content,
like the review of the Astrea-Tool and RDFUnit Testing Suite. The last chapter is about
the final result and gives a glimpse of how this project will affect the future at Boehringer
Ingelheim.
2 Foundations 3
2 Foundations
2.1 Definition of Data Quality
The six following dimensions are very commonly used to describe and rate data quality.
(Sources: (Olson, 2008, p. 24f.) & (Herzog et al., 2007, p. 8f.) & (Fleckenstein and
Fellows, 2018, p. 103f.))
1. Accuracy: Is the information correct?
2. Timeliness: Is the information up-to-date?
3. Relevance: Does the information help to answer the important (relevant) questions?
4. Completeness: Is information missing?
5. Credibility: Are there multiple versions of the same information?
6. Validity: Does the information conform to the definition?
Additional often used data quality dimensions are Currency, Consistency, Flexibility, Precision, Format, Interpretability, Content, Efficiency, Importance, Sufficiency, Usableness,
Usefulness, Clarity, Comparability, Conciseness, Freedom of bias, Informativeness, level of
detail, Quantitativeness, Scope, Understandability. (cf. Haug et al., 2011, p. 4f.)
2.2 Definition of Knowledge
It is not easy to define the term “knowledge”, especially in the English language. “Knowledge” has basically two different meanings: On the one hand “having knowledge of something” and on the other hand “having knowledge about something”. (cf. Machlup, 1981, p.
27ff.) In other languages, like German, there are two words to describe those two cases of
knowledge (“wissen” & “kennen”). Due to the lack of such a distinction in English, it is
important to correctly define what kind of knowledge is meant when someone talks about
it, so that everyone has the same understanding of the term. The knowledge portrayed by
Knowledge Graphs (or other graph databases) does not represent what is really “known”
about something in the epistemological sense. It is not about having a skill or understanding
something, but instead it represents the information about different entities, things or objects
and how they relate to each other in form of data and metadata. By doing this, Knowledge
Graphs become the perfect tool to transform implicit knowledge buried in huge datasets (cf.
Figure 2.1) into explicit knowledge. (cf. Blumauer and Nagy, 2020, p. 91)
Explicit Knowledge is knowledge which can be encoded with literals, strings, mathematical equations or more and is knowledge that can be stored or be processed. Explicit knowl-
2.3 Basics about the Semantic Web 4
edge can be understood in an objective way and as something that exists physically in a kind
of static collection of statements, facts or ideas. (cf. Schilcher, 2006, p. 19)
Implicit Knowledge is knowledge “that can only be understood by the author himself.”
(Blumauer and Nagy, 2020, p. 35). It is knowledge that cannot really get noted down because
it is based on someones personal experiences, memories and feelings. Implicit Knowledge
is close to practical skills and everyone could interpret it in another way (like a mindmap).
People know very much, but cannot tell everything they know because they are not aware of
some aspects of their knowledge and it is hard to separate the knowledge used in daily life
from emotions, feelings and instinct.
Figure 2.1: A huge part of the knowledge someone has, no matter if private person or
company, is implicit. (Blumauer and Nagy, 2020, p. 91)
An Example: If someone wants to explain how to make a cake, the cook would write down
the used ingredients and when and how long they put them into the oven. This is called a
recipe and is the perfect example for explicit knowledge. The knowledge by the cook is noted
down so that everyone can easily copy it.
But the cook bakes very often and collected a lot of practical baking skills. They know when
the dough is perfectly finished, know little tricks to improve the final result and know how
to react to unforeseen situations because of their experience. The cook has a lot of implicit
knowledge which they sometimes use without even realizing and therefore it cannot be noted
down in a recipe. That is why food in the restaurant or cakes in the bakery often taste better
than the self-made ones at home.
2.3 Basics about the Semantic Web
The Internet, or World Wide Web (WWW), as it is known today was founded in 1989 by Tim
Berners-Lee as a project at the European Organization for Nuclear Research facility, also
known as CERN. Berners-Lee’s goal was to create a “wide-area hypermedia information
retrieval initiative aiming to give universal access to a large universe of documents.” (cf.
2.3 Basics about the Semantic Web 5
Frysyk, 1994). The basic WWW is about connecting documents that contain data in the
form of text or pictures. But already in 2001 Tim Berners-Lee talked about the next big step,
namely the “Semantic Web”, which is more about connecting the data itself than connecting
documents. Due to the World Wide Web Consortium (W3C), an organization founded by
Tim Berners-Lee to create standardized web formats, the Semantic Web is about two things:
(cf. W3C et al., 2013)
1. Common formats for data integration and combination from diverse sources.
2. Recording how data relates to real world objects.
Metadata is the key in the mentioned idea of the Semantic Web. “Metadata is ’data about
data’.” (Riley, 2017, p. 1). It provides more information about the actual data, allowing us
to derive knowledge out of it. It is not only used to describe the appearance of the data, but
also to describe and denote its meaning and relation to other data. For Example: A weather
station stores data about temperature, air moisture and wind strength; but it stores not only
the raw values, but also timestamps and coordinates. This is metadata and it can be found
everywhere (Riley, 2017, p. 1). Using this additional metadata, the weather stations can
create statistics for specific time periods or locations, which they can use to trace diverse
weather developments. Other Systems, when they can rely on metadata, are able to understand words and phrases that are equivalent. (Davies et al., 2007, 3) A common example:
When searching Google for „Jaguar“ in the context of motor industry, the Google Search
Engine „knows“ the user is not searching for the animal because in this context the semantic
relations to the animal are very poor (Davies et al., 2007, 3). There are different types of
metadata that should give a good overview and understanding of what metadata really is and
why it can be found basically everywhere (cf. Riley, 2017, 6):
• Descriptive metadata: Understand what the data is about or what the data means.
• Administrative metadata:
– Technical metadata: How to decode the data or how data has to be processed.
– Preservation metadata: How to store files for a longer time.
– Metadata about rights: Additional information about the intellectual property
rights.
• Structural Metadata: Relationships between data.
• Markup languages: Integration of metadata for additional structural or semantic features (e.g. XML).
This already shows that metadata is the first step to ensure high quality data. It provides
information about the different data quality dimensions from chapter 2.1, e.g. about time
2.3 Basics about the Semantic Web 6
(Timeliness), format (Validity) and planned usage/description (Relevance) of the data. More
metadata helps evaluating the quality of data.
Standardization is also an important factor in semantic web development. In the last
decades Tim Berners-Lee and the W3C published several standards and recommendations
to facilitate the development of the Semantic Web and Semantic Web applications. In addition, those standardization simplifies the compatibility of different applications. The most
important standards, that will also play a part in this paper, are (Blumauer and Nagy, 2020,
25):
1. The Resource Description Framework (RDF) as a recommended approach to describe
and store metadata.
2. The Resource Description Framework Schema (RDFS) enabled the representation of
the data in the WWW.
3. The Web Ontology Language (OWL) was developed to allow users defining and instantiating web ontologies.
4. The Simple Protocol and RDF Query Language (SPARQL) was developed to retrieve
and manipulate data stored in RDF.
5. The Shapes Constraint Language (SHACL) is used to validate graph-based data against
a set of conditions.
Figure 2.2: The Semantic Web is structured in different layers of formalisms and recommendations. (cf. Kingsley Uyi Idehen, 13.07.2017)
2.4 Structure of Knowledge Graphs 7
Figure 2.2 shows the structure of the Semantic Web and the interaction of the mentioned
recommendations. The next chapters will introduce everything needed to follow this project.
2.4 Structure of Knowledge Graphs
A knowledge graph is a database that integrates data using structures known from the geometric graph theory. “The knowledge graph represents a collection of interlinked descriptions of entities.” (cf. Ontotext.com, 2018). Just like graphs in mathematics, knowledge
graphs consist of nodes and directed edges between those nodes. Those edges connect several nodes with specific properties. Imagine a relational database in form of a table (cf.
Figure 2.3): In a graph database, the first node conforms to the row, the property to the
column and the second node to the value in this row and column. (like in Figure 2.4).
Figure 2.3: What a typical relational database looks like. (Created with Visual Paradigm
Online)
Figure 2.4: How the data of Figure 2.3 “looks like” when stored in a graphical database.
(Created with Visual Paradigm Online)
2.4.1 Resource Description Framework
The Resource Description Framework (RDF) is a standard model for data interchange in
the semantic web. Data in RDF is stored in so-called “triples”, which consist of a subject, a
predicate and an object. When several triples are connected to each other, this is called “RDF
graph” (cf. RDF Working Group (2014)). The advantage of RDF and the storing of data in
those triples is, that it is easy to read by machines as well as humans. RDF statements can
2.4 Structure of Knowledge Graphs 8
be visualized using directed graphs (cf. Figure 2.5) (cf. Blumauer and Nagy, 2020, p. 96ff.).
Subjects and objects are represented as nodes. The predicate is a directed edge connecting
two nodes. There are three different kinds of nodes:
1. Unique Resource Identifier (URI) or International Resource Identifier (IRI)
2. literal nodes
3. blank nodes
Figure 2.5: RDF triple consisting of a subject, a predicate and an object. (Created with
Visual Paradigm Online)
URIs are “Unique Resource Identifiers” that are used to clearly identify a thing or object
in the real world. A URI points to a namespace or vocabulary in which the subject, predicate
or object of the data is unambiguously defined. For example the predicate “hasTitle”: Does
it mean the title of a book or the jobtitle in a company?. Namespaces/Vocabularies were
created to avoid those complications and to differ between words with multiple possible
meanings. IRIs are basically URIs with a wider range of possible characters (e.g Chinese
symbols) that can be used to address an ontology. This is what a typical URI looks like:
"http://xmlns.com/foaf/0.1/". Its structure is very similar to URLs used to address documents
on web servers, because URLs are just special URIs. (cf. DuCharme, 2013, Chapter 2:
URLs, URIs, IRIs and Namespaces)
Literal nodes instead denote a literal value like strings or other data types. In Figure A.1
the unique subject “Person:1337” has three objects that are literal values (cf. Blumauer and
Nagy, 2020, p. 96). The “Person:” in this example is a so-called “prefix” that points to a
specific namespace in which the subject 1337 is uniquely defined and described.
Blank nodes are used to group data. Figure A.2 shows the address of a person stored
using triples without blank nodes (cf. DuCharme, 2013, Chapter 2: URLs, URIs, IRIs and
Namespaces). Figure A.3 shows how a blank node is used to group the postal address of the
person. This helps arranging the graph and improves readability.
The Serialization of RDF triples in an RDF graph data base is necessary to make the data
machine-readable. There are several different formats that can be used to serialize the triples.
Some examples are: Turtle, JSON-LD, N3 or RDF/XML (cf. Blumauer and Nagy, 2020,
2.4 Structure of Knowledge Graphs 9
p. 97f.). Every format has its advantages and disadvantages, but since Turtle is specially
designed for human-understanding, Turtle will be the format used for RDF statements in
this paper. Listing 2.1 shows the serialization of Figure A.1. In the first two lines of the
code the prefixes “eo” and “foaf” get defined. The URIs at the end of the line point to an
vocabulary/namespace where several objects and relations are unambiguously defined. The
storing of data starts in line 3. Whatever or whoever is uniquely defined as “Person1337” in
the example ontology, that is called “eo”, has the relation "has_firstName", which is clearly
defined by the “Friend-of-a-Friend” (foaf) vocabulary, pointing to a literal value of type
string with the content of “James”. Same is with his lastname and his age. In addition, what
is not part of Figure A.1, Subject “Person1337” knows the object that is clearly defined as
“Person1338” in the ontology. A dot always signals the end of a statement. This example
is a bit simplified but it shows how easy it is for humans to understand Turtle and how
the knowledge is stored and connected. The data stored in Listing 2.1 can be easily read
by every non-computer scientist: “Person1337” is called James Parker, is 21 years old and
knows “Person1338”.
1 @prefix eo : < http :// www . exampleOntology . de / exampleOntology #> .
2 @prefix foaf : < http :// xmlns . com / foaf /0.1/ > .
3
4 eo : Person1337 foaf : has_firstName " James " .
5 eo : Person1337 foaf : has_lastName " Parker " .
6 eo : Person1337 foaf : has_age 21 .
7 eo : Person1337 foaf : knows eo : Person1338 .
Listing 2.1: Example for RDF triples in Turtle
It is also possible to assign specific datatypes to the literal values. It is necessary to differ
between a date and an IBAN, although both have to be denoted as a string. Using the XML
Schema Definition (XSD) specification by the W3C, datatypes can get defined in the triples,
like the birthdate of “Person1337” in Listing A.1. The last important feature of RDF statements is the connecting of multiple triples. Obviously it is not very efficient to note down
every relation of a subject separately. RDF allows the definition of several relations for the
same subject by separating the statements with semicolons. The statement of Listing A.1
can get shortened like it’s done in A.2.
2.4.2 Reasoning Data with OWL
Taxonomies are concepts or structures, which are unintentionally used by humans to find
and classify things in hierarchies. In order to make the world more explainable and understandable and to arrange knowledge, things get assigned to other things that belong together
(cf. Blumauer and Nagy, 2020, p. 98ff.).
2.4 Structure of Knowledge Graphs 10
An Example: Scientists divide natural sciences into chemistry, biology, physics etc. Then
those sciences get divided into even more specific sciences, e.g chemistry into organic and
inorganic chemistry etc. Everything is in some way part of a taxonomy created to draw basic
relations between things (cf. Figure 2.6). “A taxonomy is a controlled vocabulary consisting
of preferred terms, all of which are connected in a hierarchy or polyhierarchy.” (ANSI/NISO,
2010, p. 18)
Figure 2.6: Example Taxonomy that shows (roughly) how the natural sciences are divided
into different areas of studies (inspired by (ANSI/NISO, 2010, p. 18)) (Created with Visual
Paradigm Online)
Ontologies are the heart of Semantic Web applications and are used to make knowledge
machine-readable (cf. Landhäußer, n.d., p. 22). “An ontology is a formal, explicit specification of a shared conceptualization” (Studer et al., 1998, p. 25).
• “Formal” means machine-readable.
• “Explicit specification” implies the usage of concepts, attributes and relations.
• It is a “conceptualization” because an ontology is an abstract model of “real world”
phenomenons.
• “Shared” means that the knowledge is coincident and not for private individuals, but
to be accepted by a group
(Source: (Studer et al., 1998, p. 25)). Ontologies are used to give more dimensionality
to a KG (cf. Blumauer and Nagy, 2020, p. 102) by extending its structure and providing
supplementary semantic information for the taxonomies (cf. Hüttenegger, 2006, p. 183).
When a person has a pet of type cat, this implies that the person has a pet of type mammal.
In addition and in contrast to a taxonomy, the ontology delivers the semantic meaning of
the terms “person”, “cat” and “mammal” (cf. Hüttenegger, 2006, p. 183). Figure 2.7
shows another example. It shows an ontology consisting of four concepts/classes and three
instances of those classes that are in different relations to each other. Leonardo DaVinci is
2.4 Structure of Knowledge Graphs 11
a “human” who created the painting of Mona Lisa, a “painting” with values for width and
height, that shows Mona Lisa, who is also a “human”. Because DaVinci drew a “painting”,
he is also an instance of class “painter” (indicated by the red arrow) (cf. Landhäußer, n.d.,
p. 23).
Figure 2.7: Example ontology: The concepts/classes in normal bold; Individuals/Instances
in an italic bold. (inspired by Landhäußer, n.d., p. 23) (Created with Visual Paradigm Online)
OWL stands for “Web Ontology Language” and is a standardized language by the W3C
to describe knowledge about things or groups of things and to define inferences/relations
in datasets. An OWL document is nothing else than an ontology (cf. OWL Working Group,
2012). OWL is property oriented and builds on RDFS. It allows defining domains and ranges
and creating classes and subclasses. In addition OWL supports existence and cardinality
constraints, so that the user can e.g. say that every person in the dataset must have exactly
one biological mother. As mentioned when explaining taxonomies, those relationships are
necessary to express and understand knowledge. OWL also enables transitive, inverse or
symmetrical relations. For Example: The user can define that the relation “isPartOf” is the
opposite of the relation “hasPart” (inverse) and that the relation “touches” counts in both directions (symmetrical). To define the ontologies with OWL, Turtle can be used again. Listing
2.2 shows the definition of a class “Musician”. Using the RDF and RDFS vocabularies, the
“musician” gets assigned to the type of “class”. In addition a label and a comment/description of the class is defined. The class “MusicalInstrument” is created as well as the property
“playsInstrument”. This property has a domain pointing to the Musician class and a range
pointing to the MusicalInstrument class. Because of this connection, whenever a person has
the property “playsInstrument”, this person will automatically become a musician (by running an inference procedure enabling this kind of reasoning) and the object this person uses
to make music will automatically become a musical instrument. Note that “a” (like in line
10) is just short for “rdf:type”.
2.5 Using SPARQL to access a Knowledge Graph 12
1 @prefix eo : < http :// www . exampleOntology . de / exampleOntology # >.
2 @prefix rdf : < http :// www . w3 . org /1999/02/22 - rdf - syntax - ns # > .
3 @prefix rdfs : < http :// www . w3 . org /2000/01/ rdf - schema #> .
4
5 eo : Musician
6 rdf : type rdfs : Class ;
7 rdfs : label " Musician " ;
8 rdfs : comment " Someone who plays a musical instrument " .
9 eo : MusicalInstrument
10 a rdfs : Class ;
11 rdfs : label " Musical instrument " .
12 eo : playsInstrument
13 rdf : type rdf : Property ;
14 rdfs : comment " Identifies the instrument that someone plays " ;
15 rdfs : label " plays instrument " ;
16 rdfs : domain eo : Musician ;
17 rdfs : range eo : MusicalInstrument .
Listing 2.2: Creating classes and properties with OWL
2.5 Using SPARQL to access a Knowledge Graph
SPARQL stands for “Simple Protocol and RDF Query Language” and is, as the name implies, a language to create queries to select specific data out of RDF-based graphs. SPARQL
allows the user to filter the database. Tim Berners-Lee, the inventor of HTML, founder of
the WWW and director of the W3C said: “Trying to use the Semantic Web without SPARQL
is like trying to use a relational database without SQL” (cf. W3C, 2008). If the user wants
specific data that meets specific conditions, they can use SPARQL queries with different
commands to filter out exactly the data they is searching for. In Listing 2.3 SPARQL, is used
to find the name of everyone Subject “P1337” knows (cf. Listing 2.1). Everyone familiar
with relational databases and “Structured Query Language (SQL)” will recognize the similarities between SPARQL and SQL. The “SELECT” statement defines the variables which
will be part of the final output and the “WHERE” statement contains the triple patterns to
match. A question mark always indicates a variable. The result of the “SELECT” query is
a table. Each selected variable becomes a column and each matched pattern becomes a row.
Listing A.3 shows the data to work with in following examples. Listing 2.3 shows a typical
SPARQL query. The SELECT statement declares the variables that will contain the values,
which the query will generate as the output. The output will be the first name and last name
of every person who knows P1338.
1 PREFIX eo : < http :// www . exampleOntology . de / exampleOntology # >
2 PREFIX d: < http :// www . ownOntology . de / data # >
3 PREFIX foaf : < http :// xmlns . com / foaf /0.1/ >
4
2.5 Using SPARQL to access a Knowledge Graph 13
5 SELECT ? first ? last
6 WHERE {
7 ? person foaf : knows d: P1338 .
8 ? person eo : has_firstName ? first .
9 ? person eo : has_lastName ? last .
10 }
Listing 2.3: A simple SPARQL query to filter the first name and last name of everyone who
knows P1338 in the dataset (Listing A.3)
The final output of Listing 2.3 is:
Table 2.1: Result of Listing 2.3
first last
"James" "Parker"
"Jim" "Hammilton"
SPARQL is a simple yet very powerful tool to query and filter huge graph data bases. Additional important SPARQL keywords to follow this project are:
• FILTER - to implement supplementary conditions, e.g. to filter every Person born
before a specific date.
• CONCAT - to concatenate two or more variables together.
• BIND - to give value(s) an alias using the keyword “AS”, e.g. to store the “firstName”
and “lastName” AS “fullName” after concatenating them. “AS” can also be used without “BIND” to create a new variable after arithmetic operations, like adding different
prices together and storing them AS “totalPrice”.
• CONSTRUCT - is a query form (meaning an alternative for the SELECT keyword)
that returns triples by pulling them out of a data source without changing them. The
values in those datasets can be used to create new triples. That is why SPARQL can
be used to copy, create and convert data stored in RDF triples, which is very important
for the review of the Astrea-Tool in Section 3.2.
• OPTIONAL - to return a value only if it exists, e.g. used to search for former incidents,
but only in case there are any. If none are found for a subject, no error will occur.
• VALUES - to directly write or add data into a pattern or query. It allows specifying
multiple variables in a data block.
• URI - converts a string into a URI.
Sources: (DuCharme, 2013, p. 47-182 (Chapters 3-5)) and the official SPARQL documentation Harris et al. (2013). There are many more possible keywords to use in SPARQL queries,
but in order to understand this project paper, the ones mentioned above are the only ones
needed to know. In the appendix, there is a table where all these statements are summarized
again (cf. Table A.1).
2.6 Advantages of Knowledge Graphs as data bases 14
2.6 Advantages of Knowledge Graphs as data bases
Transforming explicit knowledge into implicit knowledge is only one big advantage of KGs.
They are the perfect tool to link data in enterprise management systems and can be used in
many different scenarios like (cf. Blumauer and Nagy, 2020, p. 21f.):
• Searching the Web (Google, Bing, Maps)
• Crawl for product information (Amazon or other retailers)
• Smart Assistants (Siri, Echo, Cortana)
• Science Applications:
– data exploration, data searching
– finding buried connections in data
– Analysis
– Machine Learning
Since the SPARQL queries can provide different methods of converting heterogeneous data,
KGs can facilitate data integration from multiple sources and domains. It’s not difficult to
transform relational data from different sources into triples, which are then stored in a KG.
Those new triples and the KGs can get merged by comparing the data and drawing new
connections (cf. Blumauer and Nagy, 2020, p. 69f.). The performance of graph databases
like KGs and its queries remains relatively constant (or rather proportional) as datasets get
bigger, while queries of relational databases tend to perform slower. In addition, graph
databases are additive and easy to extend without any interference (cf. Robinson et al., 2015,
p. 8f.). Another advantage of KGs is the overcoming of so-called “data silos”. Data silos
inhibit productivity in companies and cause wasted resources, because only a specific group
of people can fully access a set of data. When using KGs, replacing and migrating data
becomes unnecessary. Instead data integration and linking of data get focused. This is done
by using already existing data models to build semantic knowledge models, like ontologies.
Those semantic solution approaches combine the benefits of data lakes and data warehouses
and exactly mirror the ideas and interests of the semantic web (cf. Blumauer and Nagy,
2020, p. 33f.). It is the data that matters, not the databases. The connection of data creates a
data-centric knowledge foundation.
2.7 Already existing Knowledge Graphs
World Knowledge Graphs do not focus on a single field of knowledge. Instead they try
to gather and connect all knowledge of the whole world. Examples for this kind of KGs are
the Google Knowledge Graph, Wikidata or DBpedia. A company or even a private person
could use subsets of those graphs containing relevant information for their concerns. World
2.7 Already existing Knowledge Graphs 15
Knowledge Graphs often provide useful information about general topics, like geographic
information, that can be included in someone’s own KGs. (cf. Blumauer and Nagy, 2020, p.
106f.)
Domain Knowledge Graphs are already existing KGs for specific domains like (cf. Blumauer and Nagy, 2020, p. 107f.):
• Business & Finance
• Pharmacy & Medicine
• Cultural Heritage
• Sustainable Development
• Geographic Information
The medical sector is a pioneer in knowledge graph development (cf. Blumauer and Nagy,
2020, p. 109f.) and for a researching pharmacy company like BI this domain could be of
special interest. Using the web page of the Ontology Lookup Service (OLS) it is possible to
gain access to the latest ontologies of the Pharmacy & Medical domain like the:
• Chemical Entities of Biological Interest Ontology (ChEBI)
• SNOMED Clinical Terms (SNOMED CT)
• Gene Ontology
Commonly used Vocabularies & Namespaces
In chapter 2.4 the so-called “prefixes” were introduced. Those prefixes point to a specific
namespace or vocabulary (in form of an ontology) which support the uniqueness of labels
and objects. Reminder: Words like “title” can have different meanings in a different context.
Table A.2 contains the most used vocabularies and their (common) prefixes. In addition,
the Linked Open Vocabularies (LOV), a huge online collection of the biggest and most frequent used vocabularies and namespaces in the web, is a good source to get to know more
ontologies (https://lov.linkeddata.es/dataset/lov/).
3 Realisation of the Proposed Concept 16
3 Realisation of the Proposed Concept
3.1 The Shapes Constraint Language (SHACL)
SHACL stands for “Shapes Constraint Language” and is a standardized language by the
W3C to validate datasets and its individuals by creating so-called “shapes”, using already
existing ontologies. Those shapes are applied to a set of data to verify its quality. In this
case, quality means that the data fits every criteria, form and aspect, the user wants the data
to fulfill. Just because a set of data is validated by SHACL, it does not mean that the data
is automatically “high quality”. The data is just in the correct, user-demanded shape. In
contrast to OWL, SHACL is there for validating data instead of inferencing data.
An Example: If the shape defines that every individual of type person has to have exactly
one integer value for the property age and this constraint should be violated, an exception
will occur.
3.1.1 Shapes
are conjunctions of constraints that the targets must satisfy. Shapes are distinguished in
“Node Shapes” and “Property Shapes”. Node Shapes declare constraints directly on a node
while Property Shapes declare them on the property connected to the node through a “path”.
A path is a sequence of edges connecting properties to nodes. Listing 3.1 shows a typical
node shape of a vegetarian pizza in a pizza shop. In Listing 3.2 a property shape is defined in
lines 4-7. The property shape inside the “RealItalianPizzaShape” demands that the base of
this pizza has to be of type “ThinAndCrispyBase”. In case there is a “RealItalianPizza” in the
dataset which does not have the value “ThinAndCrispyBase” for the “hasBase” predicate, an
error will occur, since the data does not fit the SHACL shape.
1 pizza : VegetarianPizzaShape
2 rdf : type sh : NodeShape ;
3 sh : targetClass pizza : VegetarianPizza ;
4 sh : nodeKind sh : IRI .
Listing 3.1: A typical node shape
1 pizza : RealItalianPizzaShape
2 a sh : NodeShape ;
3 sh : nodeKind sh : IRI ;
4 sh : property [ a sh : PropertyShape ;
5 sh : class pizza : ThinAndCrispyBase ;
6 sh : path pizza : hasBase
7 ] ;
8 sh : targetClass pizza : RealItalianPizza .
3.1 The Shapes Constraint Language (SHACL) 17
Listing 3.2: A typical property shape which is inside of a node shape
3.1.2 Targets
are used to specifically select certain nodes which have to be validated. It is important to
differentiate between the kinds of targets, in order to understand SHACL:
1. Node targets - targets a specific node in the graph (sh:targetNode).
For Example: Validates every triple that exactly targets the node: “SalamiPizza”. (cf.
Listing A.4)
2. Class targets - targets a specific class (sh:targetClass)
For Example: Validates every triple that targets every node that is of class “Pizza”. (cf.
Listing A.5)
3. Subjects-of-targets - targets every subject of a specific property (sh:targetSubjectsOf)
For Example: Validates every subject of a triple that has the specific predicate/property
“has_ingredient” (cf. Listing A.6)
4. Objects-of-targets - targets every object of a specific property (sh:targetObjectsOf)
For Example: Validates every object of a triple that has the specific predicate/property
“has_ingredient” (cf. Listing A.7)
3.1.3 Patterns
(sh:pattern) are used to apply shapes on triples that fulfill specific criteria. For Example:
Listing 3.3 shows a set of triples, that assign names to subjects of an ontology. The SHACL
shape in Listing 3.4 is applied to every triple, where “eo:has_firstName” is the predicate and
where the object follows the pattern “J” (starting with the letter “J”), which are the statements
in line 3 and 5 of Listing 3.3.
1 PREFIX eo : < http :// www . exampleOntology . de / exampleOntology # >
2
3 eo : P1337 eo : has_firstName " James " .
4 eo : P1338 eo : has_firstName " Marry " .
5 eo : P1339 eo : has_firstName " Jim " .
Listing 3.3: RDF statements that connect subjects to first names
1 PREFIX eo : < http :// www . exampleOntology . de / exampleOntology # >
2 PREFIX sh : < http :// www . w3 . org / ns / shacl # >
3
4 eo : NameWithJExample
5 a sh : NodeShape ;
3.1 The Shapes Constraint Language (SHACL) 18
6 sh : targetNode eo : P1337 , eo : P1338 , eo : P1339 ;
7 sh : property [
8 sh : path eo : has_firstName ;
9 sh : pattern "^J" ; # apply for every first name starting with J
10 ] .
Listing 3.4: A SHACL shape that is applied on every triple declaring a first name starting
with J using “sh:pattern”
3.1.4 Validation
Only the lines 1 and 3 of Listing 3.5 are validated by the shape defined in Listing 3.1, because
the node kind of the statement in line 5 is an integer, not an IRI (like the shape demands it).
In line 3 a URI / IRI points to the node, in line 1 the Margarita is an “instance” of this URI /
IRI. That’s why line 1 is also a valid statement. Remember that “a” stands for “rdf:type”.
1 pizza : Margarita a pizza : VegetarianPizza .
2
3 < www . pizza . de / margarita > a pizza : VegetarianPizza .
4
5 :5 a pizza : VegetarianPizza .
Listing 3.5: Only the RDF statements in line 1 and line 3 are valid according to Listing 3.1
Using SHACL shapes, it is also possible to filter the data and to put special constraints on it.
Cardinality constraints, like the “minCount” or “maxCount” constraint, for example. They
can be put on classes or nodes to say that every pizza must have at least one topping and can
have a maximum of four toppings.
The basics that were just explained is everything needed to understand this paper. In case
something new appears, it will be explained in the corresponding chapter.
3.1.5 SHACL vs. ShEx
Shape Expressions (ShEx) is a data modeling language developed to validate and describe
Resource Description Frameworks, just like SHACL does. So why use SHACL instead of
ShEx? Since they were developed to fulfill the same tasks, SHACL and ShEx have many
similarities. Nevertheless, there are some differences. Although both languages work with
shapes and constraints and both have many features and syntactic rules in common (cf. Labra
Gayo et al., 2018, Chapter 7.1: ’Common Features’ & Chapter 7.2: ’Syntactic Differences’),
they differ in their foundation and their way of how they work. While SHACL works with
constraints to validate RDF graphs by checking if the constraints are satisfied, ShEx can
rather be understood as a grammatic schema, that describes a RDF graph. This means that
the validation results of ShEx look different than SHACL’s results. It shows which nodes
3.2 Astrea-Tool: Generation of Shapes 19
and shapes were matched (in form of an annotaded graph), instead of clearly depicting which
constraint has been violated and why. SHACL’s error description is way more detailed and
precise. It is easier to detect and fix issues of RDF graphs using SHACL (cf. Labra Gayo
et al., 2018, Chapter 7.3: ’Foundation: Schema vs. Constraints’). This is exactly what BI
wants to do with its knowledge graphs. That is why SHACL is preferred to ShEx.
3.2 Astrea-Tool: Generation of Shapes
“Astrea” is an open source tool that uses own KG mappings consisting of SPARQL construct queries to generate SHACL shapes automatically for a set of ontologies. Astrea uses
the “Astrea-KG”, that contains 158 mappings, all relating a different ontology constraint
pattern with an equivalent SHACL constraint pattern (cf. Andrea Cimmino et al., n.d., p.
498). The implemented SPARQL queries consist of CONSTRUCT statements containing
the SHACL construct patterns and WHERE statements containing the ontology construct
patterns. The ontology patterns get recognized and translated into the equivalent SHACL
patterns (cf. Andrea Cimmino et al., n.d., p. 500f) using the Astrea-KG and the mappings
defined in the “Queries.csv” file located in the “material” directory of the Astrea-Tool. Every
mapping in the CSV file (Comma-Separated-Values) consists of nine columns:
• The IMPLEMENTED and the ORDER columns are not relevant to understand the tool,
since the values of those are always the same. (just metadata for the mapping)
• The TOPIC column classifies, what the mapping is about to do, like “Class definiton”
or “Object Property definition”.
• The OWL CONSTRUCT column contains one or more ontology construct patterns
from OWL, RDFS and XSD specifications. The SHACL CONSTRUCT column contains the equivalent SHACL construct patterns. (cf. Figure 3.1)
• The SHACL CONSTRUCT TYPE column (additional metadata for the mapping)
• The GRAPH PATTERN SOURCE and GRAPH PATTERN TARGET columns contain
the patterns, that will get recognized (either the ontology pattern with OWL/RDFS/XSD statements, or the SHACL pattern). The mapping works in both directions.
(cf. Figure 3.1)
• The QUERY column, that contains the SPARQL query executing the mapping/translation (cf. Figure 3.1)
Figure A.4 in the appendix shows how the mapping by Astrea works in detail. Since it
would be to much to review every line of code (over 4000 lines), chapter A.1 in the appendix
contains some code examples of following queries.
3.2 Astrea-Tool: Generation of Shapes 20
Figure 3.1: Simple overview of how Astrea works (Andrea Cimmino et al., n.d., p. 503).
3.2.1 Generating Shapes using Queries
Listing 3.6 shows the query of a “Restriction Pattern”, the type of pattern that appears most
frequently (77 times) in the code. Other important types will also be explained here, but their
code examples can only be found in the appendix.
1 PREFIX rdfs : < http :// www . w3 . org /2000/01/ rdf - schema #>
2 PREFIX sh : < http :// www . w3 . org / ns / shacl #>
3 PREFIX rdf : < http :// www . w3 . org /1999/02/22 - rdf - syntax - ns #>
4 PREFIX xsd : < http :// www . w3 . org /2001/ XMLSchema # >
5
6 CONSTRUCT {
7 ? shapeUrl a sh : PropertyShape ;
8 sh : pattern ? restrictionPattern .
9 } WHERE {
10 ? property a ? propertyType .
11 VALUES ? propertyType { owl : DatatypeProperty rdfs : Datatype }
12 ? property owl : withRestrictions ? restrictionsList .
13 ? restrictionsList rdf : rest */ rdf : first ? restrictionElement .
14 OPTIONAL { ? restrictionElement xsd : pattern ? restrictionPattern .
}
15 FILTER (! isBlank (? property )) .
16 BIND ( URI ( CONCAT (’https :// astrea . linkeddata .es/ shapes #’, MD5 ( STR (?
property )))) AS ? shapeUrl ) .
17 }
Listing 3.6: A Query of type “Pattern Restriction”
Reminder: The CONSTRUCT statement contains the SHACL pattern and the WHERE statement the ontology pattern. The query in Listing 3.6 creates a SHACL Property Shape with
a “sh:pattern” restriction, which specifies a regular expression, that the value node needs to
match. The “restriction pattern(s)” come from the ontology, as can be seen in the WHERE
3.2 Astrea-Tool: Generation of Shapes 21
statement. Table A.1 in the appendix compactly summarizes the most important SPARQL
statements mentioned earlier for refreshing. Other Astrea-Queries:
Object Property Definition
• appear seven times (second most often) in the mappings.
• code example can be found in Listing A.8 .
• Those queries create SHACL property shapes to ensure that their node kind is “BlankNodeOrIRI”. Property shapes obviously can’t be literals, because they represent relations between subjects and objects. The properties have to be unambiguously defined,
either using an IRI or a blank node as a placeholder.
Comment Annotations
• code example can be found in Listing A.9
• used to create SHACL shapes containing descriptions (or alternative labels) for entities
Label Annotations
• code example can be found in Listing A.12
• Looks very similar to Comment Annotations. Here the shape contains the name of
entities. Both the “Label annotation” and the “Comment annotation” use “rdfs:label”
in the CONSTRUCT query. This is because the “rdfs:label” can be used in both ways
and to express names as well as descriptions or comments.
3.2.2 Functionality of Astrea
Finally, Figure 3.2 illustrates the functionality of Astrea in six steps. (cf. Andrea Cimmino
et al., n.d., p. 504f)
1. Ontology Manager is fed with a set of ontology URLs as input.
2. Ontology Manager checks the owl:import statements.
3. Ontology Manager downloads the referenced ontologies.
4. Ontology Manager sends the downloaded ontologies to the KG-Manager.
5. KG-Manager reads the mappings of the Astrea-KG.
6. KG-Manager produces an RDF graph containing SHACL shapes fitting to the ontology
construct mappings encoded in the CONSTRUCT query for every ontology sent by the
Ontology Manager. An RDF graph containing all SHACL shapes is returned.
3.3 RDFUnit Testing Suite: Generation of Test Cases 22
Figure 3.2: Architecture and Functionality of the Astrea-Tool
Since Astrea only takes ontologies into account where no instances are expected in the data,
restrictions referring to those instances are not supported by the tool. In addition Astrea does
not support restrictions that require a practitioner to establish them. (cf. Andrea Cimmino
et al., n.d., 506f.) For more and very detailed information visit the official documentation
of the Astrea-Tool and its github repository or read the official paper explaining the backgrounds of the tool. (cf. Andrea Cimmino et al., n.d.) Accordingly, modifying Astrea and
its mappings is easily done by adjusting the SPARQL queries in the “Queries.csv” file.
3.3 RDFUnit Testing Suite: Generation of Test Cases
The project team of the research group “Agile Knowledge Engineering and Semantic Web
(AKSW)” that developed “RDFUnit” describes the tool as a “test driven data-debugging
framework that can run automatically generated (based on a schema) and manually generated
test cases against an endpoint” (Auer et al. (2014)). They use SPARQL queries to execute
the test cases with pattern-based transformation.
3.3.1 Terminology & Basic Notions
• Test cases are data constraints consisting of one or more triples. In addition, “a test
case is an input on which the program [or dataset; author’s note] under test is executed
during testing.” (cf. Zhu et al., 1997, p. 3).
• Test suites (or test sets) are sets of test cases to apply to a dataset.
• The status of a test case or suite can be Success, Fail or Error.
3.3 RDFUnit Testing Suite: Generation of Test Cases 23
• A Data Quality Test Pattern (DQTP) can be understood as a tuple consisting of a
set of typed pattern variables and a SPARQL query template with placeholders for
those variables. DQTPs can be used to compare the values of multiple properties. (cf.
Listing A.13)
• Test Pattern Bindings are valid instances of a DQTP. They are triples consisting of a
variable mapping, a SPARQL query template and an error classification.
• Data Quality Test Cases are formed when the mappings of the pattern binding are
applied to the SPARQL query template. This creates an executable SPARQL query that
returns results. Results can be: Success (empty result), violation (results are returned)
or a timeout (test is marked for further inspection). (cf. Listing A.14)
• A Test Auto Generator (TAG) converts RDFS/OWL axioms/schemes into concrete
test cases. TAGs consist of two parts: The detection part querying against a schema
(cf. Listing A.15) and an execution part instantiating a test case from the respective
pattern (cf. Listing A.16).
Sources: Auer et al. (2014) & (Kontokostas et al., 2014, p. 2f)
3.3.2 Functionality of RDFUnit
Figure 3.3 illustrates the operating principle of RDFUnit. Test cases can be generated using
already existing RDFS/OWL schemes as input for the TAGs. RDFUnit has additional methods and features that can automatically improve a schema to create further test cases (But
those test cases are explicitly labeled, so that everybody knows, that those are less reliable
than other test cases). Integrated in RDFUnit is a pattern library that enables reusing test
cases when vocabularies/namespaces are detected that were already used before. The pattern
library of RDFUnit contains 17 DQTPs that can be applied to the shapes that appear most
often in SHACL or OWL. Table A.3 contains every pattern together with a description and
an example binding (Kontokostas et al., 2014, p. 3ff.). The team of AKSW created 32293
total unique test cases using 297 LOVs (cf. Kontokostas et al., 2014, p. 5).
The Inspection Report, created after RDFUnit is finished, provides information about
the generated test cases: How many cases passed, how many cases failed or timed out and
how many errors occurred in general. This inspection report clearly depicts the quality of
the tested datasets. Less errors imply a better dataset. Or at least a dataset that fits the given
shape. It also shows which axioms cause the most errors. But only because a dataset contains
many errors or violations it is not straightforward of "low quality". The user still has to look
up what has caused the errors. Sometimes, only a few missing statements in disadvantageous
positions that are demanded by some vocabularies can cause millions of consequential errors
3.3 RDFUnit Testing Suite: Generation of Test Cases 24
Figure 3.3: Flowchart of the RDFUnit functionality. Left: Different possible input sources.
Middle: Different ways to instantiate the patterns. Right: Different Data Quality Test Cases.
(Kontokostas et al., 2014, p. 4)
that build up on each other (cf. Kontokostas et al., 2014, p. 8). The Inspection Report looks
like in Figure 3.4. It shows the header of every test report summarizing the most important
numbers. The header shows the link to the dataset that has been tested, timecodes, how
many test cases were executed and what their results were. In case there are violations in the
dataset, those will be displayed tabularly beneath the header. This table has four columns
and looks like Table 3.1.
Figure 3.4: Summary of an inspection report by RDFUnit.
The 1st column contains the type of violation that occurred. The 2nd one the message that
belongs to the corresponding violation. The “Resource” column shows exactly which object
of the dataset caused the violation and the “Test Case” column shows in which of the test
cases that were generated the violation appeared. Every violation is explained, traceable and
thus rectifiable.
3.4 Data Structure at Boehringer Ingelheim 25
Table 3.1: An example Inspection Report by RDFUnit
Level Message Resource Test Case
<Violation> <Reason of Violation> <Link to Source of Violation> <Test ID>
ERROR Value does not match pattern https://ontology.com/object1 URN:1
WARNING Expected Value missing https://ontology.com/object2 URN:1
TIMEOUT Can’t connect to URI https://ontology.com/object3 URN:1
3.4 Data Structure at Boehringer Ingelheim
Boehringer Ingelheim has an intern graph database accessible for every employee who has
the necessary rights by the URI “https://data.boehringer.com/ontology/”. In the
following examples the “substance” ontology part of the database will be used to explain the
data structure and how Astrea and RDFUnit can work together to verify the data at BI.
Example RDF Triples of the BI intern ontology:
1 @prefix sub : < https :// data . boehringer . com / ontology / substance / >.
2 @prefix rdf : < http :// www . w3 . org /1999/02/22 - rdf - syntax - ns # > .
3 @prefix owl : < http :// www . w3 . org /2002/07/ owl #> .
4 @prefix rdfs : < http :// www . w3 . org /2000/01/ rdf - schema #> .
5 @prefix xsd : < http :// www . w3 . org /2001/ XMLSchema #> .
6 @prefix sh : < http :// www . w3 . org / ns / shacl #>
Listing 3.7: Consider those prefixes to be active in every following listing of this section.
1 sub : C1 a owl : class .
2 sub : C2 a owl : class .
3
4 sub : A2 a owl : FunctionalProperty , owl : DatatypeProperty .
5 sub : A3 a owl : FunctionalProperty , owl : DatatypeProperty .
6
7 sub : R1 a owl : ObjectProperty , owl : AsymmetricProperty , owl :
IrreflexiveProperty .
8 sub : R2 a owl : ObjectProperty , owl : AsymmetricProperty , owl :
IrreflexiveProperty .
Listing 3.8: Examples for creating classes & types. Reminder: “a” stands for “rdf:type” !
Listing 3.8 shows six different subjects.
• C1 and C2 are substances that get assigned to the rdf:type of “class”. Listing 3.9 labels
them “Substance” and makes them a subclass of “owl:Thing”.
• A2 and A3 are FunctionalProperties & DatatypeProperties. DatatypeProperties are
subclasses of FunctionalProperties, making their declaration redundant. A2 and A3
assign “ChEMBL codes” (codes for a chemical database) to classes (cf. Listing 3.9
lines 5-8)
3.4 Data Structure at Boehringer Ingelheim 26
• Subjects R1 and R2 are asymmetric, irreflexive ObjectProperties used to depict molecule
heritages (cf. Listing 3.9 lines 10-13).
1 sub : C1 rdfs : comment "Any matter of defined composition that has discrete
existence , whose origin may be biological , mineral or chemical ." @en .
2 sub : C1 rdfs : label " Substance "@en .
3 sub : C1 rdfs : subClassOf owl : Thing .
4
5 sub : A2 rdfs : comment " ChEMBL codes identifies the substances depicted by
the ChEMBL DB."
6 sub : A2 rdfs : label " has ChEMBL code " @en .
7 sub : A2 rdfs : domain sub : C2 .
8 sub : A2 rdfs : range xsd : string .
9
10 sub : R1 rdfs : comment " Parent molecule of its alternative form ." @en .
11 sub : R1 rdfs : label " has parent molecule " @en .
12 sub : R1 rdfs : domain sub : C2 .
13 sub : R1 rdfs : range sub : C2 .
Listing 3.9: BI internal triples that further describe the subjects C1, A2 and R1 in the dataset.
4 Final Results & Outlook to the Future 27
4 Final Results & Outlook to the Future
4.1 Verification of Data Quality
4.1.1 Appyling Astrea to BI data
Listing 4.1 shows the SHACL shape (or schema) of the C1 subject which is generated by the
Astrea tool when applied to the set of data in Listings 3.8 and 3.9. The SHACL shapes for
the subjects A2 and R1 from the former chapter can be found in the appendix (cf. Listings
A.17 and A.18).
1 < https :// astrea . linkeddata . es / shapes #68 cdee7760285c160898001ac30720b0 >
2 a sh : NodeShape ;
3 rdfs : label " Substance " @en ;
4 rdfs : seeAlso " https :// schema . org / Substance "^^ xsd : anyURI ;
5 sh : description " Any matter of defined composition that has
discrete existence , whose origin may be biological , mineral
or chemical ." @en ;
6 sh : name " Substance "@en ;
7 sh : nodeKind sh : IRI ;
8 sh : property < https :// astrea . linkeddata . es / shapes #8
f9c4d0490ddd476d5d86b9b49a14653 > ;
9 sh : targetClass < https :// data . boehringer . com / ontology / substance /
C1 > .
Listing 4.1: SHACL Shape for Subject C1 of Listings 3.8 and 3.9
4.1.2 Running RDFUnit with BI data shapes
After Astrea generated the SHACL shapes corresponding to the “substance” ontology part of
Boehringer Ingelheim’s database, RDFUnit accordingly created an inspection report. This
report shows no violations in the dataset. The header of the report looks like Figure 3.4.
All test cases passed without any error. The corresponding table that shows the violations in
detail is consequently empty.
4.2 Workflow-Concept of Astrea & RDFUnit
Figure 4.1 shows how the process of data validation looks like if the tools are split. An
employee has to create the shapes using the online tool of Astrea manually. Then they have
to put the shapes into a specific folder before they execute RDFUnit with the necessary
parameters (link to the ontology, path to shapes etc.). This is the process that was used for
the section above. If necessary, an employee can manually edit the shapes as well (e.g. to
fix small errors caused by trivial details). Astrea and RDFUnit work together to generate a
report, that depicts irregularities and errors in the tested datasets.
4.2 Workflow-Concept of Astrea & RDFUnit 28
Figure 4.1: Toolchain of the manual concept for data quality testing at BI currently. (Created
with Visual Paradigm Online)
Figure 4.2 shows how the process looks like after connecting the tools. If they have been
installed correctly, those tools can simply be combined by executing several commands in
a console, since Astrea can easily be imported as a library into other Java projects (like
RDFUnit). In Astrea’s documentation is a simple explanation of how to use the tool. A
model has to be created which needs the URI to the ontology that the tool shall generate the
shapes of. RDFUnit then only needs to know where to find the shapes and the dataset to
validate. Listing 4.2 shows the final command to run RDFUnit.
1 $ java - jar rdfunit - validate -0.8.24 - SNAPSHOT - standalone . jar
2 -d < LINK TO DATASET >
3 -e < LINK TO ONTOLOGY >
4 -eu < username > -ep < password >
5 -s < LINK TO SCHEMAS ( SHAPES ) >
6 -r shacl
Listing 4.2: Command to run RDFUnit in Linux.
4.3 The Effect to Boehringer Ingelheim 29
Figure 4.2: Toolchain of a possible automated concept for data quality testing. (Created
with Visual Paradigm Online)
4.3 The Effect to Boehringer Ingelheim
This project paper summarizes and explains the basics about knowledge graphs and the semantic web. It shows how graph-based data stores work and what their advantages are. This
topic is not completely new to Boehringer Ingelheim. There are already huge ontologies
and RDF-datasets existing in the company. Despite that, this topic is still very new. The
tools showcased in this project paper enable an easy verification of data from BI or other
companies. By combining the tools as mentioned, it is possible to execute the automated test
cases for every RDF-based dataset with only a few parameter adjustments. Since BI is a researching company, new data, new information and new knowledge is going to be generated
continuously. Astrea and RDFUnit are going to play an important part in validating data that
is going to be added to BI’s knowledge graph.
4.4 The next steps
The next steps in this development are going to be mainly about two things:
1. Implement traceability to comprehend the results of Astrea & RDFUnit.
2. Modify & adjust to the tools to get rid of misleading violations and to adapt to BI
structures.
4.4 The next steps 30
At the moment, the internal functionality of Astrea and RDFUnit is not very transparent. It
is not easy to reason why a shape looks like it does or why and how RDFUnit interprets
something as an error or a warning. The tools help verifying data quality, implying that
the data also becomes trustworthy, but now it is the tools that are not trustworthy enough to
completely rely on them. A company like Boehringer Ingelheim or other global companies
should be able to give suitable reasons for their statements and actions. Astrea and especially
RDFUnit have to get analyzed very deeply and probably have to undergo many modifications
before BI can go the next steps of creating a user-friendly framework that enables a simple
usage for everyone. This project paper lies the foundation for such analyses and future
projects.
"""

In [None]:
# mit Allem: 99% Scientific
# ohne Anhang & Vezeichnisse: 99% Scientific

In [None]:
t= """Humans are social beings. Whether we like it or not, nearly everything we do in our lives takes place in the company of others. Few of
our activities are truly solitary and scarce are the times when we are
really alone. Thus the study of how we are able to interact with one
another, and what happens when we do, would seem to be one of the
most fundamental concerns of anyone interested in human life. Yet
strangely enough, it was not until relatively recently – from about the
beginning of the nineteenth century onwards – that a specialist interest in this intrinsically social aspect of human existence was treated
with any seriousness. Before that time, and even since, other kinds
of interests have dominated the analysis of human life. Two of the
most resilient, non-social approaches to human behaviour have been
‘naturalistic’ and ‘individualistic’ explanations.
Rather than seeing social behaviour as the product of interaction,
these theories have concentrated on the presumed qualities inherent
in individuals. On the one hand, naturalistic explanations suppose
that all human behaviour – social interaction included – is a product
of the inherited dispositions we possess as animals. We are, like animals,
biologically programmed by nature. On the other hand, individualistic
explanations baulk at such grand generalizations about the inevitability of behaviour. From this point of view we are all ‘individual’ and
‘different’. Explanations of human behaviour must therefore always
rest ultimately on the particular and unique psychological qualities
of individuals. Sociological theories are in direct contrast to these
2 An Introduction to Sociological Theories
‘non-social’ approaches. Looking a little closer at them, and discovering
what is wrong or incomplete about them, makes it easier to understand
why sociological theories exist.
Naturalistic theories
Naturalistic explanations of human activity are common enough. For
example, in our society it is often argued that it is only natural for
a man and a woman to fall in love, get married and have children.
It is equally natural for this nuclear family to live as a unit on their
own, with the husband going out to work to earn resources for his
dependants, while his wife, at least for the early years of her children’s
lives, devotes herself to looking after them – to being a mother. As
they grow up and acquire more independence, it is still only ‘natural’
for the children to live at home with their parents, who are responsible for them, at least until their late teens. By then it is only natural
for them to want to ‘leave the nest’, to start to ‘make their own way in
the world’ and, in particular, to look for marriage partners. Thus
they, too, can start families of their own.
The corollary of these ‘natural’ practices is that it is somehow unnatural not to want to get married, or to marry for reasons other than
love. It is equally unnatural for a couple not to want to have children,
or for wives not to want to be mothers, or for mothers not to want to
devote the whole of their lives to child-rearing. Though it is not right
or natural for children to leave home much younger than eighteen,
it is certainly not natural for them not to want to leave home at all
in order to start a family of their own. However, these ‘unnatural’
desires and practices are common enough in our society. There are
plenty of people who prefer to stay single, or ‘marry with an eye on
the main chance’. There are plenty of women who do not like the idea
of motherhood, and there is certainly any number of women who do
not want to spend their lives solely being wives and mothers. There
are plenty of children who want to leave home long before they are
eighteen while there are many who are quite happy to stay as members of their parents’ households until long after that age.
Why is this? If human behaviour is, in fact, the product of a disposition inherent in the nature of the human being then why are such
deviations from what is ‘natural’ so common? We can hardly put
down the widespread existence of such ‘unnatural’ patterns of behaviour to some kind of large-scale, faulty genetic programming.
In any case, why are there so many variations from these notions
of ‘normal’ family practices in other kinds of human societies? Both
An Introduction to Sociological Theories 3
history and anthropology provide us with stark contrasts in family life.
In his book on family life in Medieval Europe, Centuries of Childhood
(1973), Philippe Ariès paints a picture of marriage, the family and
child-rearing which sharply contradicts our notions of normality. Families were not then, as they are for us today, private and isolated units,
cut off socially, and physically separated from the world at large.
Families were deeply embedded in the community, with people living
essentially public, rather than private, lives. They lived in households
whose composition was constantly shifting: relatives, friends, children,
visitors, passers-by and animals all slept under the same roof. Marriage
was primarily a means of forging alliances rather than simply the
outcome of ‘love’, while women certainly did not look upon mothering
as their sole destiny. Indeed, child-rearing was a far less demanding
and onerous task than it is in our world. Children were not cosseted
and coddled to anywhere near the extent we consider ‘right’. Many
more people – both other relatives and the community at large – were
involved in child-rearing, and childhood lasted a far shorter time than
it does today. As Ariès (1973) puts it, ‘as soon as he had been weaned,
or soon after, the child became the natural companion of the adult’.
In contemporary non-industrial societies, too, there is a wide range
of variations in family practices. Here again, marriage is essentially a
means of establishing alliances between groups, rather than simply a
relationship between individuals. Monogamy – one husband and one
wife – is only one form of marriage. Polygyny, marriage between a
husband and more than one wife, and polyandry, between a wife and
more than one husband, are found in many societies. Domestic life is
also far more public and communal than it is in industrial societies.
Each family unit is just a part of a much wider, cooperating group
of mainly blood relatives associated with a local territory, usually a
village. As in Medieval Europe, therefore, child-rearing is not considered the principal responsibility of parents alone, but involves a far
greater number of people, relatives and non-relatives.
Clearly, then, to hope to explain human life simply by reference to
natural impulses common to all is to ignore the one crucial fact that
sociology directs attention to: human behaviour varies according to
the social settings in which people find themselves.
Individualistic theories
What of individualistic explanations? How useful is the argument that
behaviour is the product of the psychological make-up of individuals?
The employment of this kind of theory is extremely common. For
4 An Introduction to Sociological Theories
example, success or failure in education is often assumed to be merely
a reflection of intelligence: bright children succeed and dim children
fail. Criminals are often taken to be people with certain kinds of
personality: they are usually seen as morally deficient individuals, lacking any real sense of right or wrong. Unemployed people are equally
often condemned as ‘work-shy’, ‘lazy’ or ‘scroungers’ – inadequates
who would rather ‘get something for nothing’ than work for it. Suicide
is seen as the act of an unstable person – an act undertaken when, as
coroners put it, ‘the balance of the mind was disturbed’. This kind of
explanation is attractive for many people and has proved particularly
resilient to sociological critique. But a closer look shows it to be
seriously flawed.
If educational achievement is simply a reflection of intelligence then
why do children from manual workers’ homes do so badly compared
with children from middle-class homes? It is clearly nonsensical to
suggest that doing one kind of job rather than another is likely to
determine the intelligence of your child. Achievement in education
must in some way be influenced by the characteristics of a child’s
background.
Equally, the fact that the majority of people convicted of a crime
come from certain social categories must cast serious doubt on the
‘deficient personality’ theory. The conviction rate is highest for young
males, especially blacks, who come from manual, working-class
or unemployed backgrounds. Can we seriously believe that criminal
personalities are likely to be concentrated in such social categories?
As in the case of educational achievement, it is clear that the conviction of criminals must somehow be influenced by social factors.
Again, is it likely that the million or so people presently unemployed are typically uninterested in working when the vast majority
of them have been forced out of their jobs, either by ‘downsizing’ or
by the failure of the companies they worked for – as a result of social
forces quite outside their control?
Suicide would seem to have the strongest case for being explained
as a purely psychological act. But if it is simply a question of
‘an unsound mind’, then why does the rate of suicide vary between
societies? Why does it vary between different groups within the same
society? Also, why do the rates within groups and societies remain
remarkably constant over time? As in other examples, social factors
must be exerting some kind of influence; explanations at the level of
the personality are clearly not enough.
Variations such as these demonstrate the inadequacy of theories of
human behaviour which exclusively emphasize innate natural drives,
An Introduction to Sociological Theories 5
or the unique psychological make-up of individuals. If nature is at the
root of behaviour, why does it vary according to social settings? If we
are all different individuals acting according to the dictates of unique
psychological influences, why do different people in the same social
circumstances behave similarly and in ways others can understand?
Clearly there is a social dimension to human existence, which requires
sociological theorizing to explain it.
All sociological theories thus have in common an emphasis on the
way human belief and action is the product of social influences. They
differ as to what these influences are, and how they should be investigated and explained. This book is about these differences.
We shall now examine three distinct kinds of theory – consensus,
conflict and action theories – each of which highlights specific social
sources of human behaviour. Though none of the sociologists whose
work we will spend the rest of the book examining falls neatly into
any one of these three categories of theory, discussing them now will
produce two benefits:
• it will serve as an accessible introduction to theoretical debates in
sociology; and
• it will act as useful reference points against which to judge and
compare the work of the subject’s major theorists.
Society as a structure of rules
The influence of culture on behaviour
Imagine you live in a big city. How many people do you know well?
Twenty? Fifty? A hundred? Now consider how many other people
you encounter each day, about whom you know nothing. For example, how many complete strangers do people living in London
or Manchester or Birmingham come into contact with each day? On
the street, in shops, on buses and trains, in cinemas or night clubs
– everyday life in a big city is a constant encounter with complete
strangers. Yet even if city dwellers bothered to reflect on this fact,
they would not normally leave their homes quaking with dread about
how all these hundreds of strangers would behave towards them.
Indeed, they hardly, if ever, think about it. Why? Why do we take
our ability to cope with strangers so much for granted? It is because
nearly all the people we encounter in our everyday lives do behave in
ways we expect. We expect bus passengers, shoppers, taxi-drivers,
6 An Introduction to Sociological Theories
passers-by, and so on, to behave in quite definite ways even though
we know nothing about them personally. City dwellers in particular
– though it is true of all of us to some extent – routinely enter settings
where others are going about their business both expecting not to
know them, and yet also expecting to know how they will behave.
And, more than this, we are nearly always absolutely right in both
respects. We are only surprised if we encounter someone who is not a
stranger – ‘Fancy meeting you here! Isn’t it a small world!’ – or if one
of these strangers actually does behave strangely – ‘Mummy, why is
that man shouting and waving his arms about?’ Why is this? Why do
others do what we expect of them? Why is disorder or the unexpected
among strangers so rare?
Structural-consensus theory
One of the traditional ways in which sociologists explain the order
and predictability of social life is by regarding human behaviour as
learned behaviour. This approach is known – for reasons that will
become apparent – as structural-consensus theory. The key process
this theory emphasizes is called socialization. This term refers to the
way in which human beings learn the kinds of behaviour expected
of them in the social settings in which they find themselves. From
this point of view, societies differ because the kinds of behaviour
considered appropriate in them differ. People in other societies think
and behave differently because they have learned different rules about
how to behave and think. The same goes for different groups within
the same society. The actions and ideas of one group differ from
those of another because its members have been socialized into different rules.
Consensus sociologists use the term culture to describe the rules
that govern thought and behaviour in a society. Culture exists prior
to the people who learn it. At birth, humans are confronted by a
social world already in existence. Joining this world involves learning
‘how things are done’ in it. Only by learning the cultural rules of a
society can a human interact with other humans. Because they have
been similarly socialized, different individuals will behave similarly.
Consensus theory thus argues that a society’s cultural rules determine, or structure, the behaviour of its members, channelling their
actions in certain ways rather than others. They do so in much the
same way that the physical construction of a building structures the
actions of the people inside it. Take the behaviour of students in a
An Introduction to Sociological Theories 7
school. Once inside the school they will display quite regular patterns
of behaviour. They will all walk along corridors, up and down stairs,
in and out of classrooms, through doors, and so on. They will, by and
large, not attempt to dig through floors, smash through walls, or
climb out of windows. Their physical movements are constrained by
the school building. Since this affects all the students similarly, their
behaviour inside the school will be similar – and will exhibit quite
definite patterns. In consensus theory, the same is true of social life.
Individuals will behave similarly in the same social settings because
they are equally constrained by cultural rules. Though these social
structures are not visible in the way physical structures are, those who
are socialized into their rules find them comparably determining.
The levels at which these cultural rules operate can vary. Some
rules, like laws for instance, operate at the level of the whole society
and structure the behaviour of everyone who lives in it. Others are
much less general, structuring the behaviour of people in quite specific social settings. For example, children in a classroom are expected
to behave in an orderly and attentive fashion. In the playground
much more license is given them, while away from school their behaviour often bears little resemblance to that expected of them during
school hours. Similarly, when police officers or nurses or members of
the armed forces are ‘on duty’, certain cultural rules structure their
behaviour very rigidly. Out of uniform and off duty these constraints
do not apply, though other ones do instead – those governing their
behaviour as fathers and mothers, or husbands and wives, for instance.
This shows how the theory of a social structure of cultural rules
operates. The rules apply not to the individuals themselves, but to the
positions in the social structure they occupy. Shoppers, police officers,
traffic wardens, schoolteachers or pupils are constrained by the cultural expectations attached to these positions, but only when they
occupy them. In other circumstances, in other locations in the social
structure – as fathers or mothers, squash players, football supporters,
church members, and so on – other rules come into play.
Sociologists call positions in a social structure roles. The rules that
structure the behaviour of their occupants are called norms. There
are some cultural rules that are not attached to any particular role
or set of roles. Called values, these are in a sense summaries of approved ways of living, and act as a base from which particular norms
spring. So, for example: ‘education should be the key to success’;
‘family relationships should be the most important thing to protect’;
‘self-help should be the means to individual fulfilment’. All these
are values, and they provide general principles from which norms
8 An Introduction to Sociological Theories
directing behaviour in schools and colleges, in the home and at work
are derived.
According to this sociological theory, socialization into norms
and values produces agreement, or consensus, between people about
appropriate behaviour and beliefs without which no human society
can survive. This is why it is called structural-consensus theory.
Through socialization, cultural rules structure behaviour, guarantee a
consensus about expected behaviour, and thereby ensure social order.
Clearly, in a complex society there are sometimes going to be competing norms and values. For example, while some people think it is
wrong for mothers to go out to work, many women see motherhood
at best as a real imposition and at worst as an infringement of their
liberty. Children often encourage each other to misbehave at school
and disapprove of their peers who refuse to do so. Teachers usually
see this very much the other way round! The Tory Party Conference
is annually strident in its condemnation of any speaker who criticizes
the police. Some young blacks would be equally furious with any
of their number who had other than a strongly belligerent attitude
towards them.
Consensus theorists explain such differences in behaviour and
attitude in terms of the existence of alternative cultural influences,
characteristic of different social settings. A good example of this
emphasis is their approach to educational inequality.
Educational inequality: a consensus theory analysis
Educational research demonstrates, in the most conclusive fashion,
that achievement in education is strongly linked to class membership,
gender and ethnic origin. There is overwhelming evidence, for example, that working-class children of similar intelligence to children
from middle-class backgrounds achieve far less academically than their
middle-class counterparts.
To explain this, consensus theorists turn to stock concepts in their
approach to social life – norms, values, socialization and culture. Starting from the basic assumption that behaviour and belief are caused by
socialization into particular rules, their explanation of working-class
underachievement in education seeks to identify:
• the cultural influences which propel middle-class children to academic success
• the cultural influences which drag working-class children down to
mediocrity."""