In [96]:
import pandas as pd
 
# load the csv file containing the feedbacks into a pandas dataframe and format/parse the columns
df = pd.read_csv("feedbacks.csv", sep=";", header=0)
df['Actual Start'] = pd.to_datetime(df['Actual Start'], format='%d.%m.%Y %H:%M:%S')
df['Actual End'] = pd.to_datetime(df['Actual End'], format='%d.%m.%Y %H:%M:%S')
df['Planned Start'] = pd.to_datetime(df['Planned Start'], format='%d.%m.%Y %H:%M:%S')
df['Planned Finish'] = pd.to_datetime(df['Planned Finish'], format='%d.%m.%Y %H:%M:%S')
df['CurrentTime'] = pd.to_datetime(df['CurrentTime'], format='%d.%m.%Y %H:%M:%S')
df['Shift'] = df['Shift'].astype('category')
df['Temperature'] = df['Temperature'].str.replace(',', '.').astype(float)
df['TimeSinceLastInterrupt'] = pd.to_timedelta(df['TimeSinceLastInterrupt']).dt.total_seconds() / (60 * 60 * 24)

# create a new dataframe with only the columns we need
data = df[['Temperature', 'TimeSinceLastInterrupt', 'Shift']]
data['DurationFactor'] = (df['Actual End'] - df['Actual Start']) / (df['Planned Finish'] - df['Planned Start'])
data['IsWorkingDay'] = df['CurrentTime'].dt.dayofweek < 5

# print the columns of the dataframe and their data types
print(data.dtypes)

Temperature                float64
TimeSinceLastInterrupt     float64
Shift                     category
DurationFactor             float64
IsWorkingDay                  bool
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['DurationFactor'] = (df['Actual End'] - df['Actual Start']) / (df['Planned Finish'] - df['Planned Start'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['IsWorkingDay'] = df['CurrentTime'].dt.dayofweek < 5


In [97]:
# Discretize Temperature column
temperature_bins = [float('-inf'), 19, 21, float('inf')]
temperature_labels = ['<19', '19-21', '>21']
data['Temperature'] = pd.cut(data['Temperature'], bins=temperature_bins, labels=temperature_labels)

# Discretize DaysSinceLastInterrupt column
days_bins = [float('-inf'), 3.5, 4.5, 5, float('inf')]
days_labels = ['<3.5', '3.5-4.5', '4.5-5', '>5']
data['TimeSinceLastInterrupt'] = pd.cut(data['TimeSinceLastInterrupt'], bins=days_bins, labels=days_labels)
print(data.dtypes)

Temperature               category
TimeSinceLastInterrupt    category
Shift                     category
DurationFactor             float64
IsWorkingDay                  bool
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Temperature'] = pd.cut(data['Temperature'], bins=temperature_bins, labels=temperature_labels)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['TimeSinceLastInterrupt'] = pd.cut(data['TimeSinceLastInterrupt'], bins=days_bins, labels=days_labels)


In [98]:
from pgmpy.estimators import ExhaustiveSearch
from pgmpy.models import BayesianNetwork

actual_model = BayesianNetwork(
    [
        ('Temperature', 'DurationFactor'),
        ('IsWorkingDay', 'DurationFactor'),
        ('TimeSinceLastInterrupt', 'DurationFactor'),
        ('Shift', 'DurationFactor')
    ]
)


In [99]:
import warnings
warnings.filterwarnings('ignore', message=r'overflow encountered')

In [100]:
from pgmpy.estimators import BDeuScore

bdeu = BDeuScore(data)

es_bdeu = ExhaustiveSearch(data, scoring_method=bdeu)

scores_bdeu = list(es_bdeu.all_scores())

In [101]:
print("Best 5 DAGs by score:")
for score, dag in reversed(scores_bdeu[-5:]):
    print(score, dag.edges())
    
print("Worst 5 DAGs by score:")
for score, dag in scores_bdeu[:5]:
    print(score, dag.edges())


print("\nBest model: ")
best_model = scores_bdeu[-1][1]

[print(x) for x in list(best_model.edges())]

print("\nScore of best model: ", bdeu.score(best_model))
print("Score of actual model: ", bdeu.score(actual_model))

Best 5 DAGs by score:
-20212.264642586415 [('DurationFactor', 'TimeSinceLastInterrupt'), ('IsWorkingDay', 'TimeSinceLastInterrupt'), ('IsWorkingDay', 'DurationFactor'), ('Temperature', 'TimeSinceLastInterrupt'), ('Temperature', 'DurationFactor'), ('Temperature', 'Shift'), ('Shift', 'DurationFactor')]
-20212.26464258642 [('DurationFactor', 'TimeSinceLastInterrupt'), ('IsWorkingDay', 'TimeSinceLastInterrupt'), ('IsWorkingDay', 'DurationFactor'), ('Shift', 'Temperature'), ('Shift', 'DurationFactor'), ('Temperature', 'TimeSinceLastInterrupt'), ('Temperature', 'DurationFactor')]
-20215.012277786547 [('IsWorkingDay', 'TimeSinceLastInterrupt'), ('IsWorkingDay', 'DurationFactor'), ('TimeSinceLastInterrupt', 'DurationFactor'), ('TimeSinceLastInterrupt', 'Temperature'), ('Shift', 'Temperature'), ('Shift', 'TimeSinceLastInterrupt'), ('Shift', 'DurationFactor'), ('Temperature', 'DurationFactor')]
-20215.527096402628 [('DurationFactor', 'TimeSinceLastInterrupt'), ('IsWorkingDay', 'TimeSinceLastInte

In [102]:
from pgmpy.estimators import K2Score

k2 = K2Score(data)

es_k2 = ExhaustiveSearch(data, scoring_method=k2)

scores_k2 = list(es_k2.all_scores())

In [103]:
print("Best 5 DAGs by score:")
for score, dag in reversed(scores_k2[-5:]):
    print(score, dag.edges())
    
print("Worst 5 DAGs by score:")
for score, dag in scores_k2[:5]:
    print(score, dag.edges())


print("\nBest model: ")
best_model = scores_k2[-1][1]

[print(x) for x in list(best_model.edges())]

print("\nScore of best model: ", k2.score(best_model))
print("Score of actual model: ", k2.score(actual_model))

Best 5 DAGs by score:
-20489.87911579811 [('DurationFactor', 'IsWorkingDay'), ('DurationFactor', 'Shift'), ('DurationFactor', 'Temperature'), ('DurationFactor', 'TimeSinceLastInterrupt'), ('Shift', 'IsWorkingDay'), ('Temperature', 'IsWorkingDay'), ('Temperature', 'Shift'), ('TimeSinceLastInterrupt', 'IsWorkingDay'), ('TimeSinceLastInterrupt', 'Shift'), ('TimeSinceLastInterrupt', 'Temperature')]
-20492.56515688797 [('DurationFactor', 'IsWorkingDay'), ('DurationFactor', 'Shift'), ('DurationFactor', 'Temperature'), ('DurationFactor', 'TimeSinceLastInterrupt'), ('Shift', 'IsWorkingDay'), ('Temperature', 'TimeSinceLastInterrupt'), ('Temperature', 'IsWorkingDay'), ('Temperature', 'Shift'), ('TimeSinceLastInterrupt', 'IsWorkingDay'), ('TimeSinceLastInterrupt', 'Shift')]
-20492.60377767089 [('DurationFactor', 'IsWorkingDay'), ('DurationFactor', 'Shift'), ('DurationFactor', 'Temperature'), ('DurationFactor', 'TimeSinceLastInterrupt'), ('IsWorkingDay', 'Shift'), ('Temperature', 'IsWorkingDay'), 

In [104]:
from pgmpy.estimators import BicScore

bic = BicScore(data)

es_bic = ExhaustiveSearch(data, scoring_method=bic)

scores_bic = list(es_bic.all_scores())

In [105]:
print("Best 5 DAGs by score:")
for score, dag in reversed(scores_bic[-5:]):
    print(score, dag.edges())
    
print("Worst 5 DAGs by score:")
for score, dag in scores_bic[:5]:
    print(score, dag.edges())


print("\nBest model: ")
best_model = scores_bic[-1][1]

[print(x) for x in list(best_model.edges())]

print("\nScore of best model: ", bic.score(best_model))
print("Score of actual model: ", bic.score(actual_model))

Best 5 DAGs by score:
-23959.218402563114 [('DurationFactor', 'TimeSinceLastInterrupt'), ('IsWorkingDay', 'DurationFactor'), ('Temperature', 'DurationFactor'), ('Temperature', 'Shift')]
-23959.218402563114 [('DurationFactor', 'TimeSinceLastInterrupt'), ('Shift', 'Temperature'), ('Temperature', 'DurationFactor'), ('IsWorkingDay', 'DurationFactor')]
-23963.672310066428 [('DurationFactor', 'TimeSinceLastInterrupt'), ('Shift', 'Temperature'), ('Shift', 'IsWorkingDay'), ('Temperature', 'DurationFactor'), ('IsWorkingDay', 'DurationFactor')]
-23963.67231006643 [('DurationFactor', 'TimeSinceLastInterrupt'), ('IsWorkingDay', 'DurationFactor'), ('Temperature', 'DurationFactor'), ('Temperature', 'Shift'), ('Shift', 'IsWorkingDay')]
-23963.67231006643 [('DurationFactor', 'TimeSinceLastInterrupt'), ('IsWorkingDay', 'Shift'), ('IsWorkingDay', 'DurationFactor'), ('Shift', 'Temperature'), ('Temperature', 'DurationFactor')]
Worst 5 DAGs by score:
-39558.25233927849 [('DurationFactor', 'TimeSinceLastInt