<a href="https://colab.research.google.com/github/Marjola1/MERN/blob/main/DataMiningLab22_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install pm4py scikit-learn pandas matplotlib


In [None]:
import pandas as pd
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter

# Read the raw reservation events and rename the three key columns to the
# case / activity / timestamp names that are handed to pm4py below.
CSV_PATH = "/content/Hotel_Reservation_BPMN_Diverse_v2.csv"
XES_COLUMNS = {
    "CaseID": "case:concept:name",
    "Activity": "concept:name",
    "Timestamp": "time:timestamp",
}

df = pd.read_csv(CSV_PATH).rename(columns=XES_COLUMNS)

# Parse the timestamps, then order the events chronologically within each case.
df = df.assign(**{"time:timestamp": pd.to_datetime(df["time:timestamp"])})
df = df.sort_values(by=["case:concept:name", "time:timestamp"])

# Run pm4py's timestamp-column conversion helper before building the log.
df = dataframe_utils.convert_timestamp_columns_in_df(df)

# Event-log object consumed by the discovery and evaluation cells.
log = log_converter.apply(df, variant=log_converter.Variants.TO_EVENT_LOG)

df.head()



In [None]:
from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner

# Discover one model per algorithm from the same event log. Each miner call
# yields a 3-tuple that is unpacked into (net, initial marking, final marking).
alpha_model = alpha_miner.apply(log)
heur_model = heuristics_miner.apply(log)

net_alpha, im_alpha, fm_alpha = alpha_model
net_heur, im_heur, fm_heur = heur_model



In [None]:

# Sanity check: look at the very first event of the very first trace to
# confirm which attributes survived the conversion to an event log.
first_trace = log[0]
first_event = first_trace[0]

print(first_event.keys())

event_activity = first_event["concept:name"]
event_time = first_event["time:timestamp"]
print(event_activity, event_time)


In [None]:
!pip install -q pm4py==2.7.11


In [None]:
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness
from pm4py.algo.evaluation.precision import algorithm as precision

# Score both discovered models against the same event log.
#
# When no variant is passed, pm4py chooses the technique per model
# (token-based replay for nets that are not easy-sound, alignments otherwise),
# so the two models could silently be scored with different techniques.
# Pinning the token-based variants for both keeps the numbers comparable and
# also works on nets that are not sound.
fitness_variant = replay_fitness.Variants.TOKEN_BASED
precision_variant = precision.Variants.ETCONFORMANCE_TOKEN

# Fitness (token-based replay)
fit_alpha = replay_fitness.apply(
    log, net_alpha, im_alpha, fm_alpha, variant=fitness_variant
)["log_fitness"]
fit_heur = replay_fitness.apply(
    log, net_heur, im_heur, fm_heur, variant=fitness_variant
)["log_fitness"]

# Precision (ETConformance on token-based replay)
prec_alpha = precision.apply(
    log, net_alpha, im_alpha, fm_alpha, variant=precision_variant
)
prec_heur = precision.apply(
    log, net_heur, im_heur, fm_heur, variant=precision_variant
)

print("Alpha  -> fitness:", fit_alpha, " precision:", prec_alpha)
print("Heur   -> fitness:", fit_heur,  " precision:", prec_heur)


In [None]:
import pm4py

# Render both discovered Petri nets, Alpha model first, then Heuristics.
for _net, _initial, _final in (
    (net_alpha, im_alpha, fm_alpha),
    (net_heur, im_heur, fm_heur),
):
    pm4py.view_petri_net(_net, _initial, _final)


In [None]:
import pandas as pd

# Build one feature row per case: event count, number of distinct activities,
# and elapsed time between the case's first and last event.
_per_case = df.groupby("case:concept:name").agg(
    num_events=pd.NamedAgg(column="concept:name", aggfunc="count"),
    num_unique_activities=pd.NamedAgg(column="concept:name", aggfunc="nunique"),
    start_time=pd.NamedAgg(column="time:timestamp", aggfunc="min"),
    end_time=pd.NamedAgg(column="time:timestamp", aggfunc="max"),
)

_case_span = _per_case["end_time"] - _per_case["start_time"]

# The start/end timestamps are only needed to derive the duration; drop them
# so that only numeric columns remain for clustering.
case_features = (
    _per_case
    .assign(total_duration_minutes=_case_span.dt.total_seconds() / 60)
    .drop(columns=["start_time", "end_time"])
)

case_features.head()


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise only the engineered feature columns, selected by name.
# A later cell adds a "cluster" column to case_features; scaling the whole
# frame would then treat that label as a feature if this cell were re-run.
# Selecting the columns explicitly keeps the cell idempotent.
FEATURE_COLUMNS = ["num_events", "num_unique_activities", "total_duration_minutes"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(case_features[FEATURE_COLUMNS])


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit KMeans for k = 1..5 and record the inertia of each fit
# so the bend in the curve can guide the choice of k.
K_range = range(1, 6)
inertia = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto").fit(X_scaled)
    inertia.append(kmeans.inertia_)

plt.plot(list(K_range), inertia, marker="o")
plt.gca().set(
    xlabel="Number of clusters (k)",
    ylabel="Inertia",
    title="Elbow Method for KMeans",
)
plt.show()


In [None]:
# Fit the final model with the chosen number of clusters and store each
# case's cluster label next to its features.
k = 2
kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
kmeans.fit(X_scaled)

case_features["cluster"] = kmeans.labels_

case_features.head()


In [None]:
# Scatter the cases by event count against duration, coloured by cluster label.
events_per_case = case_features["num_events"]
duration_minutes = case_features["total_duration_minutes"]
cluster_labels = case_features["cluster"]

plt.scatter(x=events_per_case, y=duration_minutes, c=cluster_labels)
plt.gca().set(
    xlabel="Number of events",
    ylabel="Total duration (minutes)",
    title="Case Clustering Result",
)
plt.show()


In [None]:
# Number of cases assigned to each cluster.
case_features.loc[:, "cluster"].value_counts()


In [None]:
# Per-cluster average of every feature, to characterise each cluster.
case_features.groupby("cluster").agg("mean")
