In [None]:
import pymongo
import logging
import matplotlib.pyplot as plt
import statistics
import dotenv
import os

from src.metrics.connection_time import connection_time
from src.metrics.latency import latency
from src.metrics.query_execution_time import query_execution_time
from src.metrics.throughput import throughput
from src.metrics.write_performance import write_performance
from src.utils.filter_outliers import filter_outliers
from src.utils.statistical_importance import is_statistically_significant

# Load variables from the local secrets file into the process environment.
# NOTE(review): presumably this file supplies MONGO_URI, which the client cell reads - confirm.
dotenv.load_dotenv(dotenv_path="resources/secrets.env")

# Emit INFO-and-above log records, each prefixed with a timestamp.
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)

In [None]:
# Shared MongoDB client; the connection string comes from the MONGO_URI environment variable
# (a missing variable raises KeyError here rather than connecting somewhere unexpected).
client = pymongo.MongoClient(host=os.environ["MONGO_URI"])

In [None]:
# Quick connectivity check: print every database name the client reports, one per line.
database_names = client.list_database_names()
for database_name in database_names:
    print(database_name)

### Latency test

In [None]:
# Run the latency benchmark twice with identical settings (n=1000, same
# database/collection) so the two samples can be compared further down.
# Each call yields a pair: an average and the per-run measurements.
avg_latency_1, each_run_latency_1 = latency(
    client=client, database="nieruchomosci", collection="boston", n=1000
)
avg_latency_2, each_run_latency_2 = latency(
    client=client, database="nieruchomosci", collection="boston", n=1000
)

In [None]:
# Side-by-side histograms of the unfiltered latency samples, one panel per run.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 3))
for ax, samples, run_label in zip(
    axes,
    (each_run_latency_1, each_run_latency_2),
    ("1st", "2nd"),
):
    ax.hist(samples, bins=10, edgecolor="black")
    ax.set_title(f"Latency times ({run_label} run)")
    ax.set_xlabel("Response time")
    ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Apply the project's outlier filter to each latency sample, in run order.
filtered_each_run_latency_1, filtered_each_run_latency_2 = map(
    filter_outliers, (each_run_latency_1, each_run_latency_2)
)

In [None]:
# Report mean and median of each outlier-filtered latency sample (1st run, then 2nd).
for filtered_run in (filtered_each_run_latency_1, filtered_each_run_latency_2):
    mean_time = sum(filtered_run) / len(filtered_run)
    print(f"Executed 1000 queries.\nAverage time: {mean_time:.6f} sec")
    print(f" Median time: {statistics.median(filtered_run):.6f} sec")

In [None]:
# Same two-panel layout as the raw plot, drawn from the outlier-filtered samples.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 3))

axes[0].hist(filtered_each_run_latency_1, bins=10, edgecolor="black")
axes[0].set(
    title="Latency times (1st run) without outliers",
    xlabel="Response time",
    ylabel="Frequency",
)

axes[1].hist(filtered_each_run_latency_2, bins=10, edgecolor="black")
axes[1].set(
    title="Latency times (2nd run) without outliers",
    xlabel="Response time",
    ylabel="Frequency",
)

plt.show()

In [None]:
# Compare the two outlier-filtered latency samples with the project's significance helper.
# The call is the last expression in the cell, so its return value is shown as the cell output.
is_statistically_significant(
    first_data=filtered_each_run_latency_1, second_data=filtered_each_run_latency_2
)

### Query execution time test

In [None]:
# Run the query-execution-time benchmark twice with identical settings.
# Unlike the latency helper, the result is bound to a single name per call
# (used below as the per-run measurements).
each_run_query_execution_time_1 = query_execution_time(
    client=client, database="nieruchomosci", collection="boston", n=1000
)
each_run_query_execution_time_2 = query_execution_time(
    client=client, database="nieruchomosci", collection="boston", n=1000
)

In [None]:
# Side-by-side histograms of the unfiltered query-execution-time samples, one panel per run.
# The x axis is labelled "Execution time" to match the outlier-filtered plots of the same
# metric; it previously read "Response time", carried over from the latency section.
fig, axes = plt.subplots(1, 2, figsize=(15, 3))
for ax, samples, run_label in zip(
    axes,
    (each_run_query_execution_time_1, each_run_query_execution_time_2),
    ("1st", "2nd"),
):
    ax.hist(samples, bins=10, edgecolor="black")
    ax.set_title(f"Query execution times ({run_label} run)")
    ax.set_xlabel("Execution time")
    ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Apply the project's outlier filter to each query-execution-time sample, in run order.
(
    filtered_each_run_query_execution_time_1,
    filtered_each_run_query_execution_time_2,
) = map(
    filter_outliers,
    (each_run_query_execution_time_1, each_run_query_execution_time_2),
)

In [None]:
# Report mean and median of each outlier-filtered query-execution sample (1st run, then 2nd).
for filtered_run in (
    filtered_each_run_query_execution_time_1,
    filtered_each_run_query_execution_time_2,
):
    mean_time = sum(filtered_run) / len(filtered_run)
    print(f"Executed 1000 queries.\nAverage time: {mean_time:.6f} sec")
    print(f" Median time: {statistics.median(filtered_run):.6f} sec")

In [None]:
# Two-panel histogram of the outlier-filtered query-execution-time samples.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 3))

axes[0].hist(filtered_each_run_query_execution_time_1, bins=10, edgecolor="black")
axes[0].set(
    title="Query execution time (1st run) without outliers",
    xlabel="Execution time",
    ylabel="Frequency",
)

axes[1].hist(filtered_each_run_query_execution_time_2, bins=10, edgecolor="black")
axes[1].set(
    title="Query execution time (2nd run) without outliers",
    xlabel="Execution time",
    ylabel="Frequency",
)

plt.show()

In [None]:
# Compare the two outlier-filtered query-execution-time samples with the project's
# significance helper; the return value is shown as the cell output.
is_statistically_significant(
    first_data=filtered_each_run_query_execution_time_1,
    second_data=filtered_each_run_query_execution_time_2,
)

### Throughput test

In [None]:
# Run the throughput benchmark twice with identical settings.
# Each call yields a pair: an average and the per-run measurements.
avg_throughput_1, each_run_throughput_1 = throughput(
    client=client, database="nieruchomosci", collection="boston", n=1000
)
avg_throughput_2, each_run_throughput_2 = throughput(
    client=client, database="nieruchomosci", collection="boston", n=1000
)

In [None]:
# Side-by-side histograms of the unfiltered throughput samples, one panel per run.
# The x axis is labelled "Throughput"; it previously read "Response time", copied from
# the latency section.
# NOTE(review): the unit of the values returned by throughput() is not visible in this
# notebook - append it to the axis label once confirmed.
fig, axes = plt.subplots(1, 2, figsize=(15, 3))
for ax, samples, run_label in zip(
    axes,
    (each_run_throughput_1, each_run_throughput_2),
    ("1st", "2nd"),
):
    ax.hist(samples, bins=10, edgecolor="black")
    ax.set_title(f"Throughput ({run_label} run)")
    ax.set_xlabel("Throughput")
    ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Apply the project's outlier filter to each throughput sample, in run order.
filtered_each_run_throughput_1, filtered_each_run_throughput_2 = map(
    filter_outliers, (each_run_throughput_1, each_run_throughput_2)
)

In [None]:
# Report mean and median of each outlier-filtered throughput sample (1st run, then 2nd).
# The messages previously said "Average time ... sec" / "Median time ... sec", wording
# copied from the latency section that mislabels a throughput figure.
# NOTE(review): the unit of the values returned by throughput() is not visible in this
# notebook, so none is printed - add it once confirmed.
for filtered_run in (filtered_each_run_throughput_1, filtered_each_run_throughput_2):
    mean_throughput = sum(filtered_run) / len(filtered_run)
    print(f"Executed 1000 queries.\nAverage throughput: {mean_throughput:.6f}")
    print(f" Median throughput: {statistics.median(filtered_run):.6f}")

In [None]:
# Two-panel histogram of the outlier-filtered throughput samples.
# The x axis is labelled "Throughput"; it previously read "Response time", copied from
# the latency section.
# NOTE(review): the unit of the values returned by throughput() is not visible in this
# notebook - append it to the axis label once confirmed.
fig, axes = plt.subplots(1, 2, figsize=(15, 3))
for ax, samples, run_label in zip(
    axes,
    (filtered_each_run_throughput_1, filtered_each_run_throughput_2),
    ("1st", "2nd"),
):
    ax.hist(samples, bins=10, edgecolor="black")
    ax.set_title(f"Throughput ({run_label} run) without outliers")
    ax.set_xlabel("Throughput")
    ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Compare the two outlier-filtered throughput samples with the project's significance
# helper; the return value is shown as the cell output.
is_statistically_significant(
    first_data=filtered_each_run_throughput_1,
    second_data=filtered_each_run_throughput_2,
)

### Connection time test

In [None]:
# Run the connection-time benchmark twice with identical settings (no database or
# collection is passed, only the client). Each call yields a pair: an average and the
# per-run measurements.
avg_connection_time_1, each_run_connection_time_1 = connection_time(client=client, n=1000)
avg_connection_time_2, each_run_connection_time_2 = connection_time(client=client, n=1000)

In [None]:
# Side-by-side histograms of the unfiltered connection-time samples, one panel per run.
# The x axis names the measured quantity ("Connection time"); it previously read just
# "Connection", which does not say what is being binned.
fig, axes = plt.subplots(1, 2, figsize=(15, 3))
for ax, samples, run_label in zip(
    axes,
    (each_run_connection_time_1, each_run_connection_time_2),
    ("1st", "2nd"),
):
    ax.hist(samples, bins=10, edgecolor="black")
    ax.set_title(f"Connection time ({run_label} run)")
    ax.set_xlabel("Connection time")
    ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Apply the project's outlier filter to each connection-time sample, in run order.
filtered_each_connection_time_1, filtered_each_connection_time_2 = map(
    filter_outliers, (each_run_connection_time_1, each_run_connection_time_2)
)

In [None]:
# Report mean and median of each outlier-filtered connection-time sample (1st run, then 2nd).
for filtered_run in (filtered_each_connection_time_1, filtered_each_connection_time_2):
    mean_time = sum(filtered_run) / len(filtered_run)
    print(f"Executed 1000 connections.\nAverage time: {mean_time:.6f} sec")
    print(f" Median time: {statistics.median(filtered_run):.6f} sec")

In [None]:
# Two-panel histogram of the outlier-filtered connection-time samples.
# The x axis names the measured quantity ("Connection time"); it previously read just
# "Connection", which does not say what is being binned.
fig, axes = plt.subplots(1, 2, figsize=(15, 3))
for ax, samples, run_label in zip(
    axes,
    (filtered_each_connection_time_1, filtered_each_connection_time_2),
    ("1st", "2nd"),
):
    ax.hist(samples, bins=10, edgecolor="black")
    ax.set_title(f"Connection time ({run_label} run) without outliers")
    ax.set_xlabel("Connection time")
    ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Compare the two outlier-filtered connection-time samples with the project's significance
# helper; the return value is shown as the cell output.
is_statistically_significant(
    first_data=filtered_each_connection_time_1,
    second_data=filtered_each_connection_time_2,
)

### Write performance test

In [None]:
# A fresh client is bound to `client` here, replacing the one created near the top of
# the notebook.
# NOTE(review): presumably the connection-time benchmark leaves the earlier client in an
# unusable state - confirm; the previous client is not explicitly closed.
client = pymongo.MongoClient(host=os.environ["MONGO_URI"])

# Run the write benchmark twice with identical settings. Each call yields a pair: an
# average and the per-run measurements.
# NOTE(review): this targets the same "boston" collection the read benchmarks use -
# verify whether write_performance cleans up what it inserts.
avg_write_performance_1, each_run_write_performance_1 = write_performance(
    client=client, database="nieruchomosci", collection="boston", n=1000
)
avg_write_performance_2, each_run_write_performance_2 = write_performance(
    client=client, database="nieruchomosci", collection="boston", n=1000
)

In [None]:
# Side-by-side histograms of the unfiltered write-performance samples, one panel per run.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 3))
for ax, samples, run_label in zip(
    axes,
    (each_run_write_performance_1, each_run_write_performance_2),
    ("1st", "2nd"),
):
    ax.hist(samples, bins=10, edgecolor="black")
    ax.set_title(f"Write performance ({run_label} run)")
    ax.set_xlabel("Write time")
    ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Apply the project's outlier filter to each write-performance sample, in run order.
filtered_each_write_performance_1, filtered_each_write_performance_2 = map(
    filter_outliers, (each_run_write_performance_1, each_run_write_performance_2)
)

In [None]:
# Report mean and median of each outlier-filtered write-performance sample (1st run, then
# 2nd). Both runs now print "Inserted 1000 rows."; the second previously said
# "Inserted 1000 queries.", which disagreed with the first.
for filtered_run in (
    filtered_each_write_performance_1,
    filtered_each_write_performance_2,
):
    mean_time = sum(filtered_run) / len(filtered_run)
    print(f"Inserted 1000 rows.\nAverage time: {mean_time:.6f} sec")
    print(f" Median time: {statistics.median(filtered_run):.6f} sec")

In [None]:
# Two-panel histogram of the outlier-filtered write-performance samples.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 3))

axes[0].hist(filtered_each_write_performance_1, bins=10, edgecolor="black")
axes[0].set(
    title="Write performance (1st run) without outliers",
    xlabel="Write time",
    ylabel="Frequency",
)

axes[1].hist(filtered_each_write_performance_2, bins=10, edgecolor="black")
axes[1].set(
    title="Write performance (2nd run) without outliers",
    xlabel="Write time",
    ylabel="Frequency",
)

plt.show()

In [None]:
# Compare the two outlier-filtered write-performance samples with the project's
# significance helper; the return value is shown as the cell output.
is_statistically_significant(
    first_data=filtered_each_write_performance_1,
    second_data=filtered_each_write_performance_2,
)