### File: performance_analysis_full_updated.ipynb
#### Created with the assistance of ChatGPT (OpenAI) – reviewed on 2025-05-11
#### Author: Maria Heinrich

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Read the recorded execution times into a DataFrame.
# Adjust the path below if the CSV is stored somewhere else.
execution_times_csv = "Execution_Times.csv"
df = pd.read_csv(execution_times_csv)

# Parse Environment and Date from Filename
def parse_metadata(filename):
    """Classify a result file by platform and test day based on its name.

    Parameters
    ----------
    filename : str
        Name of the result file; only substring checks are performed on it.

    Returns
    -------
    pandas.Series
        Two positional values: the environment ("Fabric" when the name
        contains "Notebook", otherwise "Databricks") and the date label
        ("May 5", "May 6", "May 11", or "Unknown" when no marker matches).
    """
    environment = "Fabric" if "Notebook" in filename else "Databricks"

    # Checked top to bottom; the first day with a matching marker wins.
    date_markers = (
        ("May 5", ("_0505", "_21_")),
        ("May 6", ("_0506", "_22_")),
        ("May 11", ("_250511_", "_18_")),
    )
    date = next(
        (
            label
            for label, markers in date_markers
            if any(marker in filename for marker in markers)
        ),
        "Unknown",
    )

    return pd.Series([environment, date])

# Derive the platform and test day of every run from its file name
classified = df["Filename"].apply(parse_metadata)
df[["Environment", "Date"]] = classified

# Use a display-friendly column name and keep only the columns analysed below
analysis_columns = ["Environment", "Date", "Execution Time (s)"]
df = df.rename(columns={"Execution time in seconds": "Execution Time (s)"})[analysis_columns]
df.head()

In [0]:

# Box plot of the execution times per test day, split by platform.
# The dates are plain strings, so without an explicit order seaborn infers the
# x-axis order from the data. Pin the known days chronologically and append
# any other label present (e.g. "Unknown") so nothing is dropped from the plot.
known_dates = ["May 5", "May 6", "May 11"]
present_dates = list(df["Date"].unique())
date_order = [d for d in known_dates if d in present_dates]
date_order += [d for d in present_dates if d not in known_dates]

plt.figure(figsize=(10, 6))
sns.boxplot(
    data=df,
    x="Date",
    y="Execution Time (s)",
    hue="Environment",
    order=date_order,
    # Fixed hue order keeps each platform's colour stable across re-runs.
    hue_order=["Databricks", "Fabric"],
)
plt.title("Execution Time Distribution: Azure Databricks vs Microsoft Fabric")
plt.grid(True)
plt.show()


In [0]:


# Descriptive statistics of the execution time for each platform/day combination
times_by_group = df.groupby(["Environment", "Date"])["Execution Time (s)"]
summary = times_by_group.agg(["mean", "min", "max", "std"]).reset_index()
summary



In [0]:

def _times_for(environment, date):
    """Return the execution-time column for one environment on one date label."""
    mask = (df["Environment"] == environment) & (df["Date"] == date)
    return df.loc[mask, "Execution Time (s)"]


# Execution times per platform and day, selected from the dataframe
db_0505 = _times_for("Databricks", "May 5")
db_0506 = _times_for("Databricks", "May 6")
db_0511 = _times_for("Databricks", "May 11")

fab_0505 = _times_for("Fabric", "May 5")
fab_0506 = _times_for("Fabric", "May 6")
fab_0511 = _times_for("Fabric", "May 11")

# Compare consecutive test days within each platform
# (equal_var=False: the two samples are not assumed to share a variance)
t_db_0505_0506, p_db_0505_0506 = stats.ttest_ind(db_0505, db_0506, equal_var=False)
t_db_0506_0511, p_db_0506_0511 = stats.ttest_ind(db_0506, db_0511, equal_var=False)
t_fab_0505_0506, p_fab_0505_0506 = stats.ttest_ind(fab_0505, fab_0506, equal_var=False)
t_fab_0506_0511, p_fab_0506_0511 = stats.ttest_ind(fab_0506, fab_0511, equal_var=False)

# Report each comparison on its own line
for comparison, t_value, p_value in (
    ("Databricks May 5 vs May 6", t_db_0505_0506, p_db_0505_0506),
    ("Databricks May 6 vs May 11", t_db_0506_0511, p_db_0506_0511),
    ("Fabric May 5 vs May 6", t_fab_0505_0506, p_fab_0505_0506),
    ("Fabric May 6 vs May 11", t_fab_0506_0511, p_fab_0506_0511),
):
    print(f"{comparison}: t = {t_value:.2f}, p = {p_value:.4f}")


In [0]:
# Perform T-Tests comparing Databricks vs Fabric for each individual day.
# equal_var=False means the two samples are not assumed to share a variance.
SIGNIFICANCE_LEVEL = 0.05  # p-values below this are reported as significant
results = {}

for day in ["May 5", "May 6", "May 11"]:
    db = df[(df["Environment"] == "Databricks") & (df["Date"] == day)]["Execution Time (s)"]
    fab = df[(df["Environment"] == "Fabric") & (df["Date"] == day)]["Execution Time (s)"]
    t_stat, p_val = stats.ttest_ind(db, fab, equal_var=False)
    results[day] = {"t": t_stat, "p": p_val, "db_mean": db.mean(), "fab_mean": fab.mean()}

# Generate interpretation (markdown text, one section per day)
sophisticated_interpretation = "### T-Test: Databricks vs Fabric by Day\n\n"
for day, r in results.items():
    # A NaN p-value means the test could not be evaluated (e.g. a platform has
    # no or too few rows for this day). Comparisons against NaN are always
    # False, so without this guard the report would claim "no statistically
    # significant difference" and name Fabric as the faster platform.
    if pd.isna(r["p"]):
        sophisticated_interpretation += (
            f"**{day}**:\n"
            f"- Result: the t-test could not be computed "
            f"(missing or insufficient data for at least one platform).\n\n"
        )
        continue

    significance = (
        "a statistically significant difference"
        if r["p"] < SIGNIFICANCE_LEVEL
        else "no statistically significant difference"
    )

    # Name a faster platform only when the means actually differ.
    if r["db_mean"] < r["fab_mean"]:
        better_platform = "Databricks"
    elif r["fab_mean"] < r["db_mean"]:
        better_platform = "Fabric"
    else:
        better_platform = None

    if better_platform is None:
        speed_sentence = "On average, both platforms were equally fast."
    else:
        speed_sentence = f"On average, **{better_platform}** was faster."

    sophisticated_interpretation += (
        f"**{day}**:\n"
        f"- t = {r['t']:.2f}, p = {r['p']:.4f}\n"
        f"- Mean Execution Time — Databricks: {r['db_mean']:.2f}s, Fabric: {r['fab_mean']:.2f}s\n"
        f"- Result: {significance} in execution time. {speed_sentence}\n\n"
    )

sophisticated_interpretation.strip()


### T-Test: Databricks vs Fabric by Day

**May 5**  
- **t** = 0.56, **p** = 0.5870  
- **Mean Execution Time** — Databricks: 45.36s, Fabric: 43.77s  
- **Result**: No statistically significant difference in execution time.  
  On average, **Fabric** was faster.

**May 6**  
- **t** = -4.12, **p** = 0.0023  
- **Mean Execution Time** — Databricks: 30.34s, Fabric: 44.66s  
- **Result**: A statistically significant difference in execution time.  
  On average, **Databricks** was faster.

**May 11**  
- **t** = -4.89, **p** = 0.0008  
- **Mean Execution Time** — Databricks: 29.66s, Fabric: 44.53s  
- **Result**: A statistically significant difference in execution time.  
  On average, **Databricks** was faster.


In [0]:
# Compare the two platforms using the runs of all days pooled together
db_all = df.loc[df["Environment"] == "Databricks", "Execution Time (s)"]
fab_all = df.loc[df["Environment"] == "Fabric", "Execution Time (s)"]

# Two-sample t-test without assuming equal variances
t_all, p_all = stats.ttest_ind(db_all, fab_all, equal_var=False)

# Average execution time per platform
db_mean, fab_mean = db_all.mean(), fab_all.mean()

# Wording of the verdict and the platform with the lower mean
if p_all < 0.05:
    overall_significance = "A statistically significant difference"
else:
    overall_significance = "No statistically significant difference"
overall_faster = "Databricks" if db_mean < fab_mean else "Fabric"

# Assemble the markdown report; the empty first and last entries keep the
# leading and trailing newline of the text.
overall_interpretation = "\n".join(
    [
        "",
        "### T-Test: Databricks vs Fabric (All Days Combined)",
        "",
        f"- t = {t_all:.2f}, p = {p_all:.4f}",
        f"- Mean Execution Time — Databricks: {db_mean:.2f}s, Fabric: {fab_mean:.2f}s",
        f"- Result: {overall_significance} in execution time.",
        f"  On average, **{overall_faster}** was faster across all days.",
        "",
    ]
)

overall_interpretation.strip()

### T-Test: Databricks vs Fabric (All Days Combined)

- **t** = -4.18, **p** = 0.0001  
- **Mean Execution Time** — Databricks: 35.12s, Fabric: 44.32s  
- **Result**: A statistically significant difference in execution time.  
  On average, **Databricks** was faster across all days.
