Load Data

In [2]:
import pandas as pd
import sqlite3

# Read the whole avg_temperatures_cleaned table into a DataFrame.
conn = sqlite3.connect("../climate_data.db")
try:
    df = pd.read_sql_query("SELECT * FROM avg_temperatures_cleaned", conn)
finally:
    # Release the database handle even when the query raises (e.g. missing table).
    conn.close()

# Quick sanity check of what was loaded.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()


     City  Year  AvgMaxTemp
0  Zurich  2014       14.83
1  Zurich  2015       14.78
2  Zurich  2016       13.97
3  Zurich  2017       14.46
4  Zurich  2018       15.89
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        15 non-null     object 
 1   Year        15 non-null     int64  
 2   AvgMaxTemp  15 non-null     float64
dtypes: float64(1), int64(1), object(1)
memory usage: 492.0+ bytes
None


Trend Analysis per City

In [3]:
from scipy.stats import linregress

# Fit a straight line AvgMaxTemp ~ Year for every city and report the fit.
for city in df["City"].unique():
    # Take this city's rows, drop missing values, coerce both regression
    # columns to numbers, and drop whatever the coercion turned into NaN.
    df_city = (
        df.loc[df["City"] == city]
        .dropna(subset=["Year", "AvgMaxTemp"])
        .assign(
            Year=lambda rows: pd.to_numeric(rows["Year"], errors="coerce"),
            AvgMaxTemp=lambda rows: pd.to_numeric(rows["AvgMaxTemp"], errors="coerce"),
        )
        .dropna()
    )

    # The fit is only attempted when at least two rows survive cleaning.
    if df_city.shape[0] < 2:
        print(f"{city}: Not enough data for regression.\n")
        continue

    years = df_city["Year"]
    temps = df_city["AvgMaxTemp"]

    try:
        slope, intercept, r_value, p_value, std_err = linregress(years, temps)

        print(f"{city}:")
        print(f"  Slope = {slope:.3f}")
        print(f"  p-value = {p_value:.4f}")
        print(f"  R² = {r_value**2:.3f}\n")
    except Exception as exc:
        # Report the failure for this city and carry on with the next one.
        print(f"{city}: Regression failed – {exc}")


Zurich:
  Slope = 0.180
  p-value = 0.5009
  R² = 0.163

Rome:
  Slope = 0.370
  p-value = 0.0742
  R² = 0.708

London:
  Slope = 0.137
  p-value = 0.4403
  R² = 0.208



Comparison

In [7]:
from scipy.stats import ttest_ind

# Compare the AvgMaxTemp values of two cities with an independent two-sample t-test.
city1, city2 = "Zurich", "Rome"

# Label-based selection of each city's temperature column.
temps1 = df.loc[df["City"] == city1, "AvgMaxTemp"]
temps2 = df.loc[df["City"] == city2, "AvgMaxTemp"]

# ttest_ind is called with its default arguments; per the SciPy documentation
# that means equal population variances are assumed (equal_var=True).
comparison = ttest_ind(temps1, temps2)
t_stat, p = comparison.statistic, comparison.pvalue

print(f"T-test {city1} vs. {city2}:")
print(f"  t-statistic = {t_stat:.3f}, p-value = {p:.15f}")


T-test Zurich vs. Rome:
  t-statistic = -14.257, p-value = 0.000000571082955


Summary Table

In [8]:
# Build one summary row per city (slope, p-value, R² of AvgMaxTemp ~ Year) and
# print the table ordered by p-value.
results = []

for city in df["City"].unique():
    # Coerce the regression columns to numbers and drop unusable rows, so a
    # stray non-numeric or missing value cannot poison or abort the fit.
    df_city = df[df["City"] == city].copy()
    df_city["Year"] = pd.to_numeric(df_city["Year"], errors="coerce")
    df_city["AvgMaxTemp"] = pd.to_numeric(df_city["AvgMaxTemp"], errors="coerce")
    df_city = df_city.dropna(subset=["Year", "AvgMaxTemp"])

    # A city that cannot be fitted stays in the table with NaN statistics
    # instead of raising and losing the rows of every other city.
    slope = p = r_squared = float("nan")
    if len(df_city) >= 2:
        try:
            slope, intercept, r, p, _ = linregress(df_city["Year"], df_city["AvgMaxTemp"])
            r_squared = r**2
        except ValueError:
            # linregress rejects degenerate input (e.g. all years identical);
            # the NaN placeholders set above are kept for this city.
            pass

    results.append({"City": city, "Slope": slope, "p-value": p, "R²": r_squared})

# Naming the columns explicitly keeps sort_values working when there are no rows.
df_results = pd.DataFrame(results, columns=["City", "Slope", "p-value", "R²"])
print(df_results.sort_values("p-value"))


     City  Slope   p-value        R²
1    Rome  0.370  0.074167  0.707523
2  London  0.137  0.440308  0.207852
0  Zurich  0.180  0.500893  0.162576
