Climate Change: Earth Surface Temperature

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import os
# Show the current working directory; the relative CSV paths used in later
# cells are resolved against it.
print(os.getcwd())


/Users/lilouastier/Desktop/Climate-Modelling


In [4]:
# Read the raw country-level temperature table from the working directory
# and print its first five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="GlobalLandTemperaturesByCountry.csv")
print(df.head(n=5))

           dt  AverageTemperature  AverageTemperatureUncertainty Country
0  1743-11-01               4.384                          2.294   Åland
1  1743-12-01                 NaN                            NaN   Åland
2  1744-01-01                 NaN                            NaN   Åland
3  1744-02-01                 NaN                            NaN   Åland
4  1744-03-01                 NaN                            NaN   Åland


In [None]:
# 1. LOAD AND PREPROCESS DATA
# NOTE(review): an earlier cell reads this same file from the working directory
# (no 'data/' prefix) — confirm which location is the right one.
df = pd.read_csv('data/GlobalLandTemperaturesByCountry.csv')
df['dt'] = pd.to_datetime(df['dt'])
# Filter for a specific country to make the model specific.
# dropna() discards rows with any missing value; the surviving rows keep their
# original integer index labels.
df = df[df['Country'] == 'Spain'].dropna()


# 2. CREATE MOVING AVERAGE BASELINE
# Mean temperature of each calendar year, broadcast back onto every row of that
# year. Grouping on the year keeps the result on df's own index, so the
# assignment aligns row by row. (Resampling on a 'dt'-indexed copy returns a
# datetime-indexed Series that matches none of df's integer labels, which
# leaves the column all-NaN and makes the dropna() below remove every row.)
df['Yearly_Avg'] = df.groupby(df['dt'].dt.year)['AverageTemperature'].transform('mean')
# Trailing mean over 120 rows as the 'expected' temperature. 120 rows equal
# 10 years only if no months are missing after the dropna() above — TODO confirm.
df['Baseline_10Y'] = df['AverageTemperature'].rolling(window=120).mean()
# Positive anomaly = warmer than the trailing baseline.
df['Anomaly'] = df['AverageTemperature'] - df['Baseline_10Y']


# 3. FEATURE ENGINEERING FOR RANDOM FOREST
# RF needs numerical features to identify non-linear trends
df['Year'] = df['dt'].dt.year
df['Month'] = df['dt'].dt.month
df['Lag_1'] = df['AverageTemperature'].shift(1)  # Previous row's temperature
# Drops the first 119 rows (no full rolling window yet) and any row without a lag.
df = df.dropna()


# Feature matrix and target used by the training cell.
X = df[['Year', 'Month', 'Lag_1']]
y = df['AverageTemperature']




In [None]:
# 4. TRAIN RANDOM FOREST REGRESSOR
# The rows form a time series and 'Lag_1' carries the previous row's
# temperature, so a shuffled split would place months that come after the test
# rows into the training set. shuffle=False holds out the last 20% of rows
# instead (assumes df is in chronological order — TODO confirm).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)
# Fixed seed so the fitted forest is reproducible across runs.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# 5. IDENTIFY STATISTICALLY SIGNIFICANT ANOMALIES
# A row is flagged when its anomaly is more than two standard deviations away
# from zero in either direction, so both warm and cold extremes are selected.
threshold = df['Anomaly'].std() * 2
significant_anomalies = df[df['Anomaly'].abs() > threshold]


# Report the two directions separately: the selection above is two-sided, so
# labelling every hit as a heat anomaly would overstate the count.
heat_count = int((significant_anomalies['Anomaly'] > 0).sum())
cold_count = len(significant_anomalies) - heat_count
print(
    f"Detected {len(significant_anomalies)} significant anomalies "
    f"({heat_count} heat, {cold_count} cold)."
)

In [None]:
# 6. VISUALIZATION
# Observed series, trailing baseline and flagged rows on a single set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['dt'], df['AverageTemperature'], label='Actual Temp', alpha=0.5)
ax.plot(df['dt'], df['Baseline_10Y'], label='10Y Baseline', color='red')
# zorder=3 lifts the markers above both lines; by default matplotlib draws
# scatter collections underneath line artists, which hides the anomalies.
ax.scatter(
    significant_anomalies['dt'],
    significant_anomalies['AverageTemperature'],
    color='black',
    label='Anomalies',
    zorder=3,
)
ax.set_xlabel('Date')
ax.set_ylabel('Average temperature')
ax.set_title('Extreme Heat Anomaly Detection')
ax.legend()
plt.show()