In [58]:
import ee
try:
    ee.Initialize()
except: 
    ee.Authenticate()
    ee.Initialize()

import geemap
from geeml.utils import eeprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
pts = ee.FeatureCollection('projects/ee-geethensingh/assets/postdoc/aliens_sep2018_bioscape2023')
eeprint(pts.limit(5))

In [4]:
def createComposite(imageCollection: str, points: ee.Geometry, year: int, period:str):
    """Creates a composite image of a point over a given year (uses 1 month period for 2018 and 2 months for 2023).
    
    Args:
        imageCollection (str): The collection to use for the composite.
        point (ee.Geometry): The point to create the composite for.
        year (int): The year to create the composite for.
        period (str): Either values of first (Sept-09), second(oct-Nov) or both. 
    
    Returns:
        ee.Image: The composite image.
    """

    # Get the image collection
    ic = ee.ImageCollection(imageCollection)

    if period == 'first':
        startDate = f'{year}-09-01'
        endDate = f'{year}-10-01'
    elif period == 'second':
        startDate = f'{year}-10-01'
        endDate = f'{year}-12-01'
        

    # Mask clouds using cloud score+
    csPlus = ee.ImageCollection('GOOGLE/CLOUD_SCORE_PLUS/V1/S2_HARMONIZED')

    QA_BAND = 'cs_cdf'
    CLEAR_THRESHOLD = 0.65
    
    # Filter the collection to the start and end dates, and point
    medianImage = ic.filterDate(startDate, endDate).filterBounds(points).linkCollection(csPlus, [QA_BAND])\
    .map(lambda img: img.updateMask(img.select(QA_BAND).gte(CLEAR_THRESHOLD)))\
    .median()
    
    return medianImage.divide(10000)

In [24]:
# ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED")

composite2018 = createComposite(imageCollection= ee.ImageCollection("COPERNICUS/S2_HARMONIZED"),
                points=pts.geometry(),
                year=2018,
                period='first')

# Create a composite for 2023
composite2023 = createComposite(imageCollection= ee.ImageCollection("COPERNICUS/S2_HARMONIZED"),
                points=pts.geometry(),
                year=2023,
                period='second')

In [25]:
# Centre map on South Africa, western Cape
Map = geemap.Map(center=[-33.86, 19.21], zoom=8)
Map.addLayer(pts, {}, 'Points')
Map.addLayer(composite2018, {'bands': ['B4', 'B3', 'B2'], 'min': 0, 'max': 0.3}, 'Composite 2018')
Map.addLayer(composite2023, {'bands': ['B4', 'B3', 'B2'], 'min': 0, 'max': 0.3}, 'Composite 2023')
Map

Map(center=[-33.86, 19.21], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=SearchDataG…

In [None]:
def getSpectralSignature(image: ee.Image, point: ee.Geometry, scale: int = 10):
    """Plots the spectral signature of a point on an image.
    
    Args:
        image (ee.Image): The image to plot the spectral signature on.
        point (ee.Geometry): The point to plot the spectral signature for.
        includeMap (bool, optional): Whether to include a map of the image. Defaults to True.
        scale (int, optional): The scale to plot the spectral signature at. Defaults to None.
    """

    # Get the spectral signature of the point
    spectralSignature = image.select('B.*').reduceRegion(reducer = ee.Reducer.first()
                                                        , geometry = point
                                                        , scale = scale)

    # Convert to geodataframe
    pts_gdf = ee.data.computeFeatures({
        'expression': spectralSignature,
        'fileFormat': 'PANDAS_DATAFRAME'
    })
    
    return pts_gdf

In [27]:
# extract values at points
data = composite2018.reduceRegions(collection=pts, reducer=ee.Reducer.first(), scale=10)
data.limit(5)

# convert to geopandas dataframe
gdf18 = ee.data.computeFeatures({
    'expression': data,
    'fileFormat': 'GEOPANDAS_GEODATAFRAME'
})

# Need to set the CRS.
# Make sure it matches the CRS of FeatureCollection geometries.
gdf18.crs = 'EPSG:4326'

gdf18.head()

# extract values at points
data = composite2023.reduceRegions(collection=pts, reducer=ee.Reducer.first(), scale=10)
data.limit(5)

# convert to geopandas dataframe
gdf23 = ee.data.computeFeatures({
    'expression': data,
    'fileFormat': 'GEOPANDAS_GEODATAFRAME'
})

# Need to set the CRS.
# Make sure it matches the CRS of FeatureCollection geometries.
gdf23.crs = 'EPSG:4326'

gdf23.head()

Unnamed: 0,geometry,2018_2023,2023_class,B1,B10,B11,B12,B2,B3,B4,...,QA20,QA60,change,class,cs_cdf,fid,group,layer,notes,path
0,POINT (19.07261 -33.80554),0.006703,,0.1558,0.0014,0.3663,0.2733,0.1629,0.174,0.1985,...,0,0,0,0,8.9e-05,1028,0,valid,,/Users/glennmoncrieff/Documents/qgis/valid.gpk...
1,POINT (19.07795 -33.80792),0.012548,,0.1129,0.001,0.0381,0.0236,0.0883,0.0786,0.0741,...,0,0,0,0,9.1e-05,1029,0,valid,chngg rb,/Users/glennmoncrieff/Documents/qgis/valid.gpk...
2,POINT (19.07821 -33.80821),0.021821,,0.1129,0.001,0.0224,0.0127,0.0946,0.0905,0.0723,...,0,0,0,0,9.1e-05,1030,0,valid,chngg rb,/Users/glennmoncrieff/Documents/qgis/valid.gpk...
3,POINT (19.14952 -33.7358),0.021899,,0.1313,0.00135,0.26125,0.17635,0.18115,0.17685,0.1839,...,0,0,0,0,9.1e-05,1125,0,valid,,/Users/glennmoncrieff/Documents/qgis/valid.gpk...
4,POINT (19.26657 -33.75278),0.003018,,0.1426,0.0015,0.2787,0.2105,0.1429,0.1358,0.1425,...,0,0,0,0,8.7e-05,1139,0,valid,,/Users/glennmoncrieff/Documents/qgis/valid.gpk...


### Models

### Experiment 1: train on 2018 (all), predict on 2018 (all)

In [31]:
gdf18.columns

wavelength23_cols = ['B1', 'B10', 'B11', 'B12', 'B2','B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9']

In [32]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# Load data
X, y = gdf18[wavelength23_cols], gdf18['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

clf.fit(X_train, y_train)

In [33]:
prediction_probabilities = clf.predict_proba(X_test)
prediction_probabilities.shape

# Get the predicted class labels
predicted_labels = clf.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# from .64

Accuracy: 0.64


In [34]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier
from tabpfn_extensions.many_class import ManyClassClassifier

# Create a base TabPFN classifier
base_clf = TabPFNClassifier()

# Load data
X, y = gdf18[wavelength23_cols], gdf18['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Initialize a classifier
clf = ManyClassClassifier(
    estimator=base_clf,
    alphabet_size=10  # Use TabPFN's maximum class limit
)
clf.fit(X_train, y_train)

In [None]:
# Predict probabilities
# prediction_probabilities = clf.predict_proba(X_test)
# print("ROC AUC:", roc_auc_score(y_test, prediction_probabilities[:, 1]))

# Predict labels
predictions = clf.predict(X_test)
print("Accuracy", accuracy_score(y_test, predictions))

#from 0.813

100%|██████████| 16/16 [00:05<00:00,  3.05it/s]

Accuracy 0.7843601895734598





### Experiment 1.5: train on 2018 (no change), predict on 2018 (no change)

In [36]:
df18subset = gdf18[gdf18['change'] == 0].copy()

In [37]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# Load data
X, y = df18subset[wavelength23_cols], df18subset['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

clf.fit(X_train, y_train)

In [38]:
prediction_probabilities = clf.predict_proba(X_test)
prediction_probabilities.shape

# Get the predicted class labels
predicted_labels = clf.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# from .65

Accuracy: 0.63


In [39]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier
from tabpfn_extensions.many_class import ManyClassClassifier

# Create a base TabPFN classifier
base_clf = TabPFNClassifier()

# Load data
X, y = df18subset[wavelength23_cols], df18subset['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Initialize a classifier
clf = ManyClassClassifier(
    estimator=base_clf,
    alphabet_size=10  # Use TabPFN's maximum class limit
)
clf.fit(X_train, y_train)

In [40]:
# Predict probabilities
# prediction_probabilities = clf.predict_proba(X_test)
# print("ROC AUC:", roc_auc_score(y_test, prediction_probabilities[:, 1]))

# Predict labels
predictions = clf.predict(X_test)
print("Accuracy", accuracy_score(y_test, predictions))

# from .785

Accuracy 0.7417503586800573


### Experiment 1.6: train on 2023 (all), predict on 2023 (all)

In [41]:
# use 2018 class for 2023 whereever there is no change
gdf23.loc[gdf23['change'] == 0, '2023_class'] = gdf23['class']
gdf23.dropna(subset=['2023_class'], inplace=True)

In [42]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# Load data
X, y = gdf23[wavelength23_cols], gdf23['2023_class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

clf.fit(X_train, y_train)

In [44]:
prediction_probabilities = clf.predict_proba(X_test)
prediction_probabilities.shape

# Get the predicted class labels
predicted_labels = clf.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# from .54

Accuracy: 0.59


In [45]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier
from tabpfn_extensions.many_class import ManyClassClassifier

# Create a base TabPFN classifier
base_clf = TabPFNClassifier()

# Load data
X, y = gdf23[wavelength23_cols], gdf23['2023_class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Initialize a classifier
clf = ManyClassClassifier(
    estimator=base_clf,
    alphabet_size=10  # Use TabPFN's maximum class limit
)
clf.fit(X_train, y_train)

In [46]:
# Predict probabilities
# prediction_probabilities = clf.predict_proba(X_test)
# print("ROC AUC:", roc_auc_score(y_test, prediction_probabilities[:, 1]))

# Predict labels
predictions = clf.predict(X_test)
print("Accuracy", accuracy_score(y_test, predictions))

# from .690

100%|██████████| 16/16 [00:04<00:00,  3.91it/s]

Accuracy 0.6761565836298933





### Experiment 2: train on 2023 (no change), predict on 2023 (no change)

In [48]:
df23subset = gdf23[gdf23['change'] == 0].copy()

In [49]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# Load data
X, y = df23subset[wavelength23_cols], df23subset['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

clf.fit(X_train, y_train)

In [50]:
prediction_probabilities = clf.predict_proba(X_test)
prediction_probabilities.shape

# Get the predicted class labels
predicted_labels = clf.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# from .60

Accuracy: 0.63


In [51]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier
from tabpfn_extensions.many_class import ManyClassClassifier

# Create a base TabPFN classifier
base_clf = TabPFNClassifier()

# Load data
X, y = df23subset[wavelength23_cols], df23subset['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Initialize a classifier
clf = ManyClassClassifier(
    estimator=base_clf,
    alphabet_size=10  # Use TabPFN's maximum class limit
)
clf.fit(X_train, y_train)

In [52]:
# Predict probabilities
# prediction_probabilities = clf.predict_proba(X_test)
# print("ROC AUC:", roc_auc_score(y_test, prediction_probabilities[:, 1]))

# Predict labels
predictions = clf.predict(X_test)
print("Accuracy", accuracy_score(y_test, predictions))

# from .760

Accuracy 0.7546628407460545


### Experiment 3: train on 2018 (2018), predict on 2023 (no change)

In [53]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# Load data
X, y = gdf18[wavelength23_cols], gdf18['class'].astype(int)

clf.fit(X, y)

In [54]:
prediction_probabilities = clf.predict_proba(df23subset[wavelength23_cols])
prediction_probabilities.shape

# Get the predicted class labels
predicted_labels = clf.predict(df23subset[wavelength23_cols])
# Calculate accuracy
accuracy = accuracy_score(df23subset['class'], predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# from .05

Accuracy: 0.54


In [55]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier
from tabpfn_extensions.many_class import ManyClassClassifier

# Create a base TabPFN classifier
base_clf = TabPFNClassifier()

# Load data
X, y = gdf18[wavelength23_cols], gdf18['class'].astype(int)

# Initialize a classifier
clf = ManyClassClassifier(
    estimator=base_clf,
    alphabet_size=10  # Use TabPFN's maximum class limit
)
clf.fit(X, y)

In [56]:
# Predict probabilities
# prediction_probabilities = clf.predict_proba(X_test)
# print("ROC AUC:", roc_auc_score(y_test, prediction_probabilities[:, 1]))

# Predict labels
predictions = clf.predict(df23subset[wavelength23_cols])
print("Accuracy", accuracy_score(df23subset['class'], predictions))

# from 0.08

100%|██████████| 16/16 [00:06<00:00,  2.52it/s]

Accuracy 0.5767575322812052





### Experiment 4: train on 2018 (all) and 2023 (no change), predict on 2023 (no change)

In [60]:
df1823 = pd.concat([gdf18[['class']+wavelength23_cols], df23subset[['class']+wavelength23_cols]], ignore_index=True)
df1823

n18 = len(gdf18)
n23 = len(df23subset)


In [61]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# Load data
X, y = df1823[wavelength23_cols], df1823['class'].astype(int)
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(X, y, df1823.index, test_size=0.5, random_state=42)

clf.fit(X_train, y_train)

In [62]:
train_from_18 = (idx_train < n18).sum()
train_from_23 = (idx_train >= n18).sum()
test_from_18 = (idx_test < n18).sum()
test_from_23 = (idx_test >= n18).sum()

train_total = len(idx_train)
test_total = len(idx_test)

print("Train proportions:")
print(f"df18: {train_from_18 / train_total:.2f}, df23: {train_from_23 / train_total:.2f}")

print("Test proportions:")
print(f"df18: {test_from_18 / test_total:.2f}, df23: {test_from_23 / test_total:.2f}")


Train proportions:
df18: 0.55, df23: 0.45
Test proportions:
df18: 0.55, df23: 0.45


In [63]:
prediction_probabilities = clf.predict_proba(X_test)
prediction_probabilities.shape

# Get the predicted class labels
predicted_labels = clf.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# from .62

Accuracy: 0.65


In [64]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier
from tabpfn_extensions.many_class import ManyClassClassifier

# Create a base TabPFN classifier
base_clf = TabPFNClassifier()

# Load data
X, y = df1823[wavelength23_cols], df1823['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Initialize a classifier
clf = ManyClassClassifier(
    estimator=base_clf,
    alphabet_size=10  # Use TabPFN's maximum class limit
)
clf.fit(X_train, y_train)

In [65]:
# Predict probabilities
# prediction_probabilities = clf.predict_proba(X_test)
# print("ROC AUC:", roc_auc_score(y_test, prediction_probabilities[:, 1]))

# Predict labels
predictions = clf.predict(X_test)
print("Accuracy", accuracy_score(y_test, predictions))

# from .767

100%|██████████| 16/16 [00:05<00:00,  2.71it/s]

Accuracy 0.7709279688513953





### Experiment 5: train on 2018 (all) and 2023 (no change), predict on 2023 (change). Then train on all 2018 and 2023 and predict on 2023 (all)


In [66]:
df1823 = pd.concat([gdf18[['class']+wavelength23_cols], df23subset[['class']+wavelength23_cols]], ignore_index=True)
df1823

n18 = len(gdf18)
n23 = len(df23subset)


In [67]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# Load data
X, y = df1823[wavelength23_cols], df1823['class'].astype(int)
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(X, y, df1823.index, test_size=0.5, random_state=42)

clf.fit(X_train, y_train)

In [68]:
train_from_18 = (idx_train < n18).sum()
train_from_23 = (idx_train >= n18).sum()
test_from_18 = (idx_test < n18).sum()
test_from_23 = (idx_test >= n18).sum()

train_total = len(idx_train)
test_total = len(idx_test)

print("Train proportions:")
print(f"df18: {train_from_18 / train_total:.2f}, df23: {train_from_23 / train_total:.2f}")

print("Test proportions:")
print(f"df18: {test_from_18 / test_total:.2f}, df23: {test_from_23 / test_total:.2f}")


Train proportions:
df18: 0.55, df23: 0.45
Test proportions:
df18: 0.55, df23: 0.45


In [69]:
prediction_probabilities = clf.predict_proba(X_test)
prediction_probabilities.shape

# Get the predicted class labels
predicted_labels = clf.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# from .63

Accuracy: 0.65


In [70]:
# get pseudo-labels for 2023 change class

df23changesubset = gdf23[gdf23['change'] == 1].copy()
X = df23changesubset[wavelength23_cols]
predicted_labels = clf.predict(X)
df23changesubset['predicted_class'] = predicted_labels

# Ensure the 'class' column is of compatible type (e.g., int)
df23changesubset['class'] = df23changesubset['class'].astype(int)

# Compare predicted and true labels
matches = df23changesubset['predicted_class'] == df23changesubset['class']

# Compute proportion (i.e., accuracy)
proportion = matches.mean()

print(f"Proportion of matches: {proportion:.2f}")


Proportion of matches: 0.15


In [71]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(df23changesubset['class'], df23changesubset['predicted_class']))
print(classification_report(df23changesubset['class'], df23changesubset['predicted_class']))


[[ 4  0  0  0  0  0  0  0  0  0 10]
 [ 0  0  0  0  0  0  0  0  3  0  0]
 [ 4  0  8 12 36  8  4  1 10  0  0]
 [ 1  0  3  6 11  1  0  0 11  0  0]
 [ 6  0  1  3 22  0  0  1  1  0  0]
 [ 1  0  0  1  1  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  3  0  0  0  0  0  0]
 [ 0  0  0  7 80  0  0 18  9  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  2]]
              precision    recall  f1-score   support

           0       0.25      0.29      0.27        14
           1       0.00      0.00      0.00         3
           2       0.67      0.10      0.17        83
           3       0.20      0.18      0.19        33
           4       0.14      0.65      0.24        34
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         4
           9       1.00      0.01      0.02       115
        

In [72]:
df23changesubset['class'] = df23changesubset['predicted_class'].astype(int)
dfallsemisuperivised = pd.concat([gdf18[['class']+wavelength23_cols], df23subset[['class']+wavelength23_cols], df23changesubset[['class']+wavelength23_cols]], ignore_index=True)

In [73]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# Load data
X, y = dfallsemisuperivised[wavelength23_cols], dfallsemisuperivised['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

clf.fit(X_train, y_train)

In [74]:
# Get the predicted class labels
predicted_labels = clf.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# from .65

Accuracy: 0.67


In [75]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier
from tabpfn_extensions.many_class import ManyClassClassifier

# Create a base TabPFN classifier
base_clf = TabPFNClassifier()

# Load data
X, y = df1823[wavelength23_cols], df1823['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Initialize a classifier
clf = ManyClassClassifier(
    estimator=base_clf,
    alphabet_size=10  # Use TabPFN's maximum class limit
)
clf.fit(X_train, y_train)

In [76]:
# Predict probabilities
# prediction_probabilities = clf.predict_proba(X_test)
# print("ROC AUC:", roc_auc_score(y_test, prediction_probabilities[:, 1]))

# Predict labels
predictions = clf.predict(X_test)
print("Accuracy", accuracy_score(y_test, predictions))

100%|██████████| 16/16 [00:05<00:00,  2.75it/s]

Accuracy 0.7696301103179753





In [77]:
# get pseudo-labels for 2023 change class

df23changesubset = gdf23[gdf23['change'] == 1].copy()
X = df23changesubset[wavelength23_cols]
predicted_labels = clf.predict(X)
df23changesubset['predicted_class'] = predicted_labels

# Ensure the 'class' column is of compatible type (e.g., int)
df23changesubset['class'] = df23changesubset['class'].astype(int)

# Compare predicted and true labels
matches = df23changesubset['predicted_class'] == df23changesubset['class']

# Compute proportion (i.e., accuracy)
proportion = matches.mean()

print(f"Proportion of matches: {proportion:.2f}")


100%|██████████| 16/16 [00:04<00:00,  3.57it/s]

Proportion of matches: 0.22





In [78]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(df23changesubset['class'], df23changesubset['predicted_class']))
print(classification_report(df23changesubset['class'], df23changesubset['predicted_class']))


[[ 4  0  0  0  0  0  0  0  0  0 10]
 [ 0  2  0  0  0  0  0  0  1  0  0]
 [ 2  1 19 16 33  6  1  2  3  0  0]
 [ 0  0  5 13 11  1  0  0  3  0  0]
 [ 2  0  0  4 22  0  0  5  1  0  0]
 [ 0  1  1  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  3  0  0  0  0  0  0]
 [ 0  0  0  2 87  0  0 16 10  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  2]]
              precision    recall  f1-score   support

           0       0.50      0.29      0.36        14
           1       0.50      0.67      0.57         3
           2       0.76      0.23      0.35        83
           3       0.36      0.39      0.38        33
           4       0.14      0.65      0.23        34
           5       0.00      0.00      0.00         3
           6       0.50      1.00      0.67         1
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00       115
        

In [79]:
df23changesubset['class'] = df23changesubset['predicted_class'].astype(int)
dfallsemisuperivised = pd.concat([gdf18[['class']+wavelength23_cols], df23subset[['class']+wavelength23_cols], df23changesubset[['class']+wavelength23_cols]], ignore_index=True)

In [80]:
# Create a base TabPFN classifier
base_clf = TabPFNClassifier()

# Load data
X, y = dfallsemisuperivised[wavelength23_cols], dfallsemisuperivised['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Initialize a classifier
clf = ManyClassClassifier(
    estimator=base_clf,
    alphabet_size=10  # Use TabPFN's maximum class limit
)
clf.fit(X_train, y_train)


In [81]:
# Predict labels
predictions = clf.predict(X_test)
print("Accuracy", accuracy_score(y_test, predictions))

# from .786

100%|██████████| 16/16 [00:06<00:00,  2.35it/s]

Accuracy 0.7806757557794902





### Experiment 6: train on 2023 (stable), predict on 2018 (all labelled)

In [82]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# Load data
X, y = df23subset[wavelength23_cols], df23subset['class'].astype(int)

clf.fit(X, y)

In [85]:
prediction_probabilities = clf.predict_proba(gdf18[wavelength23_cols])
prediction_probabilities.shape

# Get the predicted class labels
predicted_labels = clf.predict(gdf18[wavelength23_cols])
# Calculate accuracy
accuracy = accuracy_score(gdf18['class'], predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# from .04

Accuracy: 0.44


In [86]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier
from tabpfn_extensions.many_class import ManyClassClassifier

# Create a base TabPFN classifier
base_clf = TabPFNClassifier()

# Load data
X, y = df23subset[wavelength23_cols], df23subset['class'].astype(int)

# Initialize a classifier
clf = ManyClassClassifier(
    estimator=base_clf,
    alphabet_size=10  # Use TabPFN's maximum class limit
)
clf.fit(X, y)

In [89]:
# Get the predicted class labels
predicted_labels = clf.predict(gdf18[wavelength23_cols])
# Calculate accuracy
accuracy = accuracy_score(gdf18['class'], predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# from .05

Accuracy: 0.50


### Experiment 7: train on 2018 (all) and 2023 (all), predict on 2023 (no change)

In [90]:
# use 2018 class for 2023 whereever there is no change
gdf23.loc[gdf23['change'] == 0, '2023_class'] = gdf23['class']
gdf23.dropna(subset=['2023_class'], inplace=True)

In [91]:
df1823 = pd.concat([gdf18[['class']+wavelength23_cols],
                    gdf23[['2023_class']+wavelength23_cols].rename(columns={'2023_class': 'class'})]
, ignore_index=True)
df1823

n18 = len(gdf18)
n23 = len(gdf23)


In [92]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# Load data
X, y = df1823[wavelength23_cols], df1823['class'].astype(int)
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(X, y, df1823.index, test_size=0.5, random_state=42)

clf.fit(X_train, y_train)

In [93]:
train_from_18 = (idx_train < n18).sum()
train_from_23 = (idx_train >= n18).sum()
test_from_18 = (idx_test < n18).sum()
test_from_23 = (idx_test >= n18).sum()

train_total = len(idx_train)
test_total = len(idx_test)

print("Train proportions:")
print(f"df18: {train_from_18 / train_total:.2f}, df23: {train_from_23 / train_total:.2f}")

print("Test proportions:")
print(f"df18: {test_from_18 / test_total:.2f}, df23: {test_from_23 / test_total:.2f}")


Train proportions:
df18: 0.49, df23: 0.51
Test proportions:
df18: 0.51, df23: 0.49


In [94]:
prediction_probabilities = clf.predict_proba(X_test)
prediction_probabilities.shape

# Get the predicted class labels
predicted_labels = clf.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# from .59

Accuracy: 0.62


In [95]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier
from tabpfn_extensions.many_class import ManyClassClassifier

# Create a base TabPFN classifier
base_clf = TabPFNClassifier()

# Load data
X, y = df1823[wavelength23_cols], df1823['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Initialize a classifier
clf = ManyClassClassifier(
    estimator=base_clf,
    alphabet_size=10  # Use TabPFN's maximum class limit
)
clf.fit(X_train, y_train)

In [96]:
# Predict probabilities
# prediction_probabilities = clf.predict_proba(X_test)
# print("ROC AUC:", roc_auc_score(y_test, prediction_probabilities[:, 1]))

# Predict labels
predictions = clf.predict(X_test)
print("Accuracy", accuracy_score(y_test, predictions))

# from .748

100%|██████████| 16/16 [00:06<00:00,  2.43it/s]

Accuracy 0.7291049199762892



