# Traunstein

If a tree has multiple stems, each stem is treated as a separate entity. Only living stems are used.

In [None]:
import geopandas as gpd

# Read the tree-coordinate shapefile and reproject it to EPSG:4326
gdf = gpd.read_file(r"C:\Users\hofin\Downloads\Baumkoordinaten_mh\Baumkoordinaten_mh.shp")
gdf = gdf.to_crs(epsg=4326)
# Lookup from the combined "<Latin> <Mnemonic>" string (built below) to an evergreen flag.
# Keys ending in a space (e.g. "Betula ") presumably belong to rows with an empty
# Mnemonic — TODO confirm against the shapefile.
evergreen = {
    "Abies alba": True,
    "Acer campestre": False,
    "Acer platanoides": False,
    "Acer pseudoplatanus": False,
    "Aesculus hippocastanum": False,
    "Alnus glutinose": False,
    "Betula ": False,
    "Carpinus betulus": False,
    "Fagus sylvatica": False,
    "Fraxinus excelsior": False,
    "Juglans regia": False,
    "Larix decidua": False,
    "Picea abies": True,
    "Pinus sylvestris": True,
    "Populus ": False,
    "Populus tremula": False,
    "Prunus avium": False,
    "Pseudotsuga menziesii": True,
    "Quercus ": False,
    "Quercus rubra": False,
    "Salix ": False,
    "Sorbus aria": False,
    "Sorbus aucuparia": False,
    "Sorbus torminalis": False,
    "Thuja plicata": True,
    "Tilia ": False,
    "Ulmus glabra": False,
    "Unidentified broadleaf": False,
    "Unidentified conifer": True,
}
# Join the two name columns with a single space so the values match the keys above
gdf["Latin"] = gdf[["Latin", "Mnemonic"]].agg(' '.join, axis=1)
# Names that are not a key of `evergreen` become NaN here
gdf["Evergreen"] = gdf["Latin"].map(evergreen)

In [None]:
# Work on an explicit copy: assigning columns on a column subset of `gdf` is the
# classic trigger for pandas' SettingWithCopyWarning.
plot = gdf[["Evergreen", "Date", "DBH"]].copy()
plot.columns = ["conifer", "date", "dbh"]
# Fails if any species was missing from the `evergreen` lookup (NaN cannot be cast to int)
plot["conifer"] = plot["conifer"].astype(int)
plot["dbh"] = plot["dbh"] / 1000  # presumably mm -> m; TODO confirm the unit of DBH
# Coordinates come from the reprojected geometry column
plot["latitude"] = gdf["geometry"].y
plot["longitude"] = gdf["geometry"].x
plot.to_csv("plot.csv", index=False)

# Naturwaldreservate

TODO: Resample the dataset so that the labels are uniformly distributed over [0, 1], or use SMOTE or another technique for imbalanced data. See https://imbalanced-learn.org/stable/index.html

In [None]:
import pandas as pd
import geopandas as gpd

# Lookup from the species codes in the "BA" column to a conifer flag.
# NOTE(review): 'Lae' and 'JLä' (presumably larch) are True here, while the
# Traunstein `evergreen` lookup marks "Larix decidua" as False — conifer and
# evergreen are different labels; confirm which one the targets should use.
# NOTE(review): 'ei' looks like a lower-case variant of 'Ei' — confirm.
is_conifer = {
    'Ah': False,
    'BAh': False,
    'Eibe': True,
    'Bu': False,
    'Ei': False,
    'SAh': False,
    'ScDo': False,
    'FAh': False,
    'Li': False,
    'Elsb': False,
    'Fi': True,
    'HBu': False,
    'Ta': True,
    'BUl': False,
    'As': False,
    'Es': False,
    'WLi': False,
    'FUl': False,
    'SErl': False,
    'Ul': False,
    'Mehlb': False,
    'Erl': False,
    'Lae': True,
    'Bi': False,
    'SHol': False,
    'VKir': False,
    'Has': False,
    'SLi': False,
    'WDo': False,
    'Kie': True,
    'Stro': True,
    'TrKir': False,
    'SPa': False,
    'Hartr': False,
    'Pfaffh': False,
    'WErl': False,
    'Vobe': False,
    'Hol': False,
    'StEi': False,
    'Dgl': True,
    'WDom': False,
    'Kir': False,
    'GPa': False,
    'Birne': False,
    'Wei': False,
    'Spi': False,
    'Zir': True,
    'KreuzD': False,
    'WObst': False,
    'SWei': False,
    'JLä': True,
    'ei': False,
    'TrEi': False,
}

# Load the raw NWR spreadsheet and derive the columns of the cleaned table
df = pd.read_excel("../data/raw/NWR.xlsx")

# "NWR" is used as the grouping id (`rep`) by the cells that follow
rep = df["NWR"]

year = df["aufnahmeja"]#.map(lambda x: 2020 if not isinstance(x, str) else int(float(x.replace(",", "."))))
dbh = df["DM"] / 100  # presumably cm -> m; TODO confirm the unit of DM
# Species codes that are not a key of `is_conifer` become NaN
conifer = df["BA"].map(is_conifer)

# Centroids are given in EPSG:32632 and converted to EPSG:4326
coords = gpd.points_from_xy(df["CENTROID_X"], df["CENTROID_Y"], crs="EPSG:32632").to_crs("EPSG:4326")
latitude = coords.y
longitude = coords.x

cleaned_df = pd.DataFrame({
    "rep": rep,
    "conifer": conifer.astype(int),  # fails if any species code was unmapped (NaN)
    # NOTE(review): if `year` is read as float this yields e.g. "2020.0-01-01" — confirm the dtype
    "date": year.map(lambda x: f"{x}-01-01"),
    "dbh": dbh,
    "latitude": latitude,
    "longitude": longitude,
})

In [None]:
# Write one CSV per distinct `rep` value, leaving out the `rep` column itself
for rep_id in cleaned_df["rep"].unique():
    in_rep = cleaned_df["rep"] == rep_id
    rep_df = cleaned_df[in_rep].drop(columns="rep")
    rep_df.to_csv(f"../data/interim/{rep_id}.csv", index=False)

In [None]:
# Create ground truths
from ltm.data import compute_target
from tqdm.notebook import tqdm
from pathlib import Path

targets_folder = "../data/processed/generalization/targets/"
Path(targets_folder).mkdir(parents=True, exist_ok=True)

# Compute a target raster for every rep that does not have one yet
for rep_id in tqdm(cleaned_df["rep"].unique()):
    target_path = f"{targets_folder}{rep_id}.tif"
    if Path(target_path).exists():
        continue  # raster was already written by an earlier run

    plot = pd.read_csv(f"../data/interim/{rep_id}.csv")
    if plot["latitude"].notna().all():  # for rep_id 132
        compute_target(target_path, plot)

In [None]:
import rasterio
import numpy as np

# Count the non-NaN pixels in band 1 of every target raster, per rep and in total
total = 0
counts = {}
for rep_id in cleaned_df["rep"].unique():
    # 132 is the rep called out in the target-creation cell for its missing latitudes
    if rep_id == 132:
        continue
    with rasterio.open(targets_folder + f"{rep_id}.tif") as src:
        count = np.sum(~np.isnan(src.read(1)))
        total += count
        counts[rep_id] = count

# The label is German for "total area [hectares]"; dividing by 100 assumes that one
# pixel covers 100 m² — TODO confirm the raster resolution
print("Gesamtfläche [Hektar]:", total / 100)

# Inference on test data

In [None]:
from glob import glob
from datetime import datetime
from ltm.models import create_data, sentinel_composite
from pathlib import Path
from tqdm.notebook import tqdm
import pandas as pd

target_folder = "../data/processed/generalization/targets/"

# Rep ids are taken from the file names of the existing target rasters
ids = glob("../data/processed/generalization/targets/*.tif")
ids = sorted(int(Path(target_id).stem) for target_id in ids)

for rep_id in tqdm(ids):
    # Read the CSV; only the first distinct date determines the year of the rep
    plot = pd.read_csv(f"../data/interim/{rep_id}.csv")
    date = plot["date"].unique()[0]
    year = datetime.strptime(date, "%Y-%m-%d").year

    if year < 2018:
        print(f"Skipping {rep_id} because year is {year} (before 2018)")
        continue

    # Data for a rep goes into a folder named after it, next to its target raster
    data_folder = str(Path(target_folder) / str(rep_id))
    target_path = str(Path(target_folder) / f"{rep_id}.tif")

    # Create folder
    Path(data_folder).mkdir(parents=True, exist_ok=True)
    create_data(year, target_path, data_folder)
    # data_path = str(Path(data_folder) / "data.tif")
    # time_window = (datetime(year, 1, 1), datetime(year + 1, 1, 1))

    # if not Path(data_path).exists():
    #     sentinel_composite(target_path, data_path, num_composites=6, sentinel_bands=["TCI_R", "TCI_G", "TCI_B"], indices=["NDVI"], time_window=time_window)

In [None]:
from ltm.features import load_raster

# Collect the per-rep rasters stored as targets/<rep_id>/<year>/data.tif.
# Only numeric <rep_id> folders are kept: the refit cell writes its rasters to
# targets/data/<year>/data.tif, which matches the same pattern but has no
# targets/data.tif target raster and would break a re-run of this cell.
data_paths = [
    data_path
    for data_path in glob("../data/processed/generalization/targets/*/*/data.tif")
    if Path(data_path).parent.parent.name.isdigit()
]

# `data_list`, `target_list` and `data_paths` stay index-aligned; the per-rep
# evaluation cell zips them together.
data_list = []
target_list = []
for data_path in tqdm(data_paths):
    rep_id = Path(data_path).parent.parent.name
    target_folder = Path(data_path).parent.parent.parent
    target_path = str(target_folder / f"{rep_id}.tif")

    target = load_raster(target_path)
    data = load_raster(data_path)
    # Optional filter that used to be tried here: 0.2 <= target.mean() <= 0.8

    # Keep only pixels that have a label
    mask = target.notna()
    data = data[mask]
    target = target[mask]

    data_list.append(data)
    target_list.append(target)

test_data = pd.concat(data_list).reset_index(drop=True)
test_target = pd.concat(target_list).reset_index(drop=True)

from ltm.features import interpolate_data

# NOTE(review): only the concatenated frame is interpolated; the entries of
# `data_list` are left as loaded.
test_data = interpolate_data(test_data)

In [None]:
import dill
from sklearn.base import clone

# Create data for multiple years
target_path = "../data/processed/target.tif"
data_folder = "../data/processed/generalization/targets/data/"
Path(data_folder).mkdir(parents=True, exist_ok=True)
refit_years = [2018, 2019, 2020, 2021, 2022, 2023]

for refit_year in tqdm(refit_years, desc="Years"):
    create_data(refit_year, target_path, data_folder)

# Concatenate all data into one dataframe (one concat instead of growing a frame
# inside the loop, which re-copies all previous years on every iteration)
print("Combining data...")
yearly_data = []
for refit_year in tqdm(refit_years, desc="Years"):
    data_path = Path(data_folder) / f"{refit_year}/data.tif"
    yearly_data.append(load_raster(str(data_path)))
total_data = pd.concat(yearly_data)

total_data = interpolate_data(total_data)

# Create target data: the same target raster is repeated once per year
target = load_raster(target_path)
total_target = pd.concat([target] * len(refit_years))

total_data = total_data.reset_index(drop=True)
total_target = total_target.reset_index(drop=True)

# Drop rows with NaN label
# `heights` only feeds the height filter that is currently commented out of the mask
heights = load_raster("../data/processed/DEM_median.tif")
heights = pd.concat([heights] * len(refit_years))
mask = total_target.notna() #& (heights > 20)
data, target = total_data[mask], total_target[mask]

refitted_path = "../models/refitted.pkl"

# dill.load runs arbitrary code from the file; only load model files you trust
with open(refitted_path, "rb") as f:
    refitted = dill.load(f)

# Refit an unfitted copy of the stored estimator on the multi-year data
refitted = clone(refitted)
refitted.fit(data, target)

In [None]:
# TODO: Compute dummy RMSE

# Mean RMSE on old data: 0.45560590764009135 with a mean conifer proportion of 0.602...
# Mean RMSE on new data: 0.32092856066562325 with a mean conifer proportion of 0.148...
# RMSE is way better without pure pixels
# Check whether the data is read incorrectly
# Try target computed from evergreen proportion (with larix) instead of conifer proportion
# Try with only endmember pixels
# Try without endmember pixels
# Try selecting areas with 0.5 +- 0.1 conifer proportion and < 50 % endmember pixels

from glob import glob
from sklearn.metrics import root_mean_squared_error

# Every pickle in ../models except the "*_study.pkl" files is treated as a model
pkls = glob("../models/*.pkl")
studies = glob("../models/*_study.pkl")

models = [pkl for pkl in pkls if pkl not in studies]

# Fit each stored model on the training data and score it on the test set
for model_path in models:
    with open(model_path, "rb") as f:
        model = dill.load(f)

    try:
        model.fit(data, target)
    except ValueError as err:
        # Still best-effort, but say which model was left out and why
        print(f"Skipping {model_path}: {err}")
        print()
        continue

    print(f"Model: {model_path}")
    print(f"Score: {root_mean_squared_error(test_target, model.predict(test_data))}")
    print()

In [None]:
# RMSE of the refitted model on the whole test set.
# The commented-out mask would restrict the score to targets that are not whole numbers.
# mask = ~(test_target == test_target.astype(int))
tmp_target = test_target#[mask]

from sklearn.metrics import root_mean_squared_error
root_mean_squared_error(tmp_target, refitted.predict(test_data))#[mask])

In [None]:
# Compute RMSE per rep
from sklearn.metrics import root_mean_squared_error
import matplotlib.pyplot as plt

# One row per rep: number of pixels and RMSE of the refitted model.
# Relies on data_list, target_list and data_paths being in the same order.
sizes = []
rmses = []
ids = []
for single_data, single_target, data_path in zip(data_list, target_list, data_paths):
    # NOTE(review): `single_data` did not go through interpolate_data, unlike
    # `test_data` — confirm the model accepts it as is
    pred = refitted.predict(single_data)
    rmse = root_mean_squared_error(single_target, pred)

    # plt.scatter(target, pred, alpha=0.5)
    # plt.show()

    sizes.append(len(single_data))
    rmses.append(rmse)
    # The folder two levels above data.tif is named after the rep id
    ids.append(Path(data_path).parent.parent.name)

results = pd.DataFrame({
    "size": sizes,
    "rmse": rmses,
}, index=ids)

results

In [None]:
results.sort_values("rmse")

In [None]:
# Reverse direction: fit a fresh clone on the test set and predict the training data.
# From here on `refitted` is the model trained on the test set.
refitted = clone(refitted)
refitted.fit(test_data, test_target)

prediction = refitted.predict(data)

plt.scatter(target, prediction, alpha=0.1)

In [None]:
root_mean_squared_error(target, prediction)

# -> performance improves if filtered by height

In [None]:
# root_mean_squared_error(target, prediction)
from sklearn.model_selection import cross_val_score

cross_val_score(refitted, data, target, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)

In [None]:
# root_mean_squared_error(target, prediction)
from sklearn.model_selection import cross_val_score

cross_val_score(refitted, test_data, test_target, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)

In [None]:
# NOTE(review): presumably a deliberate stop for "Run All" — nothing below this
# line in the cell is executed
raise Exception

import dill
from ltm.features import load_raster

def predict_on(data_path, prediction_path, model):
    """Predict on the raster at ``data_path`` and write the result to ``prediction_path``.

    ``prediction_path`` must already exist: its profile, array shape and NaN
    pixels are reused for the output, and the file is then overwritten.

    Args:
        data_path: Path of the raster passed to ``load_raster``.
        prediction_path: Path of the existing raster that is read as a template
            and then overwritten with the prediction.
        model: Fitted estimator providing ``predict``.
    """
    data = load_raster(data_path)
    prediction = model.predict(data)

    # Read the template raster once; it supplies profile, shape and NaN mask
    with rasterio.open(prediction_path) as src:
        profile = src.profile
        template = src.read()

    shape = template.shape
    nan_mask = np.isnan(template)

    # Write prediction to target raster
    with rasterio.open(prediction_path, "w", **profile) as dst:
        reshaped = prediction.reshape(shape)
        # Pixels that were NaN in the template stay NaN in the prediction
        reshaped[nan_mask] = np.nan
        dst.write(reshaped)
        dst.descriptions = ("Conifer Proportion",)

# Load the stored model and predict on the 2020 data raster.
# dill.load runs arbitrary code from the file; only load model files you trust.
refitted_path = f"../models/refitted.pkl"

with open(refitted_path, "rb") as f:
    refitted = dill.load(f)

# Both paths are relative to the notebook's working directory
data_path = f"data/2020/data.tif"
prediction_path = f"prediction.tif"

predict_on(data_path, prediction_path, refitted)

# DEPRECATED

In [None]:
# Convert the data to two rasters
import rasterio
from ltm.data import compute_label
from rasterio.plot import show
import matplotlib as mpl
import matplotlib.pyplot as plt

plot = pd.read_csv("../data/interim/5.csv")

# The raster itself is not recomputed here; "plot.tif" must already exist
# compute_label(target_path="plot.tif", plot=plot)

# Colour scale fixed to [0, 1]; `mappable`, `cmap` and `norm` are also used by
# the cell that shows prediction.tif
cmap = mpl.cm.viridis
norm = mpl.colors.Normalize(vmin=0, vmax=1)
mappable = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)

with rasterio.open("plot.tif") as src:
    fig, ax = plt.subplots()
    fig.colorbar(mappable, ax=ax, label="Conifer Proportion", shrink=0.8)
    ax.axis("off")

    show(src, ax=ax, cmap=cmap, norm=norm)

In [None]:
# Create data for the target area
from ltm.models import create_data

create_data(2020, "plot.tif", "data")

In [None]:
# Train best model on new data of the study area
import dill
from ltm.features import load_raster
import numpy as np

def predict_on(data_path, prediction_path, model):
    """Predict on the raster at ``data_path`` and write the result to ``prediction_path``.

    ``prediction_path`` must already exist: its profile, array shape and NaN
    pixels are reused for the output, and the file is then overwritten.

    Args:
        data_path: Path of the raster passed to ``load_raster``.
        prediction_path: Path of the existing raster that is read as a template
            and then overwritten with the prediction.
        model: Fitted estimator providing ``predict``.
    """
    data = load_raster(data_path)
    prediction = model.predict(data)

    # Read the template raster once; it supplies profile, shape and NaN mask
    with rasterio.open(prediction_path) as src:
        profile = src.profile
        template = src.read()

    shape = template.shape
    nan_mask = np.isnan(template)

    # Write prediction to target raster
    with rasterio.open(prediction_path, "w", **profile) as dst:
        reshaped = prediction.reshape(shape)
        # Pixels that were NaN in the template stay NaN in the prediction
        reshaped[nan_mask] = np.nan
        dst.write(reshaped)
        dst.descriptions = ("Conifer Proportion",)

# Load the stored model and predict on the 2020 data raster.
# dill.load runs arbitrary code from the file; only load model files you trust.
refitted_path = f"../models/refitted.pkl"

with open(refitted_path, "rb") as f:
    refitted = dill.load(f)

# Both paths are relative to the notebook's working directory
data_path = f"data/2020/data.tif"
prediction_path = f"prediction.tif"

predict_on(data_path, prediction_path, refitted)

In [None]:
# Show the prediction with the colour scale (`mappable`, `cmap`, `norm`) defined
# in the cell that shows plot.tif
with rasterio.open("prediction.tif") as src:
    fig, ax = plt.subplots()
    fig.colorbar(mappable, ax=ax, label="Conifer Proportion", shrink=0.8)
    ax.axis("off")

    show(src, ax=ax, cmap=cmap, norm=norm)

In [None]:
# RMSE between the label raster and the prediction raster, over labelled pixels only
target = load_raster("plot.tif")
prediction = load_raster("prediction.tif")

# predict_on copies the NaN pixels of the file it overwrites, so the prediction
# is NaN wherever that template was
mask = ~np.isnan(target)
target = target[mask]
prediction = prediction[mask]

from sklearn.metrics import root_mean_squared_error

root_mean_squared_error(target, prediction)

# Convert SVGs to PNGs

In [None]:
from glob import glob
from pathlib import Path
from cairosvg import svg2png

# Render every SVG figure to a PNG with the same name in the same folder
svgs = glob("../reports/figures/*/*.svg")
pngs = []

for svg in svgs:
    png = str(Path(svg).with_suffix(".png"))
    pngs.append(png)
    svg2png(url=svg, write_to=png)

# TMP

In [None]:
from glob import glob
from ltm.features import load_raster

# Model pickles only: "*_study.pkl" files and paths containing "cache" are excluded.
# glob gives no ordering guarantee, so regressors[i] may differ between machines.
regressors = glob("../models/*.pkl")
studies = glob("../models/*_study.pkl")

regressors = [regressor for regressor in regressors if regressor not in studies and "cache" not in regressor]

In [None]:
# Load the target raster with the 2020 data and keep only labelled pixels.
# This rebinds the notebook-wide `data` and `target` used by other cells.
target = load_raster("../data/processed/target.tif")
data = load_raster("../data/processed/generalization/data/2020/data.tif")

mask = target.notna()
target = target[mask]
data = data[mask]

In [None]:
import numpy as np

def resample(data, target, sampler=None, num_samples=None):
    """Resample rows so that their targets follow the values drawn by ``sampler``.

    For each of ``num_samples`` draws, the row whose target is closest to the
    drawn value is selected (the first such row on ties). Rows may be selected
    more than once.

    Args:
        data: Feature DataFrame.
        target: Target values; a Series is aligned to ``data`` by index label.
        sampler: Zero-argument callable returning one target value per call.
            Defaults to a uniform draw from [0, 1).
        num_samples: Number of rows to draw. Defaults to ``len(data)``.

    Returns:
        Tuple ``(data, target)`` with the selected rows. Index labels and column
        dtypes of the input are kept.

    Raises:
        ValueError: If rows are requested but no row has a non-NaN target.
    """
    df = data.copy()
    df["target"] = target

    if sampler is None:
        sampler = lambda: np.random.uniform(0, 1)

    if num_samples is None:
        num_samples = len(df)

    # Search the column aligned to `df`, not the raw `target` argument: a position
    # found in a filtered or re-indexed `target` would select the wrong row of `df`.
    target_values = df["target"].to_numpy(dtype=float)
    positions = [
        int(np.nanargmin(np.abs(target_values - sampler())))
        for _ in range(num_samples)
    ]

    # One positional take keeps the column dtypes; building a frame from row
    # Series would upcast mixed columns.
    res_data = df.iloc[positions]

    data = res_data.drop("target", axis=1)
    target = res_data["target"]

    return data, target

In [None]:
def bin(series, num_bins: int, lower_bound: float = None, upper_bound: float = None):
    """Split ``series`` into ``num_bins`` equal-width, half-open value bins.

    The first bin is open towards -inf and the last towards +inf, so values
    outside ``[lower_bound, upper_bound]`` land in the outer bins. NaN values
    land in no bin.

    Args:
        series: Numeric pandas Series to split.
        num_bins: Number of bins, at least 1.
        lower_bound: Lower end of the binned range. Defaults to ``series.min()``.
        upper_bound: Upper end of the binned range. Defaults to ``series.max()``.

    Returns:
        List of ``num_bins`` Series, ordered from the lowest to the highest bin.

    Raises:
        ValueError: If ``num_bins < 1`` or ``upper_bound < lower_bound``.
    """
    if lower_bound is None:
        lower_bound = series.min()
    if upper_bound is None:
        upper_bound = series.max()

    if num_bins < 1:
        raise ValueError(f"num_bins must be at least 1, got {num_bins}")
    if upper_bound < lower_bound:
        raise ValueError(
            f"upper_bound ({upper_bound}) must not be smaller than lower_bound ({lower_bound})"
        )

    bin_size = (upper_bound - lower_bound) / num_bins
    inner_edges = [lower_bound + i * bin_size for i in range(1, num_bins)]
    thresholds = [float("-inf"), *inner_edges, float("inf")]

    return [
        series[(series >= lower) & (series < upper)]
        for lower, upper in zip(thresholds[:-1], thresholds[1:])
    ]


def find_largest_bin(series, min_size=6):
    """Return how many bins ``series`` can be split into before a bin gets too small.

    The bin count is increased from 1 until ``bin(series, count)`` contains a bin
    with fewer than ``min_size`` values; the last count that still passed is
    returned. Despite the name, the result is a number of bins, not a bin width.

    Args:
        series: Numeric pandas Series.
        min_size: Minimum number of values required in every bin; must be positive.

    Returns:
        The last passing bin count, or 0 if even a single bin is too small.

    Raises:
        ValueError: If ``min_size`` is not positive; the search would never stop.
    """
    if min_size <= 0:
        raise ValueError(f"min_size must be positive, got {min_size}")

    num_bins = 1
    while min(len(part) for part in bin(series, num_bins)) >= min_size:
        num_bins += 1

    return num_bins - 1

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# find_largest_bin returns a number of bins, so `bin_size` is a bin count here.
# np.ceil(6 * 5/4) evaluates to 8.0; presumably 6 samples per class for SMOTE,
# scaled up for a 4/5 training split — TODO confirm
bin_size = find_largest_bin(test_target, np.ceil(6 * 5/4))

# SMOTE needs complete features and discrete labels: drop columns that contain
# NaN and round the scaled target to integer class labels
X = test_data.dropna(axis=1)
# y = target
y = np.round(test_target * bin_size).astype(int)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/5)#, stratify=y)

X_resampled, y_resampled = SMOTE().fit_resample(X, y)
# X_resampled, y_resampled = X_train, y_train

# Scale the class labels back to the original target range
y_resampled = y_resampled / bin_size
# # y_test = y_test / bin_size

# model = RandomForestRegressor(n_jobs=-1, random_state=42)
# model.fit(X_resampled, y_resampled)
# # y_pred = model.predict(X_test)

In [None]:
from sklearn.dummy import DummyRegressor
from copy import deepcopy
import matplotlib.pyplot as plt

# X_train = res_data.drop("target", axis=1)
# y_train = res_data["target"]

# dummy = DummyRegressor()
# dummy = deepcopy(refitted)
# dummy.fit(X_train, y_train)
# dummy
# dill.load runs arbitrary code from the file; only load model files you trust
with open("../models/SVR.pkl", "rb") as f:
    dummy = dill.load(f)


# mask = total_target.notna()
# data, target = total_data[mask], total_target[mask]


dummy.fit(data, target)

dummy_pred = dummy.predict(test_data)

# Plot mixed pixels only (targets that are neither 0 nor 1). The filtered values
# get their own names: overwriting `test_target` here would leave it shorter than
# `test_data` and misaligned for every later cell.
mask = (test_target != 0) & (test_target != 1)
mixed_target = test_target[mask]
mixed_pred = dummy_pred[mask]

plt.scatter(mixed_target, mixed_pred, alpha=0.1)

In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split

# Train a random forest on `data`/`target` and evaluate it on a resampled test set
model = RandomForestRegressor(n_jobs=-1, random_state=42)
# model = DummyRegressor()
# model = deepcopy(refitted)

# cv_data = data#test_data#
# cv_target = target#test_target#

# X_train, X_test, y_train, y_test = train_test_split(cv_data, cv_target, test_size=1/5, shuffle=True, random_state=42)

X_train, y_train = data, target
X_test, y_test = test_data, test_target
# X_train, y_train = test_data, test_target
# X_test, y_test = data, target
# resample is called without a sampler, so it draws uniformly at random and the
# test set differs between runs
X_test, y_test = resample(X_test, y_test)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# cv = KFold(n_splits=5, shuffle=False, random_state=None)
# cross_val_score(model, cv_data, cv_target, cv=cv, scoring="neg_root_mean_squared_error", n_jobs=-1)

In [None]:
root_mean_squared_error(y_test, y_pred)

# Traunstein w/o resampling: 0.287823994888369
# Traunstein w/ resampling: 0.3163959917390755
# Traunstein generalization w/ resampling: 0.2881937400211223
# Traunstein dummy generalization w/ resampling: 0.3057842981793853

# NWR w/o resampling: 0.17353728474049634
# NWR w resampling: 0.3411365410789887
# NWR generalization w resampling: 0.3271471925202301  <- this should decrease
# NWR dummy generalization w resampling: 0.3075913736494596
# NWR refit generalization w resampling: 0.3461570392864428

In [None]:
import matplotlib.pyplot as plt

plt.scatter(y_test, y_pred, alpha=0.1)

In [26]:
from ltm.features import load_raster

import smogn

# SMOGN experiment on mixed pixels (targets that are neither NaN, 0 nor 1).
# NOTE(review): the recorded output of this cell is an UnboundLocalError about
# `phi_params`, so the call below did not complete.
data = load_raster("../data/processed/ground_truth/data_2A.tif")
target = load_raster("../data/processed/target.tif")

mask = target.notna() & (target != 0) & (target != 1)
data = data[mask]
target = target[mask]

# smoter is given a single frame and the name of its target column
data["target"] = target
data.reset_index(inplace=True, drop=True)

## conduct smogn
housing_smogn = smogn.smoter(
    data = data, 
    y = "target",
    rel_coef = 0.01,
    rel_method = "extremes",
)

UnboundLocalError: local variable 'phi_params' referenced before assignment

# CLASSIFICATION

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Binary classification on pure pixels only (target exactly 0 or 1)
mask = (target == 0) | (target == 1)
target_clf = target[mask]

model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(data[mask], target_clf)

# Same filter for the test set; `mask` is rebound here
mask = (test_target == 0) | (test_target == 1)
test_target_clf = test_target[mask]

# NOTE(review): test_data[mask] needs `test_target` to share the index of
# `test_data` — verify it was not filtered by an earlier cell
pred = model.predict(test_data[mask])

from sklearn.metrics import accuracy_score

print(accuracy_score(test_target_clf, pred))

from sklearn.metrics import confusion_matrix

confusion_matrix(test_target_clf, pred)

In [None]:
# NOTE(review): `dummy_pred` is predicted from `test_data`, while `target` is the
# training target elsewhere in this notebook — confirm the two have matching rows
root_mean_squared_error(target, dummy_pred)  # 0.30224987799459396 with mean

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): `res_data` is not assigned at notebook level in any cell shown
# (it is only a local name inside resample) — this cell depends on stale state
X_test = res_data.drop("target", axis=1)
y_test = res_data["target"]
y_pred = refitted.predict(X_test)

plt.scatter(y_test, y_pred, alpha=0.1)

In [None]:
from sklearn.metrics import root_mean_squared_error

root_mean_squared_error(y_test, y_pred)

In [None]:
# Train on a random subset of the SMOTE output with as many rows as `test_data`
# (drawn without replacement and unseeded, so it differs between runs)
train_idcs = np.random.choice(len(X_resampled), len(test_data), replace=False)
X_train, y_train = X_resampled.iloc[train_idcs], y_resampled.iloc[train_idcs]

model = RandomForestRegressor(n_jobs=-1, random_state=42)
model.fit(X_train, y_train)
# NOTE(review): dropna(axis=1) on `data` must leave the same columns as it did
# for `test_data` when X_resampled was built — verify
y_pred = model.predict(data.dropna(axis=1))
plt.scatter(target, y_pred, alpha=0.1)

In [None]:
root_mean_squared_error(target, y_pred)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_predict

# NOTE(review): `train_data` and `train_target` are not assigned in any cell shown
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)

# Out-of-fold predictions from 5-fold cross-validation
predictions = cross_val_predict(model, train_data, train_target, cv=5, n_jobs=-1)

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): `res_data` is not assigned at notebook level in any cell shown
X_test = res_data.drop("target", axis=1).dropna(axis=1)
y_test = res_data["target"]

# NOTE(review): if `model` is the estimator from the cross_val_predict cell it has
# not been fitted itself — presumably this raises NotFittedError; verify
predictions = model.predict(X_test.dropna(axis=1))

plt.scatter(y_test, predictions, alpha=0.1)

In [None]:
from glob import glob

# Pickles in ../tmp/models whose path contains neither "cache" nor "study"
model_paths = glob("../tmp/models/*.pkl")
model_paths = [path for path in model_paths if "cache" not in path and "study" not in path]

model_paths

In [None]:
# KDE plot of the conifer proportion
import seaborn as sns

# Targets strictly between 0 and 1, i.e. without the pure pixels
wo_endmembers = target[(target < 1) & (target > 0)]
ax = sns.kdeplot(wo_endmembers, bw_adjust=0.2)

In [None]:
import dill

# Print the RMSE of every stored model; models whose predict raises ValueError
# are skipped without a message.
# NOTE(review): `train_target` and `new_train_data` are not assigned in any cell shown
for model_path in model_paths:
    with open(model_path, "rb") as f:
        model = dill.load(f)

    try:
        print(model_path, root_mean_squared_error(train_target, model.predict(new_train_data)))
    except ValueError:
        continue

In [None]:
# Import necessary libraries
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from imblearn.over_sampling import RandomOverSampler

# Generate synthetic imbalanced regression data
X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, noise=0.1, random_state=42)

# Introduce imbalance by duplicating some target values
y[900:] = y.max()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply Random Over-Sampling to address imbalance
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Train a linear regression model on the resampled data
model = LinearRegression()
model.fit(X_resampled, y_resampled)

# Evaluate the model on the original test set
score = model.score(X_test, y_test)
print("Model R-squared score on original test set:", score)


In [None]:
import dill
from sklearn.metrics import root_mean_squared_error
import matplotlib.pyplot as plt

# Score the stored model regressors[0] as loaded (no refit) on `data`/`target`
with open(regressors[0], "rb") as f:
    model = dill.load(f)

prediction = model.predict(data)
print(root_mean_squared_error(target, prediction))

plt.scatter(target, prediction, alpha=0.1)

In [None]:
import dill
from sklearn.metrics import root_mean_squared_error
import matplotlib.pyplot as plt

# Score the stored model regressors[1] as loaded (no refit) on `data`/`target`
with open(regressors[1], "rb") as f:
    model = dill.load(f)

prediction = model.predict(data)
print(root_mean_squared_error(target, prediction))

plt.scatter(target, prediction, alpha=0.1)

In [None]:
import dill
from sklearn.metrics import root_mean_squared_error
import matplotlib.pyplot as plt

# Score the stored model regressors[2] as loaded (no refit) on `data`/`target`
with open(regressors[2], "rb") as f:
    model = dill.load(f)

prediction = model.predict(data)
print(root_mean_squared_error(target, prediction))

plt.scatter(target, prediction, alpha=0.1)

In [None]:
import dill
from sklearn.metrics import root_mean_squared_error
import matplotlib.pyplot as plt

# Score the stored model regressors[3] as loaded (no refit) on `data`/`target`
with open(regressors[3], "rb") as f:
    model = dill.load(f)

prediction = model.predict(data)
print(root_mean_squared_error(target, prediction))

plt.scatter(target, prediction, alpha=0.1)

In [None]:
import dill
from sklearn.metrics import root_mean_squared_error
import matplotlib.pyplot as plt

# Score the stored model regressors[4] as loaded (no refit) on `data`/`target`
with open(regressors[4], "rb") as f:
    model = dill.load(f)

prediction = model.predict(data)
print(root_mean_squared_error(target, prediction))

plt.scatter(target, prediction, alpha=0.1)

In [None]:
import dill
from sklearn.metrics import root_mean_squared_error
import matplotlib.pyplot as plt

# Score the stored model regressors[5] as loaded (no refit) on `data`/`target`
with open(regressors[5], "rb") as f:
    model = dill.load(f)

prediction = model.predict(data)
print(root_mean_squared_error(target, prediction))

plt.scatter(target, prediction, alpha=0.1)