# Data Prep

Preprocess and unify all data.

Before starting to train a ML model, we have to preprocess our data. In this case Sentinel-2 Level-2A imagery is used to generate composites by maximum NDVI across a period of two months. The resulting composites are augmented with indices, like NDVI and all timesteps are reduced into a single raster by deriving statistical parameters, like mean and variance.

The DEM image uploaded beforehand is downsampled to the same resolution as the Sentinel-2 composites by calculating various textile measures.

Then the resulting Sentinel-2 derived raster and DEM derived raster are stacked and a dimensionality reduction is performed. The reduced image can then be used for further processing.

## Define Parameters

In [None]:
# Import Earth Engine API and initialize it
import ee
ee.Initialize()

# Define data constants
SOURCE = 'COPERNICUS/S2_SR'  # Define dataset source
REGION = ee.Geometry.Rectangle([12.6545, 47.9291, 12.6762, 47.9423])  # Define region in EPSG:4326

# Define processing constants
TIMESERIES_MIDDLE = '2019-06-01'  # Define middle of timeseries
TIMESERIES_DURATION = 365  # Define duration of timeseries in days
NUM_COMPOSITES = 12  # Define amount of composites in the timeseries
TEMPORAL_REDUCERS = [ee.Reducer.median(), ee.Reducer.variance()]  # Define temporal reducer

# Define quality measure for composites
def addQuality(image):
    quality_band = image.normalizedDifference(['B5', 'B4']).rename(['quality'])  # NDVI in this case
    return image.addBands(quality_band)

# Define export constants
FILENAME = 'NDVI_composite'  # Name of exported raster
FOLDER = 'Google Earth Engine'  # Name of export folder
SCALE = 10  # Size of pixel in meters
CRS = 'EPSG:32632'  # Coordinate reference system of exported raster
MAX_PIXELS = 1e7  # Maximum number of pixels when exporting

# Define map constants
VIS_PARAMS = {'bands': ['B4', 'B3', 'B2'], 'min': 0, 'max': 0.2, 'gamma': 1}
LAYER_NAME = FILENAME

# Calculate Timeseries Windows for calculating Composites

In [None]:
from datetime import datetime, timedelta

def generate_timewindows(middle_date, num_windows, timeseries_duration=365.242):
    # Calculate the start date of the timeseries
    middle_date = datetime.strptime(middle_date, '%Y-%m-%d')
    current_start = middle_date - timedelta(days=timeseries_duration / 2)

    # Calculate the duration of each timewindow (in days)
    window_duration = timeseries_duration / num_windows
    
    # Initialize a list to store the timewindows as tuples
    timewindows = []
    for _ in range(num_windows):
        # Calculate the start and end dates of each timewindow
        start_date = current_start
        end_date = current_start + timedelta(days=window_duration)

        start_date = start_date.strftime('%Y-%m-%d')
        end_date = end_date.strftime('%Y-%m-%d')
        timewindow = (start_date, end_date)
        
        # Append the timewindow as a tuple (start, end) to the list
        timewindows.append(timewindow)
        
        # Move the middle_date to the next timewindow
        current_start += timedelta(days=window_duration)
    
    return timewindows

timewindows = generate_timewindows(TIMESERIES_MIDDLE, NUM_COMPOSITES, TIMESERIES_DURATION)

# Show Sentinel-2 (Level-2A) imagery

In [None]:
def maskS2clouds(image, cld_thresh=0.8, snw_thresh=0.8):
  # -----------------------------------------------------------------------
  # qa = image.select('QA60')

  # # Bits 10 and 11 are clouds and cirrus, respectively.
  # cloudBitMask = 1 << 10
  # cirrusBitMask = 1 << 11

  # # Both flags should be set to zero, indicating clear conditions.
  # mask = qa.bitwiseAnd(cloudBitMask).eq(0).And(qa.bitwiseAnd(cirrusBitMask).eq(0))
  # -----------------------------------------------------------------------

  # Use MSK_CLDPRB and MSK_SNWPRB bands with a threshold of 100
  cld_prb = image.select('MSK_CLDPRB').divide(100)
  snw_prb = image.select('MSK_SNWPRB').divide(100)
  mask = cld_prb.lt(cld_thresh).And(snw_prb.lt(snw_thresh))

  return image.updateMask(mask).divide(10000)

def get_bands(image, print=False):
  bands = [band['id'] for band in image.getInfo()['bands']]

  if print:
    print(bands)

  return bands

def get_reducer_name(reducer, print=False):
  reducer_name = reducer.getInfo()['type'].split('.')[-1]

  if print:
    print(reducer_name)

  return reducer_name

In [None]:
# Define methods for calculating indices
def add_ndvi(image):
    ndvi_band = image.normalizedDifference(['B5', 'B4']).rename(['NDVI'])
    return image.addBands(ndvi_band)

def add_ndwi(image):
    ndwi_band = image.normalizedDifference(['B3', 'B5']).rename(['NDWI'])
    return image.addBands(ndwi_band)

add_indices = [add_ndvi, add_ndwi]

In [None]:
from IPython.display import Image

composites = []
for start, end in timewindows:
    # Define dataset filters
    filter_date = ee.Filter.date(start, end)  # inclusive start, exclusive end
    filter_region = ee.Filter.bounds(REGION)

    # Read dataset
    dataset = ee.ImageCollection(SOURCE).filter(filter_date).filter(filter_region).map(maskS2clouds)
    
    # Calculate indices
    for add_index in add_indices:
        dataset = dataset.map(add_index)

    # Create max NDVI pixel composite
    dataset = dataset.map(addQuality)  # Add quality band
    composite = dataset.qualityMosaic('quality')  # Choose max quality pixels

    # Remove quality band
    remaining_bands = composite.bandNames().getInfo()
    remaining_bands.remove('quality')
    composite = composite.select(remaining_bands)  # Remove quality band

    # Add to time series
    composites.append(composite)

# Create image collection
composites = ee.ImageCollection(composites)

# Apply remporal reducers to image collection
reduced_images = []
for temporal_reducer in TEMPORAL_REDUCERS:
    reduced_images.append(composites.reduce(temporal_reducer))

# Stack images
stacked_images = ee.ImageCollection(reduced_images).toBands()
stacked_images = stacked_images.reproject(crs=CRS, scale=SCALE)

# Resample and clip to region
stacked_images = stacked_images.reproject(crs=CRS, scale=SCALE)
stacked_images = stacked_images.clip(REGION)

# Define RGB bands
rgb_bands = get_bands(stacked_images)[3:0:-1]

print('Stacked images:', stacked_images.bandNames().getInfo())

# Show image
Image(url=stacked_images.getThumbUrl({
    **VIS_PARAMS,
    'bands': rgb_bands,
    'dimensions': 500}))

# Correlation Matrix

In [None]:
# Calculate correlation matrix for all bands in the image collection
import pandas as pd

# Select two bands to calculate correlation coefficient
df = pd.DataFrame(stacked_images.reduceRegion(ee.Reducer.toList(), maxPixels=1e8).getInfo())
correlation_matrix = df.corr().fillna(1)
correlation_matrix.head()

## Download np.ndarray

In [None]:
# Get the data from the image as a numpy array
import requests
import io
import numpy as np

# url = composites.getDownloadUrl({
#     'scale': SCALE,
#     'region': REGION,
#     'format': 'NPY'
#     })

# response = requests.get(url)
# data = np.load(io.BytesIO(response.content))

## Downsample DEM

In [None]:
from IPython.display import Image

# Load the DEM
dem = ee.Image('projects/leaf-type-mixture/assets/DEM')
band_name = dem.bandNames().getInfo()[0]

# Calculate textile measures
dem_mean = dem.reduceResolution(ee.Reducer.mean(), maxPixels=1024).rename(band_name + '_mean')
dem_variance = dem.reduceResolution(ee.Reducer.variance(), maxPixels=1024).rename(band_name + '_variance')
dem_glcm = dem.multiply(1000).toUint16().glcmTexture()  # naive approach to avoid memory issues

# Stack the DEM and its gradient
dem = dem.addBands([dem_mean, dem_variance, dem_glcm])

# Clip to the region of interest
dem = dem.reproject(crs=CRS, scale=SCALE)
dem = dem.clip(REGION)

# Print all bands
print(dem.bandNames().getInfo())

Image(url=dem.getThumbUrl({
    'min': -5,
    'max': 50,
    'bands': ['b1_shade', 'b1_diss', 'b1_prom'],
    'region': REGION,
    'dimensions': 500}))

# Rasterize Plot

In [None]:
import geopandas as gpd

# Constants, zero for conifers, one for broadleafs
CONIFER = 0
BROADLEAF = 1

# Map leaf type to a number
leaf_type_dict = {'Abies alba': CONIFER,
                  'Acer campestre': BROADLEAF,
                  'Acer platanoides': BROADLEAF,
                  'Acer pseudoplatanus': BROADLEAF,   
                  'Aesculus hippocastanum': BROADLEAF,
                  'Alnus glutinose': BROADLEAF,
                  'Betula ': BROADLEAF,
                  'Carpinus betulus': BROADLEAF,
                  'Fagus sylvatica': BROADLEAF,
                  'Fraxinus excelsior': BROADLEAF,
                  'Juglans regia': BROADLEAF,
                  'Larix decidua': CONIFER,
                  'Picea abies': CONIFER,
                  'Pinus sylvestris': CONIFER,
                  'Populus ': BROADLEAF,
                  'Populus tremula': BROADLEAF,
                  'Prunus avium': BROADLEAF,
                  'Pseudotsuga menziesii': CONIFER,
                  'Quercus ': BROADLEAF,
                  'Quercus rubra': BROADLEAF,
                  'Salix ': BROADLEAF,
                  'Sorbus aria': BROADLEAF,
                  'Sorbus aucuparia': BROADLEAF,
                  'Sorbus torminalis': BROADLEAF,
                  'Thuja plicata': CONIFER,
                  'Tilia ': BROADLEAF,
                  'Ulmus glabra': BROADLEAF,
                  'Unidentified broadleaf': BROADLEAF,
                  'Unidentified conifer': CONIFER}

# Read in the plot data
df = gpd.read_file('../data/raw/Plot.gpkg').to_crs('EPSG:4326')

# Create Longitude, Latitude, and Broadleaf columns
df['Longitude'] = df['geometry'].x
df['Latitude'] = df['geometry'].y
df['Species'] = df['Latin'] + ' ' + df['Mnemonic']
df['Broadleaf'] = df['Species'].map(leaf_type_dict)

# Keep only the columns we need
df = df[['Longitude', 'Latitude', 'Broadleaf']]

# Convert pandas DataFrame to Earth Engine FeatureCollection
fc = ee.FeatureCollection([ee.Feature(ee.Geometry.Point([row['Longitude'], row['Latitude']]), {'Broadleaf': row['Broadleaf']}) for index, row in df.iterrows()])

# Rasterize the FeatureCollection using reduceToImage
plot = fc.reduceToImage(['Broadleaf'], ee.Reducer.mean())
plot = plot.reproject(crs=CRS, scale=SCALE)
plot = plot.clip(REGION)

# Height Mask

In [None]:
heightmask = dem.select('b1').gt(20)

masked_s2 = stacked_images.updateMask(heightmask)
masked_s2 = masked_s2.reproject(crs=CRS, scale=SCALE)
masked_s2 = masked_s2.clip(REGION)

Image(url=masked_s2.getThumbUrl({
    'min': 0,
    'max': 0.2,
    'bands': ['0_B4_median', '0_B3_median', '0_B2_median'],
    'dimensions': 500}))

## Save (& fuse DEM with Time Series)

In [None]:
import requests
import rasterio
from rasterio.io import MemoryFile

def download_image(image, scale, crs):
    download_params = {
        'scale': scale,
        'crs': crs,
        'format': 'GEO_TIFF',
    }
    url = image.getDownloadURL(download_params)

    return requests.get(url)

def save_image_to_file(image, file_path, mask, bands):
    with MemoryFile(image) as memfile, MemoryFile(mask) as mask_memfile:
        with memfile.open() as dataset, mask_memfile.open() as mask_dataset:
            profile = dataset.profile
            with rasterio.open(file_path, 'w', **profile) as dst:
                raster = dataset.read()
                mask = mask_dataset.read()
                raster[mask == 0] = np.nan
                dst.write(raster)
                dst.descriptions = tuple(bands)

def save_image(image, file_path, scale, crs):
    image_response = download_image(image, scale, crs)
    mask_response = download_image(image.mask(), scale, crs)
    
    if image_response.status_code == mask_response.status_code == 200:
        save_image_to_file(image_response.content,
                           file_path,
                           mask=mask_response.content,
                           bands=image.bandNames().getInfo())
        print('Image downloaded and saved successfully!')
    else:
        print('Failed to download the image.')

fusion = stacked_images.addBands(dem)

# Merge all masks with logical AND operation
band_names = fusion.bandNames().getInfo()
mask = fusion.select(band_names[0]).mask()
for band_name in band_names[1:]:
    mask = mask.And(fusion.select(band_name).mask())
band_names = plot.bandNames().getInfo()
for band_name in band_names:
    mask = mask.And(plot.select(band_name).mask())

# Apply the mask to both images
fusion = fusion.updateMask(mask)
plot = plot.updateMask(mask)

fusion = fusion.clip(REGION)
plot = plot.clip(REGION)

# save_image
save_image(fusion, 'fusion.tif', scale=SCALE, crs=CRS)
save_image(plot, 'plot.tif', scale=SCALE, crs=CRS)

# Open the raster dataset
input_path = 'fusion.tif'
with rasterio.open(input_path) as src:
    # read description of the raster
    print(src.descriptions)
input_path = 'plot.tif'
with rasterio.open(input_path) as src:
    # read description of the raster
    print(src.descriptions)

# Image to Dict

In [None]:
# Convert the image to grayscale
gray = stacked_images.select(['0_B4_median', '0_B3_median', '0_B2_median']).reduce(ee.Reducer.mean())  # reduce three bands to one
gray = gray.toUint16()

# Calculate the GLCM
glcm = gray.glcmTexture(3)

# Select the texture measures to export
texture_measures = glcm.select(
    ['mean_contrast', 'mean_diss', 'mean_shade', 'mean_asm', 'mean_prom'])

# Reduce the texture measures by mean
reduced_texture_measures = texture_measures.reduceRegion(
    reducer=ee.Reducer.mean(),
    geometry=REGION,
    scale=SCALE,
    crs=CRS)

# Convert to dictionary
reduced_texture_measures = reduced_texture_measures.getInfo()
reduced_texture_measures

# Interactive Map

In [None]:
import folium

# Create a map.
lon, lat = REGION.centroid().getInfo()['coordinates']
# lat, lon = 45.77, 4.855
my_map = folium.Map(location=[lat, lon], zoom_start=15)

# Define a method for displaying Earth Engine image tiles on a folium map.
def add_ee_layer(self, ee_image_object, vis_params, name):
    """Adds a method for displaying Earth Engine image tiles to folium map."""
    map_id_dict = ee.Image(ee_image_object).getMapId(vis_params)
    folium.raster_layers.TileLayer(
        tiles=map_id_dict['tile_fetcher'].url_format,
        attr='Map Data &copy; <a href="https://earthengine.google.com/">Google Earth Engine</a>',
        name=name,
        overlay=True,
        control=True
    ).add_to(self)

# Add Earth Engine drawing method to folium.
folium.Map.add_ee_layer = add_ee_layer

# Add the stacked composites to the map.
vis_params = {**VIS_PARAMS, 'bands': ['0_B4_median', '0_B3_median', '0_B2_median']}
my_map.add_ee_layer(stacked_images, vis_params, 'Sentinel-2 Composite')

# Add the plot to the map.
vis_params = {
    'min': 0,'max': 1,
    'palette': ['05450a','086a10', '54a708', '78d203', '009900', 'c6b044',
                'dcd159', 'dade48', 'fbff13', 'b6ff05', '27ff87', 'c24f44',
                'a5a5a5', 'ff6d4c', '69fff8', 'f9ffa4', '1c0dff']
}
my_map.add_ee_layer(plot, vis_params, 'Leaf Type Mixture')

# Add a layer control panel to the map.
my_map.add_child(folium.LayerControl())

# Display the map.
display(my_map)

# Inference and Evaluation

In [3]:
import numpy as np
from scipy.special import ndtri
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn import metrics
import pandas as pd
import rasterio

# Load dataset
with rasterio.open('fusion.tif') as src:
    raster = src.read()
    bands = src.count
    band_names = src.descriptions
    X = raster.transpose(1, 2, 0).reshape(-1, bands)

with rasterio.open('plot.tif') as src:
    y = src.read(1).flatten()

# remove nan values
mask = ~np.isnan(X).any(axis=1)
mask = np.logical_and(mask, ~np.isnan(y))
X = X[mask]
y = y[mask]

def based_train_test_split(*arrays, confidence_interval_width=0.5, confidence_niveau=0.95, seed=None):
    z = ndtri(confidence_niveau)
    n = z**2 / confidence_interval_width**2
    min_sample_size = np.ceil(n).astype(int)

    return train_test_split(*arrays, test_size=min_sample_size, random_state=seed)

# Split the dataset into a train, eval, and test set
X_remaining, X_test, y_remaining, y_test = based_train_test_split(
    X, y,
    confidence_interval_width=0.1,
    confidence_niveau=0.95,
    seed=1
)
X_train, X_eval, y_train, y_eval = based_train_test_split(
    X_remaining, y_remaining,
    confidence_interval_width=0.1,
    confidence_niveau=0.95,
    seed=1
)

print(f"Train set size: {len(X_train)}")
print(f"Eval set size: {len(X_eval)}")
print(f"Test set size: {len(X_test)}")

# Sample all train dataset samples where the label is 0 or 1
mask = np.logical_or(y_train == 0, y_train == 1)
X_train = X_train[mask]
y_train = y_train[mask]

# Create a list of ML models, with 5 different hyperparameters each
models = {'xgb regressor with mean squared error': xgb.XGBRegressor(n_estimators=1000, tree_method="hist", eval_metric=metrics.mean_squared_error, early_stopping_rounds=10),
          'xgb regressor with mean absolute error': xgb.XGBRegressor(n_estimators=1000, tree_method="hist", eval_metric=metrics.mean_absolute_error, early_stopping_rounds=10),
          'random forest regressor with squared error': RandomForestRegressor(n_estimators=1000, criterion='squared_error', n_jobs=-1)}

# Train all models on the train set
for model in models:
    try:
        models[model].fit(X_train, y_train, eval_set=[(X_eval, y_eval)])
    except:
        models[model].fit(X_train, y_train)

# Define the metrics to evaluate the models on
metrics = [
    metrics.mean_absolute_error,
    metrics.r2_score,
    metrics.median_absolute_error,
    metrics.explained_variance_score,
    metrics.max_error,
    metrics.mean_squared_error,
]

X_eval = X_test
y_eval = y_test

# Evaluate the models on the eval set and create a new dataframe
eval_df = pd.DataFrame({
    metric.__name__.replace('_', ' ').title(): [metric(y_eval, model.predict(X_eval))
                                                for model in models.values()]
    for metric in metrics
}, index=models.keys())
eval_df

Train set size: 1929
Eval set size: 271
Test set size: 271
[0]	validation_0-rmse:0.38919	validation_0-mean_squared_error:0.15147
[1]	validation_0-rmse:0.35602	validation_0-mean_squared_error:0.12675
[2]	validation_0-rmse:0.33938	validation_0-mean_squared_error:0.11518
[3]	validation_0-rmse:0.32998	validation_0-mean_squared_error:0.10888
[4]	validation_0-rmse:0.32197	validation_0-mean_squared_error:0.10367
[5]	validation_0-rmse:0.32103	validation_0-mean_squared_error:0.10306
[6]	validation_0-rmse:0.31593	validation_0-mean_squared_error:0.09981
[7]	validation_0-rmse:0.31677	validation_0-mean_squared_error:0.10034
[8]	validation_0-rmse:0.31391	validation_0-mean_squared_error:0.09854
[9]	validation_0-rmse:0.31363	validation_0-mean_squared_error:0.09836
[10]	validation_0-rmse:0.31417	validation_0-mean_squared_error:0.09870
[11]	validation_0-rmse:0.31376	validation_0-mean_squared_error:0.09845
[12]	validation_0-rmse:0.31292	validation_0-mean_squared_error:0.09792
[13]	validation_0-rmse:0.312

Unnamed: 0,Mean Absolute Error,R2 Score,Median Absolute Error,Explained Variance Score,Max Error,Mean Squared Error
xgb regressor with mean squared error,0.225622,0.500815,0.169094,0.503089,0.988431,0.092204
xgb regressor with mean absolute error,0.221301,0.516491,0.166118,0.519198,0.971655,0.089308
random forest regressor with squared error,0.220909,0.548062,0.185693,0.549812,0.84413,0.083477


In [None]:
feature_importance = {model_name: [] for model_name in models.keys()}
for model_name, model in models.items():
    for importance in model.feature_importances_:
        feature_importance[model_name].append(importance)

feature_importance = pd.DataFrame(feature_importance, index=band_names)
# sorted(list(zip(band_names, importances)), key=lambda x: x[1], reverse=True)
feature_importance['mean'] = feature_importance.mean(axis=1)
# sort by mean
feature_importance = feature_importance.sort_values(by='mean', ascending=False)
feature_importance

In [None]:
# Compare actual and predicted mean
np.mean(y_eval), np.mean(list(models.values())[0].predict(X_eval))

In [None]:
# Compare prediction with ground truth
import matplotlib.pyplot as plt

model = list(models.values())[0]

# Load dataset
with rasterio.open('fusion.tif') as src:
    raster = src.read()
    bands = src.count
    image = raster.transpose(1, 2, 0).reshape(-1, bands)

with rasterio.open('plot.tif') as src:
    annotation = src.read(1)

prediction = model.predict(image)
prediction = prediction.reshape(annotation.shape)

# mask nan values
mask = np.isnan(annotation)
prediction[mask] = np.nan
prediction = np.clip(prediction, 0, 1)

plt.imshow(prediction, cmap='magma')
plt.show()
plt.imshow(annotation, cmap='magma')
plt.show()

# Experiments (can be deleted)

In [None]:
import folium

# Create a map.
lat, lon = 47.9357, 12.66535
my_map = folium.Map(location=[lat, lon], zoom_start=15)

# Define a method for displaying Earth Engine image tiles on a folium map.
def add_ee_layer(self, ee_image_object, vis_params, name):
    """Adds a method for displaying Earth Engine image tiles to folium map."""
    map_id_dict = ee.Image(ee_image_object).getMapId(vis_params)
    folium.raster_layers.TileLayer(
        tiles=map_id_dict['tile_fetcher'].url_format,
        attr='Map Data &copy; <a href="https://earthengine.google.com/">Google Earth Engine</a>',
        name=name,
        overlay=True,
        control=True
    ).add_to(self)

# Add Earth Engine drawing method to folium.
folium.Map.add_ee_layer = add_ee_layer

# Add the stacked composites to the map.
dem = ee.Image('projects/leaf-type-mixture/assets/DEM')
my_map.add_ee_layer(dem.multiply(100).toUint16().entropy(ee.Kernel.square(4)), {'min': 0, 'max': 10}, 'DEM')

# Add a layer control panel to the map.
my_map.add_child(folium.LayerControl())

# Display the map.
display(my_map)

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
import numpy as np

# Load dataset
data = load_iris()
X, y = data.data, data.target

n_samples = len(y)
min_val_size = 30
n_splits = int(n_samples / min_val_size)

# Initialize KFold
kf = KFold(n_splits=n_splits, shuffle=True)#, random_state=42)

# Perform cross-validation
for fold, (train_index, val_index) in enumerate(kf.split(X)):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    # Print fold information
    print(f"Fold {fold+1}:")
    print(f"  Training samples: {len(X_train)}")
    print(f"  Validation samples: {len(X_val)}")

    print(f"  Training samples: {y_train}")
    print(f"  Validation samples: {y_val}")

# Output the number of splits
print("Number of splits:", kf.get_n_splits())


Fold 1:
  Training samples: 120
  Validation samples: 30
  Training samples: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2]
  Validation samples: [0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2]
Fold 2:
  Training samples: 120
  Validation samples: 30
  Training samples: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2]
  Validation samples: [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2]
Fold 3:
  Training samples: 120
  Validation samples: 30
  Training samples: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

In [15]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
iris = load_iris()
logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200, random_state=0)
distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1'])
clf = RandomizedSearchCV(logistic, distributions, random_state=0)
search = clf.fit(iris.data, iris.target)
search.best_params_
search.predict(iris.data[:1000]), iris.target[:1000]

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2

In [13]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

def evaluate_model(model, X, y):
    scores = -cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
    rmse_scores = np.sqrt(scores)
    return rmse_scores

def tune_hyperparameters(model, param_grid, X, y):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                               scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X, y)
    best_model = grid_search.best_estimator_
    best_rmse = np.sqrt(-grid_search.best_score_)
    return best_model, best_rmse

iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_regressor = RandomForestRegressor(random_state=42)
gb_regressor = GradientBoostingRegressor(random_state=42)

rf_rmse_scores = evaluate_model(rf_regressor, X_train, y_train)
gb_rmse_scores = evaluate_model(gb_regressor, X_train, y_train)

print("Random Forest RMSE scores:", rf_rmse_scores)
print("Gradient Boosting RMSE scores:", gb_rmse_scores)

param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

best_gb_model, best_gb_rmse = tune_hyperparameters(gb_regressor, param_grid_gb, X_train, y_train)

print("Best Gradient Boosting model:", best_gb_model)
print("Best Gradient Boosting RMSE:", best_gb_rmse)

rf_regressor.fit(X_train, y_train)
best_gb_model.fit(X_train, y_train)

rf_test_predictions = rf_regressor.predict(X_test)
gb_test_predictions = best_gb_model.predict(X_test)

rf_test_rmse = np.sqrt(mean_squared_error(y_test, rf_test_predictions))
gb_test_rmse = np.sqrt(mean_squared_error(y_test, gb_test_predictions))

print("Random Forest test RMSE:", rf_test_rmse)
print("Best Gradient Boosting test RMSE:", gb_test_rmse)

Random Forest RMSE scores: [0.13724977 0.10070584 0.38427421 0.10045729 0.21360985]
Gradient Boosting RMSE scores: [0.15317587 0.16812654 0.40824027 0.14468759 0.26957703]
Best Gradient Boosting model: GradientBoostingRegressor(learning_rate=0.01, max_depth=5, n_estimators=300,
                          random_state=42)
Best Gradient Boosting RMSE: 0.23362877869206836
Random Forest test RMSE: 0.037193189340702336
Best Gradient Boosting test RMSE: 0.04167816551670047


In [16]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier  # Make sure you have xgboost installed

# Split the data into training, validation, and test sets
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_eval, y_eval, test_size=0.5, random_state=42)

# Define the parameter grids for each algorithm
param_dist = {
    'randomforest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        # other hyperparameters
    },
    'xgboost': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 4, 5],
        # other hyperparameters
    },
    'gradientboost': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 4, 5],
        # other hyperparameters
    }
}

# Perform RandomizedSearchCV for each algorithm
for model_name, param_grid in param_dist.items():
    if model_name == 'randomforest':
        model = RandomForestClassifier()
    elif model_name == 'xgboost':
        model = XGBClassifier()
    elif model_name == 'gradientboost':
        model = GradientBoostingClassifier()
    
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        n_iter=10,  # Adjust the number of iterations as needed
        scoring='accuracy',
        cv=5,  # Number of cross-validation folds for hyperparameter tuning
        verbose=2,
        n_jobs=-1,
        random_state=42
    )
    
    random_search.fit(X_val, y_val)
    
    best_model = random_search.best_estimator_
    # Evaluate best_model on the test set and save the results
    
    print(f'Best {model_name} parameters: {random_search.best_params_}')

# Choose the best-performing model based on the test results
# Train the chosen model on the combined training and validation data




Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best randomforest parameters: {'n_estimators': 100, 'max_depth': None}
Fitting 5 folds for each of 9 candidates, totalling 45 fits




Best xgboost parameters: {'n_estimators': 100, 'max_depth': 3}
Fitting 5 folds for each of 9 candidates, totalling 45 fits




Best gradientboost parameters: {'n_estimators': 100, 'max_depth': 3}


In [17]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]
lasso = linear_model.Lasso()
print(cross_val_score(lasso, X, y, cv=3))

[0.3315057  0.08022103 0.03531816]
