In [None]:
import ee
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import random
import matplotlib.pyplot as plt

In [None]:
import geopandas as gpd


Authenticate and initialize Earth Engine.

**You will need an Earth Engine account. If you have a Google account, you can request one at https://earthengine.google.com/**

You will be prompted to enter your credentials after running the lines below

In [None]:
# Seed both the stdlib and NumPy global random generators for reproducibility
SEED = 2023
for seed_rng in (random.seed, np.random.seed):
    seed_rng(SEED)

In [None]:
# Country boundaries: resolve the 'naturalearth_lowres' dataset path through
# gpd.datasets, then read it into a GeoDataFrame
naturalearth_path = gpd.datasets.get_path('naturalearth_lowres')
world = gpd.read_file(naturalearth_path)

  world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))


In [None]:
# Read the three input tables in one pass: training rows, test rows and the
# submission template
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

# Show the first rows of the training table
train.head()

Unnamed: 0,ID,Lat,Lon,Target
0,ID_SJ098E7S2SY9,34.162491,70.763668,0
1,ID_CWCD60FGJJYY,32.075695,48.492047,0
2,ID_R1XF70RMVGL3,14.542826,33.313483,1
3,ID_0ZBIDY0PEBVO,14.35948,33.284108,1
4,ID_C20R2C0AYIT0,14.419128,33.52845,0


In [None]:
# Preview the first rows of the test table (same columns as train, without Target)
test.head()

Unnamed: 0,ID,Lat,Lon
0,ID_9ZLHTVF6NSU7,34.254835,70.348699
1,ID_LNN7BFCVEZKA,32.009669,48.535526
2,ID_SOYSG7W04UH3,14.431884,33.399991
3,ID_EAP7EXXV8ZDE,14.281866,33.441224
4,ID_QPRX1TUQVGHU,14.399365,33.109566


In [None]:
# Preview the first rows of the sample submission (ID plus a Target column to fill in)
sample_submission.head()

Unnamed: 0,ID,Target
0,ID_9ZLHTVF6NSU7,
1,ID_LNN7BFCVEZKA,
2,ID_SOYSG7W04UH3,
3,ID_EAP7EXXV8ZDE,
4,ID_QPRX1TUQVGHU,


In [None]:
from shapely.geometry import Point


def to_point_gdf(frame):
    """Return a GeoDataFrame with one point geometry per row of `frame`.

    Points are built from the frame's own 'Lon' / 'Lat' columns, so each
    table is located using its own coordinates. The CRS is declared as
    EPSG:4326 to match the country layer used in the spatial join (an
    undeclared CRS triggers a CRS-mismatch warning there).
    """
    geometry = [Point(lon, lat) for lon, lat in zip(frame['Lon'], frame['Lat'])]
    return gpd.GeoDataFrame(frame, geometry=geometry, crs='EPSG:4326')


# Build the point layers. Previously the test layer reused the geometry list
# derived from the *train* coordinates, so test rows were placed at the wrong
# locations.
gdf_train = to_point_gdf(train)
gdf_test = to_point_gdf(test)

In [None]:
# Spatially join each point layer to the Natural Earth country polygons.
# how='left' keeps every point; points that fall in no polygon get a missing
# 'name'. `predicate` replaces the deprecated `op` keyword.
countries = world[['geometry', 'name']]
gdf_with_country_train = gpd.sjoin(gdf_train, countries, how='left', predicate='within')
gdf_with_country_test = gpd.sjoin(gdf_test, countries, how='left', predicate='within')


  if (await self.run_code(code, result,  async_=asy)):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  gdf_with_country_train = gpd.sjoin(gdf_train, world[['geometry', 'name']], how='left', op='within')
  if (await self.run_code(code, result,  async_=asy)):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  gdf_with_country_test = gpd.sjoin(gdf_test, world[['geometry', 'name']], how='left', op='within')


In [None]:
# Copy the matched country name from each join result onto its source table
for frame, joined in ((train, gdf_with_country_train), (test, gdf_with_country_test)):
    frame['Country'] = joined['name']


In [None]:
# Inspect the training table, which now carries the Country column
train

Unnamed: 0,ID,Lat,Lon,Target,Country
0,ID_SJ098E7S2SY9,34.162491,70.763668,0,Afghanistan
1,ID_CWCD60FGJJYY,32.075695,48.492047,0,Iran
2,ID_R1XF70RMVGL3,14.542826,33.313483,1,Sudan
3,ID_0ZBIDY0PEBVO,14.359480,33.284108,1,Sudan
4,ID_C20R2C0AYIT0,14.419128,33.528450,0,Sudan
...,...,...,...,...,...
1495,ID_T3WAJGS3B84X,34.292702,70.404578,0,Afghanistan
1496,ID_Q31SWRW623Y8,32.105698,48.294867,1,Iran
1497,ID_O2KBC7HPNOAQ,32.235685,48.185542,0,Iran
1498,ID_8VNTHGHLSUNI,14.329207,33.253206,0,Sudan


In [None]:
# Inspect the test table, which now carries the Country column
test

Unnamed: 0,ID,Lat,Lon,Country
0,ID_9ZLHTVF6NSU7,34.254835,70.348699,Afghanistan
1,ID_LNN7BFCVEZKA,32.009669,48.535526,Iran
2,ID_SOYSG7W04UH3,14.431884,33.399991,Sudan
3,ID_EAP7EXXV8ZDE,14.281866,33.441224,Sudan
4,ID_QPRX1TUQVGHU,14.399365,33.109566,Sudan
...,...,...,...,...
1495,ID_6LVHE89NN5VE,34.077906,70.697531,Afghanistan
1496,ID_M51GDSUBKS8Q,32.202177,48.344005,Iran
1497,ID_469MTLRKJC64,32.340069,48.157425,Iran
1498,ID_DMH9P3N6O3DK,14.260575,33.551627,Sudan


In [None]:
# Exclude every point located in Afghanistan from both tables; all other rows are kept
keep_filter = "Country != 'Afghanistan'"
train_combined, test_combined = (frame.query(keep_filter) for frame in (train, test))

In [None]:
# Get authentication token and sign in to Google Earth Engine.
# ee.Authenticate() runs the interactive authorization flow (it prompts for a
# verification code and saves the token); ee.Initialize() must be called after it.
ee.Authenticate()
ee.Initialize()

To authorize access needed by Earth Engine, open the following URL in a web browser and follow the instructions. If the web browser does not start automatically, please manually browse the URL below.

    https://code.earthengine.google.com/client-auth?scopes=https%3A//www.googleapis.com/auth/earthengine%20https%3A//www.googleapis.com/auth/devstorage.full_control&request_id=S-oGZozQX5MSnSw5EqiNRJLMoX-v8EFiP8zwuAK-NX8&tc=v4XySbf39CtreVhp9Q-jT9T4XrcVZ4GXUNcYj_IaLZk&cc=sthsXfXn901YTixvA_DSA_dQcDxWyJtbiAMBM78Jn1A

The authorization workflow will generate a code, which you should paste in the box below.
Enter verification code: <redacted>

Successfully saved authorization token.


Load Sentinel-2 imagery from Earth Engine and select the bands.
In the example we use the mean value, but other options might work as well.
It might be useful to apply a cloud mask to avoid odd values. Please see https://developers.google.com/earth-engine/tutorials/community/sentinel-2-s2cloudless for reference.

Load the training dataset from CSV
(make sure the path matches the location where you stored the data) and transform it into training points as Earth Engine features.

In [None]:
%%time
# Load the S2 image collection
s2_collection = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')

# Define the bands of interest
bands = ['B1', 'B2', 'B3', 'B4', 'B5','B6', 'B7', 'B8', 'B8A', 'B9', 'B11','B12']

# Read the CSV table
csv_data = train_combined.copy()

# Function to extract mean pixel values for a labeled location
def extract_mean_pixel_values(row):
    lon = row['Lon']
    lat = row['Lat']

    # Create a point geometry for the labeled location
    point = ee.Geometry.Point(lon, lat)

    # Filter the S2 collection by the location and desired time range
    s2_filtered = s2_collection.filterBounds(point).filterDate('2019-07-01', '2020-06-30')

    # Calculate the mean pixel values for the bands of interest at the labeled location
    mean_values = s2_filtered.mean().reduceRegion(
        reducer=ee.Reducer.mean(),
        geometry=point,
        scale=10
    )

    # Extract the mean pixel values for the bands
    values = [mean_values.get(band).getInfo() for band in bands]

    return values

# Extract mean pixel values for each label and create additional columns in the DataFrame
mean_pixel_values = csv_data.apply(extract_mean_pixel_values, axis=1, result_type='expand')
mean_pixel_values.columns = bands
train_combined = pd.concat([csv_data, mean_pixel_values], axis=1)

# Preview the updated DataFrame
train_combined.head()

train_combined.to_csv('train_data_iran_sudan.csv')



CPU times: user 16min 9s, sys: 16.7 s, total: 16min 25s
Wall time: 2h 1min 33s


In [None]:
%%time
# Load the S2 image collection
s2_collection = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')

# Define the bands of interest
bands = ['B1', 'B2', 'B3', 'B4', 'B5','B6', 'B7', 'B8', 'B8A', 'B9', 'B11','B12']

# Read the CSV table
csv_data = test_combined.copy()

# Function to extract mean pixel values for a labeled location
def extract_mean_pixel_values(row):
    lon = row['Lon']
    lat = row['Lat']

    # Create a point geometry for the labeled location
    point = ee.Geometry.Point(lon, lat)

    # Filter the S2 collection by the location and desired time range
    s2_filtered = s2_collection.filterBounds(point).filterDate('2019-07-01', '2020-06-30')

    # Calculate the mean pixel values for the bands of interest at the labeled location
    mean_values = s2_filtered.mean().reduceRegion(
        reducer=ee.Reducer.mean(),
        geometry=point,
        scale=10
    )

    # Extract the mean pixel values for the bands
    values = [mean_values.get(band).getInfo() for band in bands]

    return values

# Extract mean pixel values for each label and create additional columns in the DataFrame
mean_pixel_values = csv_data.apply(extract_mean_pixel_values, axis=1, result_type='expand')
mean_pixel_values.columns = bands
test_combined = pd.concat([csv_data, mean_pixel_values], axis=1)

# Preview the updated DataFrame
test_combined.head()

test_combined.to_csv('test_data_iran_sudan.csv')

CPU times: user 16min 3s, sys: 16.5 s, total: 16min 19s
Wall time: 1h 57min 26s
