#### Requirements

In [66]:
import sys
import os

import geopandas as gpd
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from tensorflow import keras

#### Constants

In [67]:
# General
BASE_PATH = os.path.dirname(os.getcwd())

# Mapping
CHANGE_TYPE_MAP = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
                   'Mega Projects': 5}
CHANGE_STATUS_MAP = {None: 0, 'Greenland': 1, 'Land Cleared': 2, 'Materials Introduced': 3,
                     'Prior Construction': 4, 'Excavation': 5, 'Construction Started': 6,
                     'Construction Midway': 7, 'Materials Dumped': 8, 'Construction Done': 9,
                     'Operational': 10}

# Data
COLORS = ['red', 'green', 'blue']
METRICS = ['std', 'mean']
GEOGRAPHY_TYPES = ['Dense Forest', 'Grass Land', 'Sparse Forest', 'Farms', 'River',
                   'Coastal', 'Lakes', 'Barren Land', 'Desert', 'Hills', 'Snow'] 
URBAN_TYPES = ['Sparse Urban', 'Rural', 'Dense Urban', 'Urban Slum', 'Industrial']

# Column groups
COLUMNS_TO_DROP = ['geography_type', 'urban_type', 'geometry', 'date0', 'date1', 'date2', 'date3', 'date4']
DATE_COLUMNS = ['date0', 'date1', 'date2', 'date3', 'date4']

# Feature types
BINARY_FEATURES = ['Dense Forest', 'Grass Land', 'Sparse Forest', 'Farms', 'River',
                   'Coastal', 'Lakes', 'Barren Land', 'Desert', 'Hills', 'Snow',
                   'Sparse Urban', 'Rural', 'Dense Urban', 'Urban Slum', 'Industrial'] 
CATEGORICAL_FEATURES = ['change_status_date0', 'change_status_date1', 'change_status_date2', 'change_status_date3',
                      'change_status_date4']

# Output file
OUTPUT_FILE = 'preprocessed_train.geojson'

#### Data preprocessing

In [49]:
## Read data
original_train_df = gpd.read_file(f'{BASE_PATH}/data/train.geojson', index_col=0)
#test_df = gpd.read_file(f'{BASE_PATH}/data/test.geojson', index_col=0)

In [76]:
# Copy data
train_df = original_train_df.copy(deep=True)

# Apply Mapping
train_df['change_type'] = train_df['change_type'].map(CHANGE_TYPE_MAP)
for i in range(5): train_df[f'change_status_date{i}'] = train_df[f'change_status_date{i}'].map(CHANGE_STATUS_MAP)

# Fill missing data with 0
train_df = train_df.fillna(0)

In [77]:
# One-hot encoding
for geograph_type in GEOGRAPHY_TYPES:
    train_df[geograph_type] = train_df['geography_type'].apply(lambda x: 1 if geograph_type in x else 0)
for urban_type in URBAN_TYPES:
    train_df[urban_type] = train_df['urban_type'].apply(lambda x: 1 if urban_type in x else 0)

In [78]:
# Create new polygon features
train_df['area'] = train_df['geometry'].area
train_df['length'] = train_df['geometry'].length
train_df['centroid_x'] = train_df['geometry'].centroid.x
train_df['centroid_y'] = train_df['geometry'].centroid.y

# Create new date related features
train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(pd.to_datetime)
for metric in METRICS:
    for color in COLORS:
        for i in range(2, 6):
            delta = train_df[f'img_{color}_{metric}_date{i}'] - train_df[f'img_{color}_{metric}_date{i-1}']
            train_df[f'img_{color}_{metric}_delta{i}'] = delta
        train_df[f'img_{color}_{metric}_delta_total'] = train_df[f'img_{color}_{metric}_date5'] - train_df[f'img_{color}_{metric}_date1']
for i in range(1, 5):
    train_df[f'date_delta{i}'] = (train_df[f'date{i}'] - train_df[f'date{i-1}']).dt.days.astype(int)
train_df['date_delta_total'] = (train_df[f'date4'] - train_df[f'date1']).dt.days.astype(int)


  train_df['area'] = train_df['geometry'].area

  train_df['length'] = train_df['geometry'].length

  train_df['centroid_x'] = train_df['geometry'].centroid.x

  train_df['centroid_y'] = train_df['geometry'].centroid.y
  train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(pd.to_datetime)
  train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(pd.to_datetime)
  train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(pd.to_datetime)
  train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(pd.to_datetime)
  train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(pd.to_datetime)


In [79]:
# Drop uncessary columns
train_df = train_df.drop(columns=COLUMNS_TO_DROP)

In [101]:
# Standardization of numeric features
numeric_features = [col for col in train_df.columns if col not in BINARY_FEATURES + CATEGORICAL_FEATURES]
numeric_features.remove('change_type')
for col_name in numeric_features:
    mean_value = train_df[col_name].mean()
    std_value = train_df[col_name].std()
    train_df[col_name] = (train_df[col_name] - mean_value) / std_value

In [105]:
train_df.columns[train_df.isna().any()].tolist()

[]

#### Neural network without feature selection

In [119]:
# Define features and target variable
X = np.array(train_df.drop('change_type', axis=1))
y = np.array(train_df['change_type'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = keras.Sequential([
    keras.layers.Dense(1024, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(516, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(6, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Original accuracy without feature selection
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=2)
_, original_accuracy = model.evaluate(X_test, y_test)

Epoch 1/10
6664/6664 - 7s - loss: 1.0370 - accuracy: 0.5551 - val_loss: 0.8894 - val_accuracy: 0.6339 - 7s/epoch - 1ms/step
Epoch 2/10
6664/6664 - 6s - loss: 0.9226 - accuracy: 0.6133 - val_loss: 0.8516 - val_accuracy: 0.6494 - 6s/epoch - 901us/step
Epoch 3/10
6664/6664 - 6s - loss: 0.9009 - accuracy: 0.6276 - val_loss: 0.8404 - val_accuracy: 0.6543 - 6s/epoch - 893us/step
Epoch 4/10
6664/6664 - 6s - loss: 0.8914 - accuracy: 0.6301 - val_loss: 0.8435 - val_accuracy: 0.6560 - 6s/epoch - 862us/step
Epoch 5/10
6664/6664 - 7s - loss: 0.8861 - accuracy: 0.6335 - val_loss: 0.8274 - val_accuracy: 0.6580 - 7s/epoch - 1ms/step
Epoch 6/10
6664/6664 - 7s - loss: 0.8807 - accuracy: 0.6367 - val_loss: 0.8299 - val_accuracy: 0.6576 - 7s/epoch - 1ms/step
Epoch 7/10
6664/6664 - 6s - loss: 0.8793 - accuracy: 0.6371 - val_loss: 0.8257 - val_accuracy: 0.6565 - 6s/epoch - 952us/step
Epoch 8/10
6664/6664 - 6s - loss: 0.8755 - accuracy: 0.6388 - val_loss: 0.8223 - val_accuracy: 0.6606 - 6s/epoch - 875us/ste

#### Neural network with tree-based methods

In [127]:
# Train random forest
model_rf = RandomForestClassifier()
model_rf.fit(X, y)

# Select features
importances_rf = model_rf.feature_importances_
selected_features_rf = train_df.columns[importances_rf > 0.05] #ADJUST AS NEEDED

# Train model and evaluate accuracy
X_rf = np.array(train_df.drop('change_type', axis=1)[selected_features_rf])
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_rf, y, test_size=0.2, random_state=42)
model.fit(X_rf, y_train_rf, epochs=10, batch_size=32, validation_split=0.1, verbose=2)
accuracy_rf = model.evaluate(X_rf, y_test_rf)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [130]:
importances_rf

array([1.04897911e-06, 1.22045434e-04, 3.20515363e-04, 4.03005988e-04,
       4.69997253e-04, 7.30753138e-04, 1.07559690e-03, 1.23989401e-03,
       1.26465342e-03, 1.30604251e-03, 1.38456977e-03, 1.45127151e-03,
       1.52459177e-03, 1.52885532e-03, 2.23257786e-03, 5.37559599e-03,
       8.98361593e-03, 9.09044365e-03, 9.10536103e-03, 9.18390725e-03,
       9.18705059e-03, 9.18857899e-03, 9.20033518e-03, 9.24084851e-03,
       9.24334547e-03, 9.26423038e-03, 9.26653853e-03, 9.35583821e-03,
       9.37804865e-03, 9.48830878e-03, 9.49003235e-03, 9.49054055e-03,
       9.49970214e-03, 9.50261245e-03, 9.50400438e-03, 9.52414735e-03,
       9.53587806e-03, 9.53950973e-03, 9.67400214e-03, 9.67474122e-03,
       9.70160209e-03, 9.72037811e-03, 9.72490179e-03, 9.72523285e-03,
       9.79050794e-03, 9.83345891e-03, 9.86182764e-03, 9.91448667e-03,
       9.92761394e-03, 9.94983621e-03, 9.96209902e-03, 9.97558526e-03,
       1.00212124e-02, 1.00856644e-02, 1.01016046e-02, 1.01281580e-02,
      

#### Neural network with PCA

In [152]:
# Calculate PCA
pca = PCA(n_components=80) # ADJUST

# Define features and target variable
X_pca = pca.fit_transform(X)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Create and compile model
pca_model = keras.Sequential([
    keras.layers.Dense(512, activation='relu', input_shape=(X_train_pca.shape[1],)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(128, activation='relu'),  # Additional layer
    keras.layers.Dropout(0.5),
    keras.layers.Dense(6, activation='softmax')
])
pca_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train model and evaluate accuracy
pca_model.fit(X_train_pca, y_train_pca, epochs=20, batch_size=32, validation_split=0.1, verbose=2)
_, original_accuracy = pca_model.evaluate(X_test_pca, y_test_pca)

Epoch 1/20
6664/6664 - 16s - loss: 0.9241 - accuracy: 0.6062 - val_loss: 0.8262 - val_accuracy: 0.6509 - 16s/epoch - 2ms/step
Epoch 2/20
6664/6664 - 14s - loss: 0.8485 - accuracy: 0.6503 - val_loss: 0.8009 - val_accuracy: 0.6691 - 14s/epoch - 2ms/step
Epoch 3/20
6664/6664 - 14s - loss: 0.8300 - accuracy: 0.6576 - val_loss: 0.7867 - val_accuracy: 0.6744 - 14s/epoch - 2ms/step
Epoch 4/20
6664/6664 - 15s - loss: 0.8203 - accuracy: 0.6623 - val_loss: 0.7806 - val_accuracy: 0.6815 - 15s/epoch - 2ms/step
Epoch 5/20
6664/6664 - 14s - loss: 0.8123 - accuracy: 0.6659 - val_loss: 0.7772 - val_accuracy: 0.6836 - 14s/epoch - 2ms/step
Epoch 6/20
6664/6664 - 14s - loss: 0.8103 - accuracy: 0.6658 - val_loss: 0.7703 - val_accuracy: 0.6816 - 14s/epoch - 2ms/step
Epoch 7/20
6664/6664 - 14s - loss: 0.8042 - accuracy: 0.6686 - val_loss: 0.7680 - val_accuracy: 0.6829 - 14s/epoch - 2ms/step
Epoch 8/20
6664/6664 - 15s - loss: 0.8012 - accuracy: 0.6698 - val_loss: 0.7685 - val_accuracy: 0.6830 - 15s/epoch - 2