#### Requirements

In [2]:
import os

import geopandas as gpd
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score

from tensorflow import keras
from tensorflow import argmax

#### Constants

In [3]:
# General
BASE_PATH = os.path.dirname(os.getcwd())

# Mapping
CHANGE_TYPE_MAP = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
                   'Mega Projects': 5}
CHANGE_STATUS_MAP = {None: None, 'Greenland': 1, 'Land Cleared': 2, 'Materials Introduced': 3,
                     'Prior Construction': 4, 'Excavation': 5, 'Construction Started': 6,
                     'Construction Midway': 7, 'Materials Dumped': 8, 'Construction Done': 9,
                     'Operational': 10}

# Data
COLORS = ['red', 'green', 'blue']
METRICS = ['std', 'mean']
GEOGRAPHY_TYPES = ['Dense Forest', 'Grass Land', 'Sparse Forest', 'Farms', 'River',
                   'Coastal', 'Lakes', 'Barren Land', 'Desert', 'Hills', 'Snow'] 
URBAN_TYPES = ['Sparse Urban', 'Rural', 'Dense Urban', 'Urban Slum', 'Industrial']

# Column groups
COLUMNS_TO_DROP = ['geography_type', 'urban_type', 'geometry', 'date0', 'date1', 'date2', 'date3', 'date4']
DATE_COLUMNS = ['date0', 'date1', 'date2', 'date3', 'date4']

# Feature types
BINARY_FEATURES = ['Dense Forest', 'Grass Land', 'Sparse Forest', 'Farms', 'River',
                   'Coastal', 'Lakes', 'Barren Land', 'Desert', 'Hills', 'Snow',
                   'Sparse Urban', 'Rural', 'Dense Urban', 'Urban Slum', 'Industrial'] 
CATEGORICAL_FEATURES = ['change_status_date0', 'change_status_date1', 'change_status_date2', 'change_status_date3',
                      'change_status_date4']

# Output file
OUTPUT_FILE = 'preprocessed_train.geojson'

#### Data preprocessing

In [4]:
## Read data
original_train_df = gpd.read_file(f'{BASE_PATH}/data/train.geojson', index_col=0)
#test_df = gpd.read_file(f'{BASE_PATH}/data/test.geojson', index_col=0)

In [156]:
# Copy data
train_df = original_train_df.copy(deep=True)

# Apply Mapping
train_df['change_type'] = train_df['change_type'].map(CHANGE_TYPE_MAP)
for i in range(5): train_df[f'change_status_date{i}'] = train_df[f'change_status_date{i}'].map(CHANGE_STATUS_MAP)

# Fill missing img data with 0
train_df = train_df.fillna({col: 0 if 'img_' in col else np.nan for col in train_df.columns})

# Change date type
train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(lambda x: pd.to_datetime(x, format='%d-%m-%Y', errors='coerce'))

In [157]:
def sort_dates(row):

    # Sort columns by date
    columns_order = np.argsort(row[DATE_COLUMNS].values)
    new_row = row.copy(deep=True)

    # Update date and change_status order
    for i in range(5):
        new_row[f'date{i}'] = row[f'date{columns_order[i]}']
        new_row[f'change_status_date{i}'] = row[f'change_status_date{columns_order[i]}']

    # Update color metrics order
    for metric in METRICS:
        for color in COLORS:
            for i in range(1, 6):
                new_row[f'img_{color}_{metric}_date{i}'] = row[f'img_{color}_{metric}_date{columns_order[i-1]+1}']
    
    return new_row

train_df = train_df.apply(sort_dates, axis=1)


In [161]:
# One-hot encoding
for geograph_type in GEOGRAPHY_TYPES:
    train_df[geograph_type] = train_df['geography_type'].apply(lambda x: 1 if geograph_type in x else 0)
for urban_type in URBAN_TYPES:
    train_df[urban_type] = train_df['urban_type'].apply(lambda x: 1 if urban_type in x else 0)

In [162]:
# Create new polygon features
train_df = train_df.to_crs('EPSG:3857')
train_df['area'] = train_df['geometry'].area
train_df['length'] = train_df['geometry'].length
train_df['centroid_x'] = train_df['geometry'].centroid.x
train_df['centroid_y'] = train_df['geometry'].centroid.y

# Create new date related features
for metric in METRICS:
    for color in COLORS:
        for i in range(2, 6):
            delta = train_df[f'img_{color}_{metric}_date{i}'] - train_df[f'img_{color}_{metric}_date{i-1}']
            train_df[f'img_{color}_{metric}_delta{i}'] = delta
        train_df[f'img_{color}_{metric}_delta_total'] = train_df[f'img_{color}_{metric}_date5'] - train_df[f'img_{color}_{metric}_date1']

for i in range(1, 5):
    date_delta = (train_df[f'date{i}'] - train_df[f'date{i-1}']).dt.days
    train_df[f'date_delta{i}'] = date_delta.apply(lambda value: int(value) if pd.notna(value) else np.nan)

date_delta_total = (train_df[f'date0'] - train_df[f'date4']).dt.days
train_df[f'date_delta_total'] = date_delta_total.apply(lambda value: int(value) if pd.notna(value) else np.nan)

  return lib.area(geometry, **kwargs)
  return lib.length(geometry, **kwargs)
  return lib.centroid(geometry, **kwargs)
  return lib.centroid(geometry, **kwargs)


In [115]:
## Fix date
#train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(lambda x: pd.to_datetime(x, format='%d-%m-%Y', errors='coerce'))
time_ctt = 1e9 * 60 * 90 * 24

def fit_and_predict(row):

    if row.isna().any():
        return np.nan
    
    x_sample = row[DATE_COLUMNS].apply(lambda x: x.timestamp()).astype(np.float64) / time_ctt
    y_sample = row.filter(regex=r'^change_status_date\d$')
    
    model = LinearRegression()
    model.fit(x_sample.values.reshape(-1, 1), y_sample.values.reshape(-1, 1))
    
    return model.coef_[0, 0]

train_df["civilizating_rate"] = train_df.apply(fit_and_predict, axis=1)

In [144]:
# Drop uncessary columns
train_df = train_df.drop(columns=COLUMNS_TO_DROP).dropna()

In [148]:
# Standardization of numeric features
# MUDAR PARA SO FAZER NO TREINO (N STANDARDIZAR O DF INTEIRO PQ PARTE VAI PRA TESTE)
numeric_features = [col for col in train_df.columns if col not in BINARY_FEATURES + CATEGORICAL_FEATURES]
numeric_features.remove('change_type')
for col_name in numeric_features:
    mean_value = train_df[col_name].mean()
    std_value = train_df[col_name].std()
    train_df[col_name] = (train_df[col_name] - mean_value) / std_value

In [78]:
train_df.columns[train_df.isna().any()].tolist()

['date0',
 'change_status_date0',
 'date1',
 'change_status_date1',
 'date2',
 'change_status_date2',
 'date3',
 'change_status_date3',
 'date4',
 'change_status_date4']

In [154]:
# Define features and target variable
X = np.array(train_df.drop('change_type', axis=1))
y = np.array(train_df['change_type'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Neural network without feature selection

In [150]:
model = keras.Sequential([
    keras.layers.Dense(516, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(6, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Original accuracy without feature selection
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, verbose=2)
_, original_accuracy = model.evaluate(X_test, y_test)

# Calculate F1 score
y_pred_probs = model.predict(X_test)
y_pred = argmax(y_pred_probs, axis=1)
f1 = f1_score(y_test, y_pred, average='weighted')
print('F1 Score: ', f1)

Epoch 1/10
6630/6630 - 27s - loss: 0.9464 - accuracy: 0.5966 - val_loss: 0.8263 - val_accuracy: 0.6595 - 27s/epoch - 4ms/step
Epoch 2/10
6630/6630 - 25s - loss: 0.8645 - accuracy: 0.6413 - val_loss: 0.7941 - val_accuracy: 0.6699 - 25s/epoch - 4ms/step
Epoch 3/10
6630/6630 - 24s - loss: 0.8465 - accuracy: 0.6489 - val_loss: 0.7985 - val_accuracy: 0.6716 - 24s/epoch - 4ms/step
Epoch 4/10
6630/6630 - 24s - loss: 0.8367 - accuracy: 0.6529 - val_loss: 0.7853 - val_accuracy: 0.6808 - 24s/epoch - 4ms/step
Epoch 5/10
6630/6630 - 22s - loss: 0.8309 - accuracy: 0.6565 - val_loss: 0.7812 - val_accuracy: 0.6755 - 22s/epoch - 3ms/step
Epoch 6/10
6630/6630 - 24s - loss: 0.8256 - accuracy: 0.6578 - val_loss: 0.7815 - val_accuracy: 0.6727 - 24s/epoch - 4ms/step
Epoch 7/10
6630/6630 - 24s - loss: 0.8246 - accuracy: 0.6597 - val_loss: 0.7770 - val_accuracy: 0.6746 - 24s/epoch - 4ms/step
Epoch 8/10
6630/6630 - 24s - loss: 0.8220 - accuracy: 0.6607 - val_loss: 0.7753 - val_accuracy: 0.6837 - 24s/epoch - 4

#### Neural network with tree-based feature selection

In [127]:
# Train random forest
model_rf = RandomForestClassifier()
model_rf.fit(X, y)

# Select features
importances_rf = model_rf.feature_importances_
selected_features_rf = train_df.columns[importances_rf > 0.05] #ADJUST AS NEEDED

# Train model and evaluate accuracy
X_rf = np.array(train_df.drop('change_type', axis=1)[selected_features_rf])
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_rf, y, test_size=0.2, random_state=42)
model.fit(X_rf, y_train_rf, epochs=10, batch_size=32, validation_split=0.1, verbose=2)
accuracy_rf = model.evaluate(X_rf, y_test_rf)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [130]:
importances_rf

array([1.04897911e-06, 1.22045434e-04, 3.20515363e-04, 4.03005988e-04,
       4.69997253e-04, 7.30753138e-04, 1.07559690e-03, 1.23989401e-03,
       1.26465342e-03, 1.30604251e-03, 1.38456977e-03, 1.45127151e-03,
       1.52459177e-03, 1.52885532e-03, 2.23257786e-03, 5.37559599e-03,
       8.98361593e-03, 9.09044365e-03, 9.10536103e-03, 9.18390725e-03,
       9.18705059e-03, 9.18857899e-03, 9.20033518e-03, 9.24084851e-03,
       9.24334547e-03, 9.26423038e-03, 9.26653853e-03, 9.35583821e-03,
       9.37804865e-03, 9.48830878e-03, 9.49003235e-03, 9.49054055e-03,
       9.49970214e-03, 9.50261245e-03, 9.50400438e-03, 9.52414735e-03,
       9.53587806e-03, 9.53950973e-03, 9.67400214e-03, 9.67474122e-03,
       9.70160209e-03, 9.72037811e-03, 9.72490179e-03, 9.72523285e-03,
       9.79050794e-03, 9.83345891e-03, 9.86182764e-03, 9.91448667e-03,
       9.92761394e-03, 9.94983621e-03, 9.96209902e-03, 9.97558526e-03,
       1.00212124e-02, 1.00856644e-02, 1.01016046e-02, 1.01281580e-02,
      

#### Neural network with PCA

In [1]:
# Calculate PCA
pca = PCA(n_components=0.8) # ADJUST

# Define features and target variable
X_pca = pca.fit_transform(X)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Create and compile model
pca_model = keras.Sequential([
    keras.layers.Dense(512, activation='relu', input_shape=(X_train_pca.shape[1],)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(128, activation='relu'),  # Additional layer
    keras.layers.Dropout(0.5),
    keras.layers.Dense(6, activation='softmax')
])
pca_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train model and evaluate accuracy
pca_model.fit(X_train_pca, y_train_pca, epochs=20, batch_size=32, validation_split=0.1, verbose=2)
_, original_accuracy = pca_model.evaluate(X_test_pca, y_test_pca)

# Calculate F1 score
y_pred_probs = pca_model.predict(X_test_pca)
y_pred = argmax(y_pred_probs, axis=1)
f1 = f1_score(y_test, y_pred, average='weighted')
print('F1 Score: ', f1)

NameError: name 'PCA' is not defined

#### Neural Network with SelectKBest feature selection 

In [166]:
scaler = MinMaxScaler()
X_train_kbest = scaler.fit_transform(X_train)
X_test_kbest = scaler.transform(X_test)

# Perform feature selection using SelectKBest and chi-squared
k_best = SelectKBest(score_func=chi2, k=80)
X_train_kbest = k_best.fit_transform(X_train_kbest, y_train)
X_test_kbest = k_best.transform(X_test_kbest)

# Create and compile model
kbest_model = keras.Sequential([
    keras.layers.Dense(512, activation='relu', input_shape=(X_train_kbest.shape[1],)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(6, activation='softmax')
])
kbest_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train model and evaluate accuracy
kbest_model.fit(X_train_kbest, y_train, epochs=10, batch_size=32, validation_split=0.1, verbose=2)
_, kbest_accuracy = kbest_model.evaluate(X_test_kbest, y_test)

# Calculate F1 score
y_pred_probs = kbest_model.predict(X_test_kbest)
y_pred = argmax(y_pred_probs, axis=1)
f1 = f1_score(y_test, y_pred, average='weighted')
print('F1 Score: ', f1)

Epoch 1/10
6630/6630 - 21s - loss: 1.0146 - accuracy: 0.5513 - val_loss: 0.9094 - val_accuracy: 0.6013 - 21s/epoch - 3ms/step
Epoch 2/10
6630/6630 - 19s - loss: 0.9382 - accuracy: 0.5919 - val_loss: 0.8813 - val_accuracy: 0.6308 - 19s/epoch - 3ms/step
Epoch 3/10
6630/6630 - 19s - loss: 0.9165 - accuracy: 0.6030 - val_loss: 0.8508 - val_accuracy: 0.6417 - 19s/epoch - 3ms/step
Epoch 4/10
6630/6630 - 19s - loss: 0.9033 - accuracy: 0.6111 - val_loss: 0.8504 - val_accuracy: 0.6416 - 19s/epoch - 3ms/step
Epoch 5/10
6630/6630 - 19s - loss: 0.8955 - accuracy: 0.6170 - val_loss: 0.8437 - val_accuracy: 0.6487 - 19s/epoch - 3ms/step
Epoch 6/10
6630/6630 - 19s - loss: 0.8914 - accuracy: 0.6199 - val_loss: 0.8310 - val_accuracy: 0.6541 - 19s/epoch - 3ms/step
Epoch 7/10
6630/6630 - 19s - loss: 0.8863 - accuracy: 0.6229 - val_loss: 0.8244 - val_accuracy: 0.6534 - 19s/epoch - 3ms/step
Epoch 8/10
6630/6630 - 21s - loss: 0.8831 - accuracy: 0.6240 - val_loss: 0.8228 - val_accuracy: 0.6615 - 21s/epoch - 3