#### Requirements

In [1]:
import sys
import os

import geopandas as gpd
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

#### Constants

In [2]:
# General
BASE_PATH = os.path.dirname(os.getcwd())

# Mapping
CHANGE_TYPE_MAP = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
       'Mega Projects': 5}

# Data
COLORS = ['red', 'green', 'blue']
METRICS = ['std', 'mean']
GEOGRAPHY_TYPES = ['Dense Forest', 'Grass Land', 'Sparse Forest', 'Farms', 'River',
                   'Coastal', 'Lakes', 'Barren Land', 'Desert', 'Hills', 'Snow'] 

# Columns groups
COLUMNS_TO_DROP = ['geography_type', 'geometry']
DATE_COLUMNS = ['date0', 'date1', 'date2', 'date3', 'date4']

#### Data preprocessing

In [3]:
## Read data
train_df = gpd.read_file(f'{BASE_PATH}/data/train.geojson', index_col=0)
# test_df = gpd.read_file(f'{BASE_PATH}/data/test.geojson', index_col=0)

In [4]:
# One-hot encoding for geography_type
for geograph_type in GEOGRAPHY_TYPES:
    train_df[geograph_type] = train_df['geography_type'].apply(lambda x: 1 if geograph_type in x else 0)

In [6]:
# Create new polygon features
train_df['area'] = train_df['geometry'].area
train_df['length'] = train_df['geometry'].length
train_df['centroid'] = train_df['geometry'].centroid

# Create new date related features
train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(pd.to_datetime)
for metric in METRICS:
    for color in COLORS:
        for i in range(2, 6):
            delta = train_df[f'img_{color}_{metric}_date{i}'] - train_df[f'img_{color}_{metric}_date{i-1}']
            train_df[f'img_{color}_{metric}_delta{i}'] = delta
        train_df[f'img_{color}_{metric}_delta_total'] = train_df[f'img_{color}_{metric}_date5'] - train_df[f'img_{color}_{metric}_date1']
for i in range(1, 5):
    train_df[f'date_delta{i}'] = train_df[f'date{i}'] - train_df[f'date{i-1}']
train_df['date_delta_total'] = train_df[f'date4'] - train_df[f'date1']

# Map change_type
train_df['change_type'] = train_df['change_type'].map(CHANGE_TYPE_MAP)


  train_df['area'] = train_df['geometry'].area

  train_df['length'] = train_df['geometry'].length

  train_df['centroid'] = train_df['geometry'].centroid
  train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(pd.to_datetime)
  train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(pd.to_datetime)
  train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(pd.to_datetime)
  train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(pd.to_datetime)
  train_df[DATE_COLUMNS] = train_df[DATE_COLUMNS].apply(pd.to_datetime)


Unnamed: 0,urban_type,change_type,img_red_mean_date1,img_green_mean_date1,img_blue_mean_date1,img_red_std_date1,img_green_std_date1,img_blue_std_date1,img_red_mean_date2,img_green_mean_date2,...,img_blue_mean_delta2,img_blue_mean_delta3,img_blue_mean_delta4,img_blue_mean_delta5,img_blue_mean_delta_total,date_delta1,date_delta2,date_delta3,date_delta4,date_delta_total
0,Sparse Urban,1,93.371775,107.291113,89.827379,29.812040,28.328368,25.324294,125.773062,139.833243,...,45.073321,14.455983,-69.786620,23.274274,13.016959,-1579 days,1123 days,1016 days,-728 days,1411 days
1,Sparse Urban,1,96.071674,107.061702,90.755556,24.896240,22.275180,22.080686,133.097679,145.385190,...,46.336962,37.142813,-82.787496,13.387707,14.079986,-1579 days,1123 days,1016 days,-728 days,1411 days
2,Sparse Urban,1,101.212148,113.462178,95.670574,24.179684,21.873401,21.285197,120.713490,131.633447,...,28.765918,24.134778,-66.824643,17.450527,3.526579,-1579 days,1123 days,1016 days,-728 days,1411 days
3,Rural,1,94.463311,99.995531,84.470046,26.869852,23.767679,19.351983,114.819776,127.827828,...,35.965327,27.745425,-68.750643,12.982767,7.942876,-1579 days,1123 days,1016 days,-728 days,1411 days
4,Dense Urban,0,151.883646,191.710197,211.569244,52.465332,59.441844,52.304349,141.514462,171.079581,...,-29.608632,4.857941,-92.677046,36.866442,-80.561296,-1579 days,1123 days,1016 days,-728 days,1411 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296141,"N,A",3,239.297084,229.193482,215.205832,25.969706,31.586712,32.155574,140.346141,116.700172,...,-116.871355,79.628130,-33.448714,17.806175,-52.885763,829 days,-1125 days,1521 days,-821 days,-425 days
296142,Sparse Urban,2,162.912319,143.865217,122.935145,56.127846,44.184674,49.760802,103.760507,81.104710,...,-52.934058,101.575362,-54.150000,1.481522,-4.027174,829 days,-1125 days,1521 days,-821 days,-425 days
296143,Sparse Urban,2,111.304320,94.723404,80.374597,21.540545,17.786801,18.143091,68.845906,62.948420,...,-29.058672,0.000000,-0.629271,22.715667,-6.972276,829 days,-1125 days,1521 days,-821 days,-425 days
296144,Sparse Urban,2,137.374613,136.108359,113.544892,32.344779,30.077877,29.759516,98.718266,85.318885,...,-40.972136,0.000000,71.427245,-12.860681,17.594427,829 days,-1125 days,1521 days,-821 days,-425 days


In [8]:
# Drop uncessary columns
train_df = train_df.drop(columns=COLUMNS_TO_DROP)

#### Save where you want

In [None]:
# Write in excel
#train_df.to_excel('train_head.xlsx', index=False)
#test_df.to_excel('test_head.xlsx', index=False)

In [None]:
# Write in geojson
import json
with open('preprocessed_train.geojson', 'w') as fp:
    json.dump(train_df, fp) 