2EL1730 Machine Learning Project - Jan. 2024

Libraries

In [9]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sb
from datetime import date
import geopandas as gpd
import seaborn as sn
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay
from sklearn.metrics import recall_score, precision_score,accuracy_score, average_precision_score, f1_score, log_loss
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score

Constants

In [10]:
# General
BASE_PATH = os.path.dirname(os.getcwd())
COLORS = ['red', 'green', 'blue']
METRICS = ['std', 'mean']
GEOGRAPHY_TYPES = ['Dense Forest', 'Grass Land', 'Sparse Forest', 'Farms', 'River','Coastal', 'Lakes', 'Barren Land', 'Desert', 'Hills', 'Snow']
URBAN_TYPES = ['Sparse Urban', 'Rural', 'Dense Urban', 'Urban Slum', 'Industrial']
change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,'Mega Projects': 5}

CHANGE_STATUS_MAP = {None: 0, 'Greenland': 1, 'Land Cleared': 2, 'Materials Introduced': 3,
                     'Prior Construction': 4, 'Excavation': 5, 'Construction Started': 6,
                     'Construction Midway': 7, 'Materials Dumped': 8, 'Construction Done': 9,
                     'Operational': 10}


# Columns groups
COLUMNS_TO_DROP = ['geography_type', 'urban_type', 'geometry', 'date0', 'date1', 'date2', 'date3', 'date4']
DATE_COLUMNS = ['date0', 'date1', 'date2', 'date3', 'date4']

# Output file
OUTPUT_FILE = 'preprocessed_train.geojson'

# Feature types
BINARY_FEATURES = ['Dense Forest', 'Grass Land', 'Sparse Forest', 'Farms', 'River',
                   'Coastal', 'Lakes', 'Barren Land', 'Desert', 'Hills', 'Snow',
                   'Sparse Urban', 'Rural', 'Dense Urban', 'Urban Slum', 'Industrial']
CATEGORICAL_FEATURES = ['change_status_date0', 'change_status_date1', 'change_status_date2', 'change_status_date3',
                      'change_status_date4']

Dataset

In [12]:
train_df = gpd.read_file(f'{BASE_PATH}/data/train.geojson', index_col=0)
test_df = gpd.read_file(f'{BASE_PATH}/data/test.geojson', index_col=0)

Data preprocessing

In [13]:
train_df_pre_process = train_df.copy()

# Fill missing data with 0
train_df_pre_process = train_df_pre_process.fillna(0)

# One-hot encoding for geography_type
for geograph_type in GEOGRAPHY_TYPES:
    train_df_pre_process[geograph_type] = train_df_pre_process['geography_type'].apply(lambda x: 1 if geograph_type in x else 0)

for urban_type in URBAN_TYPES:
    train_df_pre_process[urban_type] = train_df_pre_process['urban_type'].apply(lambda x: 1 if urban_type in x else 0)

# Create new polygon features
train_df_pre_process['area'] = train_df_pre_process['geometry'].area
train_df_pre_process['length'] = train_df_pre_process['geometry'].length
train_df_pre_process['centroid_x'] = train_df_pre_process['geometry'].centroid.x
train_df_pre_process['centroid_y'] = train_df_pre_process['geometry'].centroid.y

# Create new date related features
train_df_pre_process[DATE_COLUMNS] = train_df_pre_process[DATE_COLUMNS].apply(pd.to_datetime)
for metric in METRICS:
    for color in COLORS:
        for i in range(2, 6):
            delta = train_df_pre_process[f'img_{color}_{metric}_date{i}'] - train_df_pre_process[f'img_{color}_{metric}_date{i-1}']
            train_df_pre_process[f'img_{color}_{metric}_delta{i}'] = delta
        train_df_pre_process[f'img_{color}_{metric}_delta_total'] = train_df_pre_process[f'img_{color}_{metric}_date5'] - train_df_pre_process[f'img_{color}_{metric}_date1']
for i in range(1, 5):
    train_df_pre_process[f'date_delta{i}'] = train_df_pre_process[f'date{i}'] - train_df_pre_process[f'date{i-1}']
train_df_pre_process['date_delta_total'] = train_df_pre_process[f'date4'] - train_df_pre_process[f'date1']

# Map change_type
train_df_pre_process['change_type'] = train_df_pre_process['change_type'].map(change_type_map)
for i in range(5): train_df_pre_process[f'change_status_date{i}'] = train_df_pre_process[f'change_status_date{i}'].map(CHANGE_STATUS_MAP)

train_df_pre_process['date_delta_total'] = train_df_pre_process['date_delta_total']/np.timedelta64(1, 'D')
train_df_pre_process['date_delta1'] = train_df_pre_process['date_delta1']/np.timedelta64(1, 'D')
train_df_pre_process['date_delta2'] = train_df_pre_process['date_delta2']/np.timedelta64(1, 'D')
train_df_pre_process['date_delta3'] = train_df_pre_process['date_delta3']/np.timedelta64(1, 'D')
train_df_pre_process['date_delta4'] = train_df_pre_process['date_delta4']/np.timedelta64(1, 'D')

# Drop uncessary columns
train_df_pre_process = train_df_pre_process.drop(columns=COLUMNS_TO_DROP)

Geometry is in a geographic CRS. Results from 'area' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Geometry is in a geographic CRS. Results from 'length' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.
Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a f

Informations

In [14]:
# Fill missing data with 0
train_df_pre_process = train_df_pre_process.fillna(0)
train_df_pre_process.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296146 entries, 0 to 296145
Data columns (total 92 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   change_type                 296146 non-null  int64  
 1   img_red_mean_date1          296146 non-null  float64
 2   img_green_mean_date1        296146 non-null  float64
 3   img_blue_mean_date1         296146 non-null  float64
 4   img_red_std_date1           296146 non-null  float64
 5   img_green_std_date1         296146 non-null  float64
 6   img_blue_std_date1          296146 non-null  float64
 7   img_red_mean_date2          296146 non-null  float64
 8   img_green_mean_date2        296146 non-null  float64
 9   img_blue_mean_date2         296146 non-null  float64
 10  img_red_std_date2           296146 non-null  float64
 11  img_green_std_date2         296146 non-null  float64
 12  img_blue_std_date2          296146 non-null  float64
 13  img_red_mean_d

Standardization of numeric features and splitting the data

In [15]:
# Creating X and y
X = np.array(train_df_pre_process.drop('change_type', axis=1))
y = np.array(train_df_pre_process['change_type'])

#enn = EditedNearestNeighbours(kind_sel="all")
#X, y = enn.fit_resample(X, y) #undersampling

# Splitting the data into training and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Indices of numeric and categorical columns
numeric_features = [i for i in range(X.shape[1]) if train_df_pre_process.drop(columns = ['change_type']).columns[i] not in BINARY_FEATURES + CATEGORICAL_FEATURES]

# Create the transformer
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler(), numeric_features),])

# Fit and transform the training data
X_train_scaled = preprocessor.fit_transform(X_train)

# Transform the test data
X_test_scaled = preprocessor.transform(X_val)

Data preprocessing of test data

In [16]:
test_df_pre_process = test_df.copy()

# Fill missing data with 0
test_df_pre_process = test_df_pre_process.fillna(0)

# One-hot encoding for geography_type
for geograph_type in GEOGRAPHY_TYPES:
    test_df_pre_process[geograph_type] = test_df_pre_process['geography_type'].apply(lambda x: 1 if geograph_type in x else 0)

for urban_type in URBAN_TYPES:
    test_df_pre_process[urban_type] = test_df_pre_process['urban_type'].apply(lambda x: 1 if urban_type in x else 0)

# Create new polygon features
test_df_pre_process['area'] = test_df_pre_process['geometry'].area
test_df_pre_process['length'] = test_df_pre_process['geometry'].length
test_df_pre_process['centroid_x'] = test_df_pre_process['geometry'].centroid.x
test_df_pre_process['centroid_y'] = test_df_pre_process['geometry'].centroid.y

# Create new date related features
test_df_pre_process[DATE_COLUMNS] = test_df_pre_process[DATE_COLUMNS].apply(pd.to_datetime)
for metric in METRICS:
    for color in COLORS:
        for i in range(2, 6):
            delta = test_df_pre_process[f'img_{color}_{metric}_date{i}'] - test_df_pre_process[f'img_{color}_{metric}_date{i-1}']
            test_df_pre_process[f'img_{color}_{metric}_delta{i}'] = delta
        test_df_pre_process[f'img_{color}_{metric}_delta_total'] = test_df_pre_process[f'img_{color}_{metric}_date5'] - test_df_pre_process[f'img_{color}_{metric}_date1']
for i in range(1, 5):
    test_df_pre_process[f'date_delta{i}'] = test_df_pre_process[f'date{i}'] - test_df_pre_process[f'date{i-1}']
test_df_pre_process['date_delta_total'] = test_df_pre_process[f'date4'] - test_df_pre_process[f'date1']

for i in range(5): test_df_pre_process[f'change_status_date{i}'] = test_df_pre_process[f'change_status_date{i}'].map(CHANGE_STATUS_MAP)

test_df_pre_process['date_delta_total'] = test_df_pre_process['date_delta_total']/np.timedelta64(1, 'D')
test_df_pre_process['date_delta1'] = test_df_pre_process['date_delta1']/np.timedelta64(1, 'D')
test_df_pre_process['date_delta2'] = test_df_pre_process['date_delta2']/np.timedelta64(1, 'D')
test_df_pre_process['date_delta3'] = test_df_pre_process['date_delta3']/np.timedelta64(1, 'D')
test_df_pre_process['date_delta4'] = test_df_pre_process['date_delta4']/np.timedelta64(1, 'D')

# Drop uncessary columns
test_df_pre_process = test_df_pre_process.drop(columns=COLUMNS_TO_DROP)

Geometry is in a geographic CRS. Results from 'area' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Geometry is in a geographic CRS. Results from 'length' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.
Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a f

In [17]:
test_df_pre_process = test_df_pre_process.fillna(0)

numeric_features = [col for col in test_df_pre_process.columns if col not in BINARY_FEATURES + CATEGORICAL_FEATURES]
for col_name in numeric_features:
    mean_value = test_df_pre_process[col_name].mean()
    std_value = test_df_pre_process[col_name].std()
    test_df_pre_process[col_name] = (test_df_pre_process[col_name] - mean_value) / std_value

In [18]:
print(Counter(y))

Counter({2: 148435, 3: 100422, 0: 31509, 1: 14305, 4: 1324, 5: 151})


Random Forest - without feature selection

In [None]:
#test data
X_test = np.array(test_df_pre_process)

clf = RandomForestClassifier(bootstrap = True, max_depth= 40, min_samples_leaf= 1, min_samples_split= 3, n_estimators= 100)

# Train the model
clf = clf.fit(X_train, y_train)

# Make predictions on the validation data
y_pred = clf.predict(X_val)

test_accuracy = clf.score(X_val, y_val)

print(classification_report(y_val, y_pred))

# Calculate and print the F1-score
f1 = f1_score(y_val, y_pred , average='weighted')
print(f'Model F1-score on validation data: {f1}')

# Make predictions on the test data
y_pred_test = clf.predict(X_test)

## Save results to submission file
pred_df = pd.DataFrame(y_pred_test, columns=['change_type'])
#pred_df_LR.to_csv("LR_submission.csv", index=True, index_label='Id')

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


              precision    recall  f1-score   support

           0       0.76      0.76      0.76      6345
           1       0.87      0.48      0.61      2806
           2       0.78      0.80      0.79     29776
           3       0.67      0.69      0.68     19999
           4       0.67      0.01      0.01       281
           5       0.00      0.00      0.00        23

    accuracy                           0.74     59230
   macro avg       0.62      0.46      0.48     59230
weighted avg       0.74      0.74      0.74     59230

Model F1-score on validation data: 0.7373963468212685


Random Forest - with feature selection

In [None]:
#test data
X_test = np.array(test_df_pre_process)

#Embedded methods to feature selection
sel = SelectFromModel(RandomForestClassifier(n_estimators= 100, random_state=10), threshold='median')
sel.fit(X_train, y_train)

X_train_selected = sel.transform(X_train)
X_val_selected = sel.transform(X_val)
X_test_selected = sel.transform(X_test)

clf = RandomForestClassifier(bootstrap = True, max_depth= 40, min_samples_leaf= 1, min_samples_split= 3, n_estimators= 100)
clf.fit(X_train_selected, y_train)

# Make predictions on the validation data
y_pred = clf.predict(X_val_selected)

print(classification_report(y_val, y_pred))

# Calculate and print the F1-score
f1 = f1_score(y_val, y_pred , average='weighted')
print(f'Model F1-score on validation data: {f1}')

# Make predictions on the test data
y_pred_test = clf.predict(X_test_selected)

## Save results to submission file
pred_df = pd.DataFrame(y_pred_test, columns=['change_type'])
#pred_df_LR.to_csv("LR_submission.csv", index=True, index_label='Id')

'''
score = cross_val_score(lr, X_train_selected, y_train, scoring='f1', cv=10)
print('CV F1-score: %.3f +/- %.3f' % (np.mean(score), np.std(score)))

# Get features
print(select.get_support(indices=True))
'''

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


              precision    recall  f1-score   support

           0       0.77      0.80      0.79      6345
           1       0.85      0.53      0.65      2806
           2       0.79      0.80      0.80     29776
           3       0.68      0.70      0.69     19999
           4       0.57      0.01      0.03       281
           5       0.00      0.00      0.00        23

    accuracy                           0.75     59230
   macro avg       0.61      0.47      0.49     59230
weighted avg       0.75      0.75      0.75     59230

Model F1-score on validation data: 0.748347481796586


"\nscore = cross_val_score(lr, X_train_selected, y_train, scoring='f1', cv=10)\nprint('CV F1-score: %.3f +/- %.3f' % (np.mean(score), np.std(score)))\n\n# Get features\nprint(select.get_support(indices=True))\n"

Xgboost - without feature selection

In [None]:
#test data
X_test = np.array(test_df_pre_process)

clf = xgb.XGBClassifier(n_estimators = 150, subsample = 0.99, learning_rate=0.2,colsample_bytree=1, random_state=137)

clf = clf.fit(X_train, y_train)

y_pred = clf.predict(X_val)

test_accuracy = clf.score(X_val, y_val)

print(classification_report(y_val, y_pred))

# Calculate and print the F1-score
f1 = f1_score(y_val, y_pred , average='weighted')
print(f'Model F1-score on validation data: {f1}')

# Make predictions on the test data
y_pred_test = clf.predict(X_test)

## Save results to submission file
pred_df = pd.DataFrame(y_pred_test, columns=['change_type'])
#pred_df.to_csv("Xgboost_submission.csv", index=True, index_label='Id')

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


              precision    recall  f1-score   support

           0       0.77      0.89      0.83      6345
           1       0.81      0.68      0.74      2806
           2       0.79      0.80      0.80     29776
           3       0.70      0.68      0.69     19999
           4       0.50      0.03      0.06       281
           5       0.00      0.00      0.00        23

    accuracy                           0.76     59230
   macro avg       0.60      0.51      0.52     59230
weighted avg       0.76      0.76      0.76     59230

Model F1-score on validation data: 0.7568684544208875


Xgboost - with feature selection

In [None]:
#Embedded methods to feature selection
sel = SelectFromModel(RandomForestClassifier(n_estimators= 100, random_state=10), threshold='median')

# Train the model
sel.fit(X_train, y_train)
X_train_selected = sel.transform(X_train)
X_val_selected = sel.transform(X_val)
X_test_selected = sel.transform(X_test)

clf = xgb.XGBClassifier(n_estimators = 150, subsample = 0.99, learning_rate=0.2,colsample_bytree=1, random_state=137)
clf.fit(X_train_selected, y_train)

y_pred = clf.predict(X_val_selected)

print(classification_report(y_val, y_pred))

# Calculate and print the F1-score
f1 = f1_score(y_val, y_pred , average='weighted')
print(f'Model F1-score on validation data: {f1}')

# Make predictions on the test data
y_pred_test = clf.predict(X_test_selected)

## Save results to submission file
pred_df = pd.DataFrame(y_pred_test, columns=['change_type'])
#pred_df.to_csv("Xgboost_submission.csv", index=True, index_label='Id')

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


              precision    recall  f1-score   support

           0       0.77      0.89      0.83      6345
           1       0.78      0.57      0.66      2806
           2       0.79      0.80      0.79     29776
           3       0.69      0.67      0.68     19999
           4       0.43      0.04      0.07       281
           5       0.00      0.00      0.00        23

    accuracy                           0.75     59230
   macro avg       0.58      0.50      0.50     59230
weighted avg       0.75      0.75      0.75     59230

Model F1-score on validation data: 0.7489683871544114
