2EL1730 Machine Learning Project - Jan. 2024

Libraries

In [3]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sb
from datetime import date
import geopandas as gpd
import seaborn as sn
import sys
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay
from sklearn.metrics import recall_score, precision_score,accuracy_score, average_precision_score, f1_score, log_loss
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score

Constants

In [None]:
# General
BASE_PATH = os.path.dirname(os.getcwd())

# Mapping
CHANGE_TYPE_MAP = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
       'Mega Projects': 5}
CHANGE_STATUS_MAP = {'Greenland': 0, 'Land Cleared': 1, 'Excavation': 1, 'Materials Dumped': 3, 'Prior Construction': 3, 'Materials Introduced': 4, 'Construction Started': 5, 'Construction Midway': 6, 'Construction Done': 8, 'Operational': 10, None: None}

# Data
COLORS = ['red', 'green', 'blue']
METRICS = ['std', 'mean']
GEOGRAPHY_TYPES = ['Dense Forest', 'Grass Land', 'Sparse Forest', 'Farms', 'River',
                   'Coastal', 'Lakes', 'Barren Land', 'Desert', 'Hills', 'Snow'] 
URBAN_TYPES = ['Sparse Urban', 'Rural', 'Dense Urban', 'Urban Slum', 'Industrial']
change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,'Mega Projects': 5}

# Columns groups
COLUMNS_TO_DROP = ['geography_type', 'urban_type']
DATE_COLUMNS = ['date0', 'date1', 'date2', 'date3', 'date4']
CHANGE_STATUS_COLUMNS = ['change_status_date0', 'change_status_date1', 'change_status_date2', 'change_status_date3', 'change_status_date4']
CHANGE_STATUS_VALUE_COLUMNS = ['change_status_value_date0', 'change_status_value_date1', 'change_status_value_date2', 'change_status_value_date3', 'change_status_value_date4']

# Feature types
BINARY_FEATURES = ['Dense Forest', 'Grass Land', 'Sparse Forest', 'Farms', 'River',
                   'Coastal', 'Lakes', 'Barren Land', 'Desert', 'Hills', 'Snow',
                   'Sparse Urban', 'Rural', 'Dense Urban', 'Urban Slum', 'Industrial']
CATEGORICAL_FEATURES = ['change_status_date0', 'change_status_date1', 'change_status_date2', 'change_status_date3',
                      'change_status_date4']

# Output file
OUTPUT_FILE = 'preprocessed_train.geojson'

Dataset

In [2]:
train_df = gpd.read_file(f'{BASE_PATH}/data/train.geojson', index_col=0)
test_df = gpd.read_file(f'{BASE_PATH}/data/test.geojson', index_col=0)

Data preprocessing

In [8]:
train_df_pre_process = train_df.copy()

# Fill missing data with 0
train_df_pre_process = train_df_pre_process.fillna(0)

# One-hot encoding for geography_type
for geograph_type in GEOGRAPHY_TYPES:
    train_df_pre_process[geograph_type] = train_df_pre_process['geography_type'].apply(lambda x: 1 if geograph_type in x else 0)

for urban_type in URBAN_TYPES:
    train_df_pre_process[urban_type] = train_df_pre_process['urban_type'].apply(lambda x: 1 if urban_type in x else 0)

## Create new polygon features
train_df_pre_process['area'] = train_df_pre_process['geometry'].area
train_df_pre_process['length'] = train_df_pre_process['geometry'].length
#train_df_pre_process['area_to_length_ratio'] = train_df_pre_process['area'] / train_df_pre_process['length'] # the more, the closer to square
train_df_pre_process['centroid_x'] = train_df_pre_process['geometry'].centroid.x
train_df_pre_process['centroid_y'] = train_df_pre_process['geometry'].centroid.y

'''
### Convert date from string to date
train_df_pre_process[DATE_COLUMNS] = train_df_pre_process[DATE_COLUMNS].apply(lambda x: pd.to_datetime(x, format='%d-%m-%Y', errors='coerce'))

## Create deltas color[std, mean] 
for metric in METRICS:
    for color in COLORS:
        for i in range(2, 6):
            delta = train_df_pre_process[f'img_{color}_{metric}_date{i}'] - train_df_pre_process[f'img_{color}_{metric}_date{i-1}']
            train_df_pre_process[f'img_{color}_{metric}_delta{i}'] = delta
        train_df_pre_process[f'img_{color}_{metric}_delta_total'] = train_df_pre_process[f'img_{color}_{metric}_date5'] - train_df[f'img_{color}_{metric}_date1']

## Create deltas time 
for i in range(1, 5):
    train_df_pre_process[f'date_delta{i}'] = train_df_pre_process[f'date{i}'] - train_df_pre_process[f'date{i-1}']
train_df_pre_process['date_delta_total'] = train_df_pre_process[f'date4'] - train_df_pre_process[f'date0']

## Standardizing colors mean by the proportion
for i in range(1, 6):
    color_sum = train_df_pre_process[f'img_blue_mean_date{i}'] + train_df_pre_process[f'img_green_mean_date{i}'] + train_df_pre_process[f'img_red_mean_date{i}']
    for color in COLORS:
        train_df_pre_process[f'img_{color}_mean_prop_date{i}'] = train_df_pre_process[f'img_{color}_mean_date{i}']/color_sum

## Create img_{color}_mean_prop_rate
num_samples = train_df_pre_process.shape[0]
ones = np.ones((num_samples,5,1))

for color in COLORS:
    coef = np.zeros((num_samples))
    COLOR_MEAN_COLUMNS = [f'img_{color}_mean_prop_date{i}' for i in range (1,6)]
    
    Y = np.array(train_df_pre_process[COLOR_MEAN_COLUMNS].astype(float))
    nan_mask = np.isnan(Y) | np.isnan(X[:,:,1])
    X = np.array(train_df_pre_process[DATE_COLUMNS].astype(int))[:,:,np.newaxis]/time_ctt
    X = np.dstack((ones,X))
    X[nan_mask,:] = 0
    Y[nan_mask] = 0

    eye = np.eye(2)*0.0001
    for i in range(num_samples):
        x = X[i].reshape((5,2))
        y = Y[i].reshape((5))
        coef[i] = (np.linalg.inv(eye+x.T@x)@x.T@y)[1]
        
    train_df_pre_process[f'img_{color}_mean_prop_rate'] = coef
'''

# Create new date related features
train_df_pre_process[DATE_COLUMNS] = train_df_pre_process[DATE_COLUMNS].apply(pd.to_datetime)
for metric in METRICS:
    for color in COLORS:
        for i in range(2, 6):
            delta = train_df_pre_process[f'img_{color}_{metric}_date{i}'] - train_df_pre_process[f'img_{color}_{metric}_date{i-1}']
            train_df_pre_process[f'img_{color}_{metric}_delta{i}'] = delta
        train_df_pre_process[f'img_{color}_{metric}_delta_total'] = train_df_pre_process[f'img_{color}_{metric}_date5'] - train_df_pre_process[f'img_{color}_{metric}_date1']
for i in range(1, 5):
    train_df_pre_process[f'date_delta{i}'] = train_df_pre_process[f'date{i}'] - train_df_pre_process[f'date{i-1}']
train_df_pre_process['date_delta_total'] = train_df_pre_process[f'date4'] - train_df_pre_process[f'date1']

# Map change_type
train_df_pre_process['change_type'] = train_df_pre_process['change_type'].map(change_type_map)
for i in range(5): train_df_pre_process[f'change_status_date{i}'] = train_df_pre_process[f'change_status_date{i}'].map(CHANGE_STATUS_MAP)

train_df_pre_process['date_delta_total'] = train_df_pre_process['date_delta_total']/np.timedelta64(1, 'D')
train_df_pre_process['date_delta1'] = train_df_pre_process['date_delta1']/np.timedelta64(1, 'D')
train_df_pre_process['date_delta2'] = train_df_pre_process['date_delta2']/np.timedelta64(1, 'D')
train_df_pre_process['date_delta3'] = train_df_pre_process['date_delta3']/np.timedelta64(1, 'D')
train_df_pre_process['date_delta4'] = train_df_pre_process['date_delta4']/np.timedelta64(1, 'D')

# Drop uncessary columns
train_df_pre_process = train_df_pre_process.drop(columns=COLUMNS_TO_DROP)

Geometry is in a geographic CRS. Results from 'area' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Geometry is in a geographic CRS. Results from 'length' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.
Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a f

Informations

In [9]:
# Fill missing data with 0
train_df_pre_process = train_df_pre_process.fillna(0)
train_df_pre_process.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296146 entries, 0 to 296145
Data columns (total 92 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   change_type                 296146 non-null  int64  
 1   img_red_mean_date1          296146 non-null  float64
 2   img_green_mean_date1        296146 non-null  float64
 3   img_blue_mean_date1         296146 non-null  float64
 4   img_red_std_date1           296146 non-null  float64
 5   img_green_std_date1         296146 non-null  float64
 6   img_blue_std_date1          296146 non-null  float64
 7   img_red_mean_date2          296146 non-null  float64
 8   img_green_mean_date2        296146 non-null  float64
 9   img_blue_mean_date2         296146 non-null  float64
 10  img_red_std_date2           296146 non-null  float64
 11  img_green_std_date2         296146 non-null  float64
 12  img_blue_std_date2          296146 non-null  float64
 13  img_red_mean_d

Standardization of numeric features and splitting the data

In [40]:
# Creating X and y
X = np.array(train_df_pre_process.drop('change_type', axis=1))
y = np.array(train_df_pre_process['change_type'])

# Splitting the data into training and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Indices of numeric and categorical columns
numeric_features = [i for i in range(X.shape[1]) if train_df_pre_process.drop(columns = ['change_type']).columns[i] not in BINARY_FEATURES + CATEGORICAL_FEATURES]
categorical_features = [i for i in range(X.shape[1]) if train_df_pre_process.columns[i] in CATEGORICAL_FEATURES]
binary_features = [i for i in range(X.shape[1]) if train_df_pre_process.columns[i] in BINARY_FEATURES]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', 'passthrough', categorical_features),
        ('binary', 'passthrough', binary_features)
    ])

# Fit and transform the training data
X_train_scaled = preprocessor.fit_transform(X_train)

# Transform the validation data
X_val_scaled = preprocessor.transform(X_val)

Data test preprocessing

In [42]:
test_df_pre_process = test_df.copy()

# Fill missing data with 0
test_df_pre_process = test_df_pre_process.fillna(0)

# One-hot encoding for geography_type
for geograph_type in GEOGRAPHY_TYPES:
    test_df_pre_process[geograph_type] = test_df_pre_process['geography_type'].apply(lambda x: 1 if geograph_type in x else 0)

for urban_type in URBAN_TYPES:
    test_df_pre_process[urban_type] = test_df_pre_process['urban_type'].apply(lambda x: 1 if urban_type in x else 0)

# Create new polygon features
test_df_pre_process['area'] = test_df_pre_process['geometry'].area
test_df_pre_process['length'] = test_df_pre_process['geometry'].length
test_df_pre_process['centroid_x'] = test_df_pre_process['geometry'].centroid.x
test_df_pre_process['centroid_y'] = test_df_pre_process['geometry'].centroid.y

# Create new date related features
test_df_pre_process[DATE_COLUMNS] = test_df_pre_process[DATE_COLUMNS].apply(pd.to_datetime)
for metric in METRICS:
    for color in COLORS:
        for i in range(2, 6):
            delta = test_df_pre_process[f'img_{color}_{metric}_date{i}'] - test_df_pre_process[f'img_{color}_{metric}_date{i-1}']
            test_df_pre_process[f'img_{color}_{metric}_delta{i}'] = delta
        test_df_pre_process[f'img_{color}_{metric}_delta_total'] = test_df_pre_process[f'img_{color}_{metric}_date5'] - test_df_pre_process[f'img_{color}_{metric}_date1']
for i in range(1, 5):
    test_df_pre_process[f'date_delta{i}'] = test_df_pre_process[f'date{i}'] - test_df_pre_process[f'date{i-1}']
test_df_pre_process['date_delta_total'] = test_df_pre_process[f'date4'] - test_df_pre_process[f'date1']

for i in range(5): test_df_pre_process[f'change_status_date{i}'] = test_df_pre_process[f'change_status_date{i}'].map(CHANGE_STATUS_MAP)

test_df_pre_process['date_delta_total'] = test_df_pre_process['date_delta_total']/np.timedelta64(1, 'D')
test_df_pre_process['date_delta1'] = test_df_pre_process['date_delta1']/np.timedelta64(1, 'D')
test_df_pre_process['date_delta2'] = test_df_pre_process['date_delta2']/np.timedelta64(1, 'D')
test_df_pre_process['date_delta3'] = test_df_pre_process['date_delta3']/np.timedelta64(1, 'D')
test_df_pre_process['date_delta4'] = test_df_pre_process['date_delta4']/np.timedelta64(1, 'D')

# Drop uncessary columns
test_df_pre_process = test_df_pre_process.drop(columns=COLUMNS_TO_DROP)

Geometry is in a geographic CRS. Results from 'area' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Geometry is in a geographic CRS. Results from 'length' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.
Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a f

In [43]:
test_df_pre_process = test_df_pre_process.fillna(0)

numeric_features = [col for col in test_df_pre_process.columns if col not in BINARY_FEATURES + CATEGORICAL_FEATURES]
for col_name in numeric_features:
    mean_value = test_df_pre_process[col_name].mean()
    std_value = test_df_pre_process[col_name].std()
    test_df_pre_process[col_name] = (test_df_pre_process[col_name] - mean_value) / std_value

In [44]:
print(Counter(y))

Counter({2: 148435, 3: 100422, 0: 31509, 1: 14305, 4: 1324, 5: 151})


Logistic Regression without feature selection

In [45]:
#test data
X_test = np.array(test_df_pre_process)

# Initialize the logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

# Train the model
model.fit(X_train_scaled, y_train)

# Make predictions on the validation data
y_pred = model.predict(X_val_scaled)

# Evaluate the accuracy of the model on the validation data
accuracy = accuracy_score(y_val, y_pred)
print(f'Model accuracy on validation data: {accuracy}')

# Calculate and print the F1-score
f1 = f1_score(y_val, y_pred , average='weighted')
print(f'Model F1-score on validation data: {f1}')

'''
# 5 - fold cross validation
n_scores = cross_val_score(model, X, y, scoring= 'f1', cv= RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1), n_jobs=-1)
# report the model performance
print('Mean F1-score: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
'''

# Make predictions on the test data
y_pred_test = model.predict(X_test)

## Save results to submission file
pred_df = pd.DataFrame(y_pred_test, columns=['change_type'])
#pred_df_LR.to_csv("LR_submission.csv", index=True, index_label='Id')

Model accuracy on validation data: 0.5027182171197029
Model F1-score on validation data: 0.33635794514878264


Logistic Regression with feature selection - takes a lot of time

In [55]:
'''
#test data
X_test = np.array(test_df_pre_process)

#Embedded methods to feature selection
sel = SelectFromModel(RandomForestClassifier(n_estimators= 100, random_state=10), threshold='median')
sel.fit(X_train_scaled, y_train)

X_train_selected = sel.transform(X_train_scaled)
X_val_selected = sel.transform(X_val_scaled)
X_test_selected = sel.transform(X_test)

# Initialize the logistic regression model
model_LR = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
#model_LR = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, class_weight = 'balanced')

# Train the model
model_LR.fit(X_train_selected, y_train)

# Make predictions on the validation data
y_pred = model_LR.predict(X_val_selected)

# Evaluate the accuracy of the model on the validation data
accuracy = accuracy_score(y_val, y_pred)
print(f'Model accuracy on validation data: {accuracy}')

# Calculate and print the F1-score
f1 = f1_score(y_val, y_pred , average='weighted')
print(f'Model F1-score on validation data: {f1}')

# Make predictions on the test data
pred_y = model_LR.predict(X_test_selected)

## Save results to submission file
pred_df = pd.DataFrame(pred_y, columns=['change_type'])
#pred_df_LR.to_csv("LR_submission.csv", index=True, index_label='Id')
'''

KeyboardInterrupt: 