In [None]:
# This is a collection of all scripts for testing PCs as ML input

In [None]:
import geopandas as gpd
import pandas as pd

In [None]:
# Location of the .geojson input file; point this at your local copy
file_path = r'D:\FOLDER FROM THESIS\THESIS\Processed data\Training ML\filled_manipulated_28_11.geojson'

# Read the .geojson file into a GeoDataFrame
gdf = gpd.read_file(file_path)

# Temporarily lift the pandas display limits so the previews are not truncated
unlimited_display = ('display.max_columns', None, 'display.max_rows', None)
with pd.option_context(*unlimited_display):
    # Show the first and the last 20 rows, each under its own heading
    previews = (
        ("First 20 rows:", gdf.head(20)),
        ("\nLast 20 rows:", gdf.tail(20)),
    )
    for heading, rows in previews:
        print(heading)
        print(rows)

In [None]:
# Columns that are one-hot encoded (via pd.get_dummies) before modelling
categorical_columns = [
    'byg032YdervæggensMateriale',
    'byg033Tagdækningsmateriale',
    'byg056Varmeinstallation',
    'eta006BygningensEtagebetegnelse',
    'landscape',
    'TSYM',
    'byg021BygningensAnvendelse_grouped',
]

In [None]:
# Drop irrelevant fields
columns_to_drop = ['byg021BygningensAnvendelse', 'geometry', 'byg404Koordinat', 'byg406Koordinatsystem', 'x', 'y']
# `gdf` is overwritten here, so on a second run of this cell the columns are
# already gone; errors='ignore' keeps the cell re-runnable instead of raising KeyError
gdf = gdf.drop(columns=columns_to_drop, errors='ignore')

# One-hot encode the categorical columns (all other columns are passed through unchanged)
gdf_reduced = pd.get_dummies(gdf, columns=categorical_columns)

In [None]:
# Variables to remove from the encoded table
coorelated_variables_to_drop = [
    'maksimal5d', 'maksimal14', 'doegn10mm', 'doegn20mm',
    'time2aarsh', 'time5aarsh', 'time10aars', 'time20aars',
    'time50aars', 'time100aar', 'doegn5aars', 'doegn10aar',
    'doegn20aar', 'doegn50aar', 'doegn100aa', 'toerredage',
    'toerreperi', 'potentielf', 'solindstra', 'dagligmint',
    'lavestetem', 'gennemsn_1', 'gennemsnit', 'varmeboelg',
    'doegnetste', 'hedeboelge', 'hoejestete', 'vaekstsaes',
    'ekstremvin', 'maksimaldo', 'skybrud', 'aaretstemp',
    'e_value', 'g_value', 'count', 'building',
    'clay_accu_', 'streamlake', 'sand_accu',
]

# Remove the listed variables; every other column of gdf_reduced is kept as is
gdf_reduced_2 = gdf_reduced.drop(coorelated_variables_to_drop, axis='columns')

In [None]:
#RF
# Random Forest on the leading principal components, evaluated with 5-fold cross-validation.

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report

# One-hot encode the categorical predictors; the 'Damage' target is kept out of the features
gdf_reduced = pd.get_dummies(gdf.drop(columns=['Damage']), columns=categorical_columns)

# Target as a NumPy array so that the positional fold indices from KFold select
# rows by position instead of by the DataFrame's index labels
y = gdf['Damage'].to_numpy()

n_pcs = 200  # Number of leading principal components used as model input

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Store the per-fold classification reports (as dicts)
overall_metrics = []

# Classification report for each fold
for train_index, test_index in kf.split(gdf_reduced):
    features_train = gdf_reduced.iloc[train_index]
    features_test = gdf_reduced.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler and the PCA on the training fold only and merely apply them
    # to the held-out fold, so no test-fold statistics leak into the model input
    scaler = StandardScaler()
    pca = PCA()
    X_train = pca.fit_transform(scaler.fit_transform(features_train))[:, :n_pcs]
    X_test = pca.transform(scaler.transform(features_test))[:, :n_pcs]

    # Initialize and train the Random Forest Classifier
    rf_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=60,
        min_samples_split=4,
        min_samples_leaf=1,
        random_state=42
    )
    rf_model.fit(X_train, y_train)

    # Predictions and Evaluation
    y_pred = rf_model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    overall_metrics.append(report)
    print("Classification Report for a fold:\n", classification_report(y_test, y_pred))

In [None]:
#Adaboost
# AdaBoost (decision-tree base learner) on the leading principal components,
# evaluated with 5-fold cross-validation.

import inspect

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical predictors; the 'Damage' target is kept out of the features
gdf_reduced = pd.get_dummies(gdf.drop(columns=['Damage']), columns=categorical_columns)

# Target as a NumPy array so that the positional fold indices from KFold select
# rows by position instead of by the DataFrame's index labels
y = gdf['Damage'].to_numpy()

n_pcs = 200  # Number of leading principal components used as model input

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the base estimator for AdaBoost
base_estimator = DecisionTreeClassifier(max_depth=6)

# scikit-learn 1.2 renamed AdaBoost's `base_estimator` argument to `estimator`
# (the old name was removed in 1.4); pick whichever the installed version accepts
if 'estimator' in inspect.signature(AdaBoostClassifier).parameters:
    estimator_kwarg = 'estimator'
else:
    estimator_kwarg = 'base_estimator'

# Classification report for each fold
for train_index, test_index in kf.split(gdf_reduced):
    features_train = gdf_reduced.iloc[train_index]
    features_test = gdf_reduced.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler and the PCA on the training fold only and merely apply them
    # to the held-out fold, so no test-fold statistics leak into the model input
    scaler = StandardScaler()
    pca = PCA()
    X_train = pca.fit_transform(scaler.fit_transform(features_train))[:, :n_pcs]
    X_test = pca.transform(scaler.transform(features_test))[:, :n_pcs]

    # Initialize and train the AdaBoostClassifier
    ada_model = AdaBoostClassifier(
        n_estimators=200,
        learning_rate=0.05,
        random_state=42,
        **{estimator_kwarg: base_estimator}
    )
    ada_model.fit(X_train, y_train)

    # Predictions and Evaluation
    y_pred = ada_model.predict(X_test)
    print("Classification Report for a fold:\n", classification_report(y_test, y_pred))


In [None]:
#Gradient boosting
# Gradient Boosting on the leading principal components, evaluated with 5-fold cross-validation.

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical predictors; the 'Damage' target is kept out of the features
gdf_reduced = pd.get_dummies(gdf.drop(columns=['Damage']), columns=categorical_columns)

# Target as a NumPy array so that the positional fold indices from KFold select
# rows by position instead of by the DataFrame's index labels
y = gdf['Damage'].to_numpy()

n_pcs = 200  # Number of leading principal components used as model input

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Classification report for each fold
for train_index, test_index in kf.split(gdf_reduced):
    features_train = gdf_reduced.iloc[train_index]
    features_test = gdf_reduced.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler and the PCA on the training fold only and merely apply them
    # to the held-out fold, so no test-fold statistics leak into the model input
    scaler = StandardScaler()
    pca = PCA()
    X_train = pca.fit_transform(scaler.fit_transform(features_train))[:, :n_pcs]
    X_test = pca.transform(scaler.transform(features_test))[:, :n_pcs]

    # Initialize and train the Gradient Boosting Classifier
    gb_model = GradientBoostingClassifier(
        n_estimators=900,
        learning_rate=0.07333333333333333,
        max_depth=8,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42
    )
    gb_model.fit(X_train, y_train)

    # Predictions and Evaluation
    y_pred = gb_model.predict(X_test)
    print("Classification Report for a fold:\n", classification_report(y_test, y_pred))


In [None]:
#ANN
# Small feed-forward network on the leading principal components,
# evaluated with 5-fold cross-validation.

import geopandas as gpd
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# One-Hot Encode Categorical Variables
categorical_columns = ['byg032YdervæggensMateriale', 'byg033Tagdækningsmateriale', 'byg056Varmeinstallation', 
                       'eta006BygningensEtagebetegnelse', 'landscape', 'TSYM', 'byg021BygningensAnvendelse_grouped']
gdf_encoded = pd.get_dummies(gdf, columns=categorical_columns)

# Feature table without the 'Damage' target
features = gdf_encoded.drop(columns=['Damage'])

# Target as a NumPy array so that the positional fold indices from KFold select
# rows by position instead of by the DataFrame's index labels
y = gdf['Damage'].to_numpy()

n_pcs = 200  # Number of leading principal components used as model input

# K-Fold Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1

for train_index, test_index in kf.split(features):
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    # Fit the scaler and the PCA on the training fold only and merely apply them
    # to the held-out fold, so no test-fold statistics leak into the model input
    scaler = StandardScaler()
    pca = PCA()
    X_train_fold = pca.fit_transform(scaler.fit_transform(features.iloc[train_index]))[:, :n_pcs]
    X_test_fold = pca.transform(scaler.transform(features.iloc[test_index]))[:, :n_pcs]

    # Define the model architecture (a fresh, untrained model for every fold)
    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_train_fold.shape[1],)),
        Dropout(0.1),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    print('Training for fold', fold_no)
    model.fit(X_train_fold, y_train_fold, epochs=20, batch_size=128, verbose=0)

    # Generate generalization metrics
    scores = model.evaluate(X_test_fold, y_test_fold, verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    fold_no += 1

    # Predictions and Evaluation: threshold the sigmoid output at 0.5 to get class labels
    y_pred_fold = (model.predict(X_test_fold) > 0.5).astype("int32")
    print("Classification Report for a fold:\n", classification_report(y_test_fold, y_pred_fold))


In [None]:
#XG Boost
# XGBoost on the leading principal components, evaluated with 5-fold cross-validation.

import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# One-Hot Encode Categorical Variables and exclude the 'Damage' variable
categorical_columns = ['byg032YdervæggensMateriale', 'byg033Tagdækningsmateriale', 'byg056Varmeinstallation', 
                       'eta006BygningensEtagebetegnelse', 'landscape', 'TSYM', 'byg021BygningensAnvendelse_grouped']
gdf_encoded = pd.get_dummies(gdf.drop(columns=['Damage']), columns=categorical_columns)

# Target as a NumPy array so that the positional fold indices from KFold select
# rows by position instead of by the DataFrame's index labels
y = gdf['Damage'].to_numpy()

n_pcs = 200  # Number of leading principal components used as model input

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Classification report for each fold
for train_index, test_index in kf.split(gdf_encoded):
    features_train = gdf_encoded.iloc[train_index]
    features_test = gdf_encoded.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler and the PCA on the training fold only and merely apply them
    # to the held-out fold, so no test-fold statistics leak into the model input
    scaler = StandardScaler()
    pca = PCA()
    X_train = pca.fit_transform(scaler.fit_transform(features_train))[:, :n_pcs]
    X_test = pca.transform(scaler.transform(features_test))[:, :n_pcs]

    # Initialize and train the XGBClassifier
    xgb_model = xgb.XGBClassifier(
        n_estimators=800,
        learning_rate=0.2,
        max_depth=2,
        subsample=0.6,
        colsample_bytree=0.1,
        gamma=0.2,
        min_child_weight=6,
        reg_alpha=0.1,
        reg_lambda=0.2,
        random_state=42
    )
    xgb_model.fit(X_train, y_train)

    # Predictions and Evaluation
    y_pred = xgb_model.predict(X_test)
    print("Classification Report for a fold:\n", classification_report(y_test, y_pred))


In [None]:
#Light GBM
# LightGBM on the leading principal components, evaluated with 5-fold cross-validation.

import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical predictors; the 'Damage' target is kept out of the features
gdf_reduced = pd.get_dummies(gdf.drop(columns=['Damage']), columns=categorical_columns)

# Target as a NumPy array so that the positional fold indices from KFold select
# rows by position instead of by the DataFrame's index labels
y = gdf['Damage'].to_numpy()

n_pcs = 200  # Number of leading principal components used as model input

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Classification report for each fold
for train_index, test_index in kf.split(gdf_reduced):
    features_train = gdf_reduced.iloc[train_index]
    features_test = gdf_reduced.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler and the PCA on the training fold only and merely apply them
    # to the held-out fold, so no test-fold statistics leak into the model input
    scaler = StandardScaler()
    pca = PCA()
    X_train = pca.fit_transform(scaler.fit_transform(features_train))[:, :n_pcs]
    X_test = pca.transform(scaler.transform(features_test))[:, :n_pcs]

    # Initialize and train the LightGBM Classifier
    lgb_model = lgb.LGBMClassifier(
        n_estimators=250,
        learning_rate=0.15,
        max_depth=9,
        num_leaves=100,
        random_state=42
    )
    lgb_model.fit(X_train, y_train)

    # Predictions and Evaluation
    y_pred = lgb_model.predict(X_test)
    print("Classification Report for a fold:\n", classification_report(y_test, y_pred))


In [None]:
#Gaussian Naive Bayes
# Gaussian Naive Bayes on the leading principal components, evaluated with 5-fold cross-validation.

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# One-Hot Encode Categorical Variables and exclude the 'Damage' variable
categorical_columns = ['byg032YdervæggensMateriale', 'byg033Tagdækningsmateriale', 'byg056Varmeinstallation', 
                       'eta006BygningensEtagebetegnelse', 'landscape', 'TSYM', 'byg021BygningensAnvendelse_grouped']
gdf_encoded = pd.get_dummies(gdf.drop(columns=['Damage']), columns=categorical_columns)

# Target as a NumPy array so that the positional fold indices from KFold select
# rows by position instead of by the DataFrame's index labels
y = gdf['Damage'].to_numpy()

n_pcs = 200  # Number of leading principal components used as model input

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Classification report for each fold
for train_index, test_index in kf.split(gdf_encoded):
    features_train = gdf_encoded.iloc[train_index]
    features_test = gdf_encoded.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler and the PCA on the training fold only and merely apply them
    # to the held-out fold, so no test-fold statistics leak into the model input
    scaler = StandardScaler()
    pca = PCA()
    X_train = pca.fit_transform(scaler.fit_transform(features_train))[:, :n_pcs]
    X_test = pca.transform(scaler.transform(features_test))[:, :n_pcs]

    # Initialize and train the Gaussian Naive Bayes model
    gnb_model = GaussianNB(var_smoothing=5.455594781168514e-06)
    gnb_model.fit(X_train, y_train)

    # Predictions and Evaluation
    y_pred = gnb_model.predict(X_test)
    print("Classification Report for a fold:\n", classification_report(y_test, y_pred))


In [None]:
#KNN 
# k-nearest neighbours on the leading principal components, evaluated with 5-fold cross-validation.

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical predictors; the 'Damage' target is kept out of the features
gdf_encoded = pd.get_dummies(gdf.drop(columns=['Damage']), columns=categorical_columns)

# Target as a NumPy array so that the positional fold indices from KFold select
# rows by position instead of by the DataFrame's index labels
y = gdf['Damage'].to_numpy()

n_pcs = 200  # Number of leading principal components used as model input

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Classification report for each fold
for fold, (train_index, test_index) in enumerate(kf.split(gdf_encoded), 1):
    features_train = gdf_encoded.iloc[train_index]
    features_test = gdf_encoded.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler and the PCA on the training fold only and merely apply them
    # to the held-out fold, so no test-fold statistics leak into the model input
    scaler = StandardScaler()
    pca = PCA()
    X_train = pca.fit_transform(scaler.fit_transform(features_train))[:, :n_pcs]
    X_test = pca.transform(scaler.transform(features_test))[:, :n_pcs]

    # Initialize and train the KNN Classifier
    knn_model = KNeighborsClassifier(
        n_neighbors=7,
        weights='distance',
        algorithm='ball_tree'
    )
    knn_model.fit(X_train, y_train)

    # Predictions and Evaluation
    y_pred = knn_model.predict(X_test)
    print(f"Classification Report for fold {fold}:\n", classification_report(y_test, y_pred))


In [None]:
#Stacking
# Stacked ensemble (RF + LightGBM + Gaussian NB, combined by a Ridge classifier)
# on the leading principal components, evaluated with 5-fold cross-validation.

import numpy as np
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import RidgeClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import geopandas as gpd

# Reload the raw data so this cell does not depend on earlier in-place changes to `gdf`
gdf = gpd.read_file(file_path)

# Drop irrelevant fields and one-hot encode categorical variables
columns_to_drop = ['byg021BygningensAnvendelse', 'geometry', 'byg404Koordinat', 'byg406Koordinatsystem', 'x', 'y']
categorical_columns = ['byg032YdervæggensMateriale', 'byg033Tagdækningsmateriale', 'byg056Varmeinstallation', 
                       'eta006BygningensEtagebetegnelse', 'landscape', 'TSYM', 'byg021BygningensAnvendelse_grouped']
gdf = gdf.drop(columns=columns_to_drop)
gdf_encoded = pd.get_dummies(gdf, columns=categorical_columns)

# Feature table without the 'Damage' target
features = gdf_encoded.drop(columns=['Damage'])

# Target as a NumPy array so that the positional fold indices from KFold select
# rows by position instead of by the DataFrame's index labels
y = gdf['Damage'].to_numpy()

n_pcs = 200  # Number of leading principal components used as model input

# Define the base learners
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('lgbm', LGBMClassifier(n_estimators=100, random_state=42)),
    ('gnb', GaussianNB())
]

# Define the final estimator
final_estimator = RidgeClassifier()

# Define the Stacking Classifier (cv=5 is the inner CV used to build the meta-features)
stacked_model = StackingClassifier(
    estimators=base_learners, 
    final_estimator=final_estimator, 
    cv=5
)

# Define 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Classification report for each fold
fold_no = 1
for train_index, test_index in kf.split(features):
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    # Fit the scaler and the PCA on the training fold only and merely apply them
    # to the held-out fold, so no test-fold statistics leak into the model input
    scaler = StandardScaler()
    pca = PCA()
    X_train_fold = pca.fit_transform(scaler.fit_transform(features.iloc[train_index]))[:, :n_pcs]
    X_test_fold = pca.transform(scaler.transform(features.iloc[test_index]))[:, :n_pcs]

    # Train the stacked model
    print('Training for fold', fold_no)
    stacked_model.fit(X_train_fold, y_train_fold)

    # Predictions and Evaluation
    y_pred_fold = stacked_model.predict(X_test_fold)
    print(f"Classification Report for fold {fold_no}:\n", classification_report(y_test_fold, y_pred_fold))
    fold_no += 1
