In [1]:
# Initialization for the machine learning model
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
import optuna
import xgboost as xgb
import os
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from matplotlib import pyplot as plt

## load data

In [2]:
country = 'afghan'
mode = 'Train'
feature = 'raw'

path = '/app/dev/GeoITU/data'

X = np.load(os.path.join(path, '{}_{}_{}.npy'.format(country, mode, feature)), allow_pickle=True)
y_labels= np.load(os.path.join(path, '{}_{}_labels.npy'.format(country, mode)), allow_pickle=True)

assert X.shape[0] == y_labels.shape[0]

X = X.reshape(X.shape[0], -1)
print(X.shape, y_labels.shape)

(500, 180) (500,)


## rf

In [6]:
# # Setting up the stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []  # List to store accuracy of each fold
f_scores = []
fold_num = 0
num_trial = 50

# 5-fold Stratified Cross Validation loop
for train_index, test_index in skf.split(X, y_labels):
    fold_num += 1

    # Splitting the dataset for this fold
    X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
    y_train_labels, y_test_labels = [y_labels[i] for i in train_index], [
        y_labels[i] for i in test_index
    ]
    
    
    # optimize params for single fold
    if fold_num == 1:  


        def objective(trial):
            params = {
            'n_estimators' : trial.suggest_int('n_estimators', 100, 1000),
            'max_depth' : trial.suggest_int('max_depth', 3, 15),
            # 'max_features' : trial.suggest_categorical('max_features', ['auto', 'sqrt']), 
            'min_samples_split' : trial.suggest_int('min_samples_split', 5, 32),
            'bootstrap' : trial.suggest_categorical('bootstrap', [True, False]),
            'n_jobs' : trial.suggest_categorical('n_jobs', [-1]) #fixed. use all cpus
            }

            clf = RandomForestClassifier(**params)
            clf.fit(X_train, y_train_labels)

            # Making predictions on the test set
            y_pred = clf.predict(X_test)

            # Calculating and reporting the accuracy
            accuracy = accuracy_score(y_test_labels, y_pred)
            return accuracy


        # optimize study
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=num_trial) 
        print(study.best_params)
                        
    # Model training
    clf = RandomForestClassifier(**study.best_params)  # change classifier here
    clf.fit(X_train, y_train_labels)
    
    # clf = xgb.XGBClassifier()
    # clf.fit(X_train, le.transform(y_train_labels))

    # Making predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculating and reporting the accuracy
    accuracy = accuracy_score(y_test_labels, y_pred)
    accuracies.append(accuracy)  # Storing the accuracy
    
    # Calculating and reporting the fscore
    f_score = f1_score(y_test_labels, y_pred, average='weighted')
    f_scores.append(f_score)  # Storing the accuracy
    print(f"Fold {fold_num} Accuracy: {accuracy}")
    print(f"Fold {fold_num} F_score: {f_score}")

# Reporting the final results
avg_accuracy = np.mean(accuracies)
avg_fscore = np.mean(f_scores)
print(f"Average Accuracy across all folds: {avg_accuracy:.4f}")
print(f"Average Fscore across all folds: {avg_fscore:.4f}")


[32m[I 2023-10-27 23:40:13,291][0m A new study created in memory with name: no-name-7ece32f2-7c85-4cf7-b819-5a37e4c3d465[0m
[32m[I 2023-10-27 23:40:14,863][0m Trial 0 finished with value: 0.81 and parameters: {'n_estimators': 552, 'max_depth': 12, 'min_samples_split': 5, 'bootstrap': True, 'n_jobs': -1}. Best is trial 0 with value: 0.81.[0m
[32m[I 2023-10-27 23:40:15,517][0m Trial 1 finished with value: 0.78 and parameters: {'n_estimators': 289, 'max_depth': 11, 'min_samples_split': 32, 'bootstrap': False, 'n_jobs': -1}. Best is trial 0 with value: 0.81.[0m
[32m[I 2023-10-27 23:40:17,280][0m Trial 2 finished with value: 0.81 and parameters: {'n_estimators': 820, 'max_depth': 8, 'min_samples_split': 11, 'bootstrap': False, 'n_jobs': -1}. Best is trial 0 with value: 0.81.[0m
[32m[I 2023-10-27 23:40:19,857][0m Trial 3 finished with value: 0.79 and parameters: {'n_estimators': 937, 'max_depth': 6, 'min_samples_split': 7, 'bootstrap': True, 'n_jobs': -1}. Best is trial 0 with 

{'n_estimators': 552, 'max_depth': 12, 'min_samples_split': 5, 'bootstrap': True, 'n_jobs': -1}
Fold 1 Accuracy: 0.8
Fold 1 F_score: 0.8000000000000002
Fold 2 Accuracy: 0.87
Fold 2 F_score: 0.8699869986998701
Fold 3 Accuracy: 0.79
Fold 3 F_score: 0.7899789978997899
Fold 4 Accuracy: 0.88
Fold 4 F_score: 0.879951980792317
Fold 5 Accuracy: 0.78
Fold 5 F_score: 0.7796474358974359
Average Accuracy across all folds: 0.8240
Average Fscore across all folds: 0.8239
