#####  07-11-2024 Trying from scratch

In [1]:
import pandas as pd

# Load the dataset to explore its structure
file_path = 'C:/Users/muvva/OneDrive/Desktop/Rp/2021.csv'
df = pd.read_csv(file_path)

# Display the first few rows and general information of the dataset to understand its structure and columns
df.head(), df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38263 entries, 0 to 38262
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   lat        38263 non-null  float64
 1   lon        38263 non-null  float64
 2   date       38263 non-null  object 
 3   vpd_avg    38263 non-null  float64
 4   vpd_max    38263 non-null  float64
 5   vpd_min    38263 non-null  float64
 6   vpd_avg_1  38263 non-null  float64
 7   vpd_max_1  38263 non-null  float64
 8   vpd_min_1  38263 non-null  float64
dtypes: float64(8), object(1)
memory usage: 2.6+ MB


(     lat    lon        date   vpd_avg    vpd_max   vpd_min  vpd_avg_1  \
 0  29.54  79.44  01-01-2021  5.112470  14.693333  1.232054   2.942601   
 1  30.04  80.05  01-01-2021  6.163787  12.119992  3.539694   5.202076   
 2  30.49  78.58  01-01-2021  4.586646  10.282862  1.684033   3.256362   
 3  30.66  78.97  01-01-2021  1.607819   2.736088  0.861731   1.478523   
 4  31.29  77.28  01-01-2021  3.901893   9.718175  1.339673   2.795041   
 
    vpd_max_1  vpd_min_1  
 0   8.837433   0.181849  
 1   8.625843   3.514763  
 2   6.369186   1.195103  
 3   2.910768   0.960221  
 4   6.661133   1.028199  ,
 None)

In [2]:
import numpy as np

# Define the proportion to decrease VPD values to simulate non-fire conditions
vpd_reduction_factor = 0.9  # 10% reduction in VPD values

# Copy the original dataset and apply the VPD reduction factor to generate synthetic non-fire samples
non_fire_df = df.copy()
non_fire_df['vpd_avg'] *= vpd_reduction_factor
non_fire_df['vpd_max'] *= vpd_reduction_factor
non_fire_df['vpd_min'] *= vpd_reduction_factor
non_fire_df['vpd_avg_1'] *= vpd_reduction_factor
non_fire_df['vpd_max_1'] *= vpd_reduction_factor
non_fire_df['vpd_min_1'] *= vpd_reduction_factor

# Add labels to both the original and synthetic datasets: 1 for fire, 0 for non-fire
df['label'] = 1  # Original fire data
non_fire_df['label'] = 0  # Synthetic non-fire data

# Concatenate the original and synthetic datasets to create a balanced dataset
balanced_df = pd.concat([df, non_fire_df], ignore_index=True)

# Check the balanced dataset structure and label distribution
balanced_df.head(), balanced_df['label'].value_counts()


(     lat    lon        date   vpd_avg    vpd_max   vpd_min  vpd_avg_1  \
 0  29.54  79.44  01-01-2021  5.112470  14.693333  1.232054   2.942601   
 1  30.04  80.05  01-01-2021  6.163787  12.119992  3.539694   5.202076   
 2  30.49  78.58  01-01-2021  4.586646  10.282862  1.684033   3.256362   
 3  30.66  78.97  01-01-2021  1.607819   2.736088  0.861731   1.478523   
 4  31.29  77.28  01-01-2021  3.901893   9.718175  1.339673   2.795041   
 
    vpd_max_1  vpd_min_1  label  
 0   8.837433   0.181849      1  
 1   8.625843   3.514763      1  
 2   6.369186   1.195103      1  
 3   2.910768   0.960221      1  
 4   6.661133   1.028199      1  ,
 label
 1    38263
 0    38263
 Name: count, dtype: int64)

In [3]:
from sklearn.cluster import KMeans
from datetime import datetime

# Feature 1: Calculate VPD differences and relative changes
balanced_df['vpd_avg_diff'] = balanced_df['vpd_avg'] - balanced_df['vpd_avg_1']
balanced_df['vpd_max_diff'] = balanced_df['vpd_max'] - balanced_df['vpd_max_1']
balanced_df['vpd_min_diff'] = balanced_df['vpd_min'] - balanced_df['vpd_min_1']

balanced_df['vpd_avg_pct_change'] = ((balanced_df['vpd_avg'] - balanced_df['vpd_avg_1']) / balanced_df['vpd_avg_1']).replace([np.inf, -np.inf], 0)
balanced_df['vpd_max_pct_change'] = ((balanced_df['vpd_max'] - balanced_df['vpd_max_1']) / balanced_df['vpd_max_1']).replace([np.inf, -np.inf], 0)
balanced_df['vpd_min_pct_change'] = ((balanced_df['vpd_min'] - balanced_df['vpd_min_1']) / balanced_df['vpd_min_1']).replace([np.inf, -np.inf], 0)

# Feature 2: Apply clustering on spatial data (lat, lon) to identify fire-prone regions
# Use a KMeans approach for simplicity; we'll test with 10 clusters initially
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
balanced_df['location_cluster'] = kmeans.fit_predict(balanced_df[['lat', 'lon']])

# Calculate the distance to the cluster center for each point as a feature
balanced_df['distance_to_cluster_center'] = kmeans.transform(balanced_df[['lat', 'lon']]).min(axis=1)

# Feature 3: Extract month from the date for seasonal analysis
balanced_df['month'] = balanced_df['date'].apply(lambda x: datetime.strptime(x, '%d-%m-%Y').month)

# Review the first few rows to verify feature engineering steps
balanced_df.head()


Unnamed: 0,lat,lon,date,vpd_avg,vpd_max,vpd_min,vpd_avg_1,vpd_max_1,vpd_min_1,label,vpd_avg_diff,vpd_max_diff,vpd_min_diff,vpd_avg_pct_change,vpd_max_pct_change,vpd_min_pct_change,location_cluster,distance_to_cluster_center,month
0,29.54,79.44,01-01-2021,5.11247,14.693333,1.232054,2.942601,8.837433,0.181849,1,2.169869,5.8559,1.050205,0.737398,0.662625,5.775165,4,0.608438,1
1,30.04,80.05,01-01-2021,6.163787,12.119992,3.539694,5.202076,8.625843,3.514763,1,0.961711,3.494149,0.024931,0.184871,0.405079,0.007093,4,1.124033,1
2,30.49,78.58,01-01-2021,4.586646,10.282862,1.684033,3.256362,6.369186,1.195103,1,1.330284,3.913676,0.48893,0.408518,0.61447,0.409111,4,0.70473,1
3,30.66,78.97,01-01-2021,1.607819,2.736088,0.861731,1.478523,2.910768,0.960221,1,0.129296,-0.17468,-0.098489,0.087449,-0.060012,-0.102569,4,0.778272,1
4,31.29,77.28,01-01-2021,3.901893,9.718175,1.339673,2.795041,6.661133,1.028199,1,1.106852,3.057042,0.311474,0.396006,0.458937,0.302931,4,2.174198,1


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, mean_squared_error
import xgboost as xgb
import numpy as np

# Separate features and target label
X = balanced_df.drop(columns=['label', 'date'])  # Drop label and date as they are not features
y = balanced_df['label']

# Split data into training and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature scaling for numerical columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the XGBoost classifier
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_clf.fit(X_train_scaled, y_train)

# Predictions on test set
y_pred = xgb_clf.predict(X_test_scaled)
y_pred_proba = xgb_clf.predict_proba(X_test_scaled)[:, 1]  # Probabilities for calculating RMSE

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_proba))

accuracy, precision, recall, rmse


Parameters: { "use_label_encoder" } are not used.



(0.8154971906441918,
 0.8405981097474962,
 0.7786488958578335,
 0.37113018502286776)

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, mean_squared_error
import numpy as np

# Initialize and train XGBoost classifier
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_clf.fit(X_train_scaled, y_train)

# Predictions
y_pred = xgb_clf.predict(X_test_scaled)
y_pred_proba = xgb_clf.predict_proba(X_test_scaled)[:, 1]

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_proba))

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("RMSE:", rmse)


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8154971906441918
Precision: 0.8405981097474962
Recall: 0.7786488958578335
RMSE: 0.37113018502286776


In [8]:
######

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline

# Initialize the base models
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
lr_clf = LogisticRegression(max_iter=1000, random_state=42)

# Create a stacking ensemble with the base models and a logistic regression as the meta-model
stacking_clf = StackingClassifier(
    estimators=[('xgb', xgb_clf), ('rf', rf_clf), ('lr', lr_clf)],
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    stack_method='predict_proba'  # Use probabilities for better performance in meta-model
)

# Train the stacking ensemble model
stacking_clf.fit(X_train_scaled, y_train)

# Predictions on the test set
y_pred_stacking = stacking_clf.predict(X_test_scaled)
y_pred_proba_stacking = stacking_clf.predict_proba(X_test_scaled)[:, 1]

# Evaluate the stacking model performance
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
precision_stacking = precision_score(y_test, y_pred_stacking)
recall_stacking = recall_score(y_test, y_pred_stacking)
rmse_stacking = np.sqrt(mean_squared_error(y_test, y_pred_proba_stacking))

accuracy_stacking, precision_stacking, recall_stacking, rmse_stacking


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



(0.9014765451456945,
 0.9045424621461488,
 0.897687181497452,
 0.25196805762487023)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
lr_clf = LogisticRegression(max_iter=1000, random_state=42)

from sklearn.ensemble import StackingClassifier

stacking_clf = StackingClassifier(
    estimators=[('xgb', xgb_clf), ('rf', rf_clf), ('lr', lr_clf)],
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    stack_method='predict_proba'
)

stacking_clf.fit(X_train_scaled, y_train)
y_pred_stacking = stacking_clf.predict(X_test_scaled)
y_pred_proba_stacking = stacking_clf.predict_proba(X_test_scaled)[:, 1]

from sklearn.metrics import accuracy_score, precision_score, recall_score, mean_squared_error
import numpy as np

accuracy = accuracy_score(y_test, y_pred_stacking)
precision = precision_score(y_test, y_pred_stacking)
recall = recall_score(y_test, y_pred_stacking)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_proba_stacking))

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, RMSE: {rmse}")


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.9014765451456945, Precision: 0.9045424621461488, Recall: 0.897687181497452, RMSE: 0.25196805762487023


In [11]:
#####

In [12]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

# Define parameter grid for advanced tuning
param_grid = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Initialize the XGBoost classifier
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_iter=50,  # Limit number of parameter settings for faster tuning
    scoring='accuracy',
    cv=3,  # 3-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Perform the random search on scaled training data
random_search.fit(X_train_scaled, y_train)

# Retrieve best parameters and score
best_params = random_search.best_params_
best_score = random_search.best_score_

best_params, best_score


Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.



({'subsample': 1.0,
  'reg_lambda': 1,
  'reg_alpha': 0.5,
  'n_estimators': 500,
  'min_child_weight': 1,
  'max_depth': 10,
  'learning_rate': 0.1,
  'gamma': 0,
  'colsample_bytree': 0.6},
 0.8789448215053207)

In [13]:
#####

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
import xgboost as xgb

# Load your dataset (replace with your actual data)
# df = pd.read_csv('your_dataset.csv')

# Assuming 'X' is your feature set and 'y' is your target variable
# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to handle data imbalance (if applicable)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Initialize base models
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
lr_clf = LogisticRegression(max_iter=1000, random_state=42)

# Define hyperparameter grids for base models
xgb_param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_lambda': [1, 1.5],
    'reg_alpha': [0, 0.5],
    'min_child_weight': [1, 5]
}

rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt', 'log2']
}

lr_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['liblinear', 'saga']
}

# Perform RandomizedSearchCV for XGBoost
xgb_random_search = RandomizedSearchCV(xgb_clf, xgb_param_grid, n_iter=20, cv=5, scoring='accuracy', random_state=42)
xgb_random_search.fit(X_train_resampled, y_train_resampled)

# Perform RandomizedSearchCV for RandomForest
rf_random_search = RandomizedSearchCV(rf_clf, rf_param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42)
rf_random_search.fit(X_train_resampled, y_train_resampled)

# Perform RandomizedSearchCV for LogisticRegression
lr_random_search = RandomizedSearchCV(lr_clf, lr_param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42)
lr_random_search.fit(X_train_resampled, y_train_resampled)

# Get the best models after tuning
best_xgb = xgb_random_search.best_estimator_
best_rf = rf_random_search.best_estimator_
best_lr = lr_random_search.best_estimator_

# Create a stacking ensemble model
stacking_clf = StackingClassifier(
    estimators=[('xgb', best_xgb), ('rf', best_rf), ('lr', best_lr)],
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    stack_method='predict_proba'
)

# Train the stacking model on resampled data
stacking_clf.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred_stacking = stacking_clf.predict(X_test_scaled)
y_pred_proba_stacking = stacking_clf.predict_proba(X_test_scaled)[:, 1]

# Evaluate the performance
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
precision_stacking = precision_score(y_test, y_pred_stacking)
recall_stacking = recall_score(y_test, y_pred_stacking)
f1_stacking = f1_score(y_test, y_pred_stacking)

# Print the evaluation metrics
print(f"Accuracy: {accuracy_stacking:.4f}")
print(f"Precision: {precision_stacking:.4f}")
print(f"Recall: {recall_stacking:.4f}")
print(f"F1-Score: {f1_stacking:.4f}")

# Print classification report for a detailed view
print("\nClassification Report:")
print(classification_report(y_test, y_pred_stacking))

# Evaluate using cross-validation
cv_scores = cross_val_score(stacking_clf, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
print(f"Cross-validated accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Accuracy: 0.8987
Precision: 0.9010
Recall: 0.8961
F1-Score: 0.8986

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      7645
           1       0.90      0.90      0.90      7661

    accuracy                           0.90     15306
   macro avg       0.90      0.90      0.90     15306
weighted avg       0.90      0.90      0.90     15306



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Cross-validated accuracy: 0.8876 ± 0.0022


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load your dataset (replace with your actual data)
# df = pd.read_csv('your_dataset.csv')

# Assuming 'X' is your feature set and 'y' is your target variable
X = balanced_df.drop(columns=['target_column'])  # Replace 'target_column' with the actual target column name
y = balanced_df['target_column']  # Replace 'target_column' with the actual target column name

# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to handle data imbalance (if applicable)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Initialize RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42)

# Define hyperparameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform RandomizedSearchCV for hyperparameter tuning
rf_random_search = RandomizedSearchCV(rf_clf, rf_param_grid, n_iter=20, cv=5, scoring='accuracy', random_state=42)
rf_random_search.fit(X_train_resampled, y_train_resampled)

# Get the best model after tuning
best_rf = rf_random_search.best_estimator_

# Train the model with the best parameters
best_rf.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred_rf = best_rf.predict(X_test_scaled)

# Evaluate the performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

# Print evaluation metrics
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")

# Print classification report for a detailed view
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

# Evaluate using cross-validation
cv_scores = cross_val_score(best_rf, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
print(f"Cross-validated accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


KeyError: "['target_column'] not found in axis"