In [None]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from utils_python import *

#plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's legacy global RNG so code drawing from it repeats across runs
np.random.seed(420)
# Remove the cap on displayed columns so wide frames render in full
pd.options.display.max_columns = None

In [None]:
# Read dataset with encoding.
# The dataset directory defaults to the location originally hard-coded here,
# but can be overridden with the FLOOD_DATA_DIR environment variable so the
# notebook runs on other machines without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "FLOOD_DATA_DIR",
        "C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset",
    )
)
DATA_FILE = DATA_DIR / "filtered_dataset_with_geo_info_with_encoding.parquet.gzip"

df = pd.read_parquet(DATA_FILE)

In [None]:
# Preview the first rows of the loaded frame (shown as the cell's output)
df.head()

### Drop all county and yearOfLoss columns

In [None]:
# Collect every column whose name carries the "countyCode_" prefix
columns_to_remove = list(
    filter(lambda name: name.startswith("countyCode_"), df.columns)
)

# Removal list: the county columns plus "yearOfLoss"
columns_to_drop = [*columns_to_remove, "yearOfLoss"]

# drop_columns is provided by utils_python; its result replaces df
df = drop_columns(df, columns_to_drop)

### Scale the numeric columns and draw a sample of the DataFrame if needed

In [None]:
# Optional scaling step: the helper (from utils_python) is called with
# scale=False, i.e. scaling is switched off for this run.
# NOTE(review): "buildingrelativeDamage" is presumably the target column the
# helper leaves unscaled -- confirm against utils_python.
df = scale_variables(
    df,
    "buildingrelativeDamage",
    scale=False,
)

# Optional sampling step, enabled here via sampling_required=True.
# NOTE(review): 10000 is presumably the number of rows to keep -- confirm
# against utils_python.
df = sample_dataframe(
    df,
    10000,
    sampling_required=True,
)

### Data creation for modeling

In [None]:
# Names of the 30 SVD columns: svd_1 ... svd_30
svd_columns = [f'svd_{i}' for i in range(1, 31)]

# Keep the SVD columns in their own, independent DataFrame
svd_df = df[svd_columns].copy()

# Remove the SVD columns from the modelling frame.
# Reassign instead of dropping in place: df was returned by a sampling helper
# and may be a slice of a larger frame, where in-place mutation is unreliable
# and can trigger pandas' SettingWithCopyWarning.
df = df.drop(columns=svd_columns)

In [None]:
# Create baseline arrays: target vector and feature matrix (all remaining columns)
Y = df['buildingrelativeDamage'].values
X = df.drop(columns='buildingrelativeDamage').values

# Dictionary of MAE per model: the baseline first, then one entry per SVD
# variable as it gets included.
# Only the baseline key is created here (placeholder 0 until it is fitted).
# The SVD keys are deliberately NOT pre-populated: dicts keep insertion order,
# so pre-filling svd_1..svd_30 would pin the entries to index order and any
# later table or plot built from this dict would show the cumulative MAEs out
# of their actual inclusion order. Adding each key when the variable is
# selected keeps the dict in selection order.
mae_scores = {'Baseline': 0}

### Baseline Model

In [None]:
# Baseline: random forest (absolute-error criterion, fixed random_state)
# evaluated with 5-fold cross-validation on the baseline features
regr = RandomForestRegressor(criterion='absolute_error', random_state=420)
cv_results = cross_validate(
    regr,
    X,
    Y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

# The scorer reports negated MAE per fold; average the folds and flip the sign
mae_scores['Baseline'] = -cv_results['test_score'].mean()

print(f"Baseline MAE: {mae_scores['Baseline']}")

### Check geoinfo column inclusion

In [None]:
# Greedy forward selection over the SVD columns still held in svd_df.
# The number of rounds is read from svd_df rather than a literal 30, so it
# stays in step with however many SVD columns were split off earlier, and
# re-running this cell once svd_df is empty performs zero rounds instead of
# calling the selection helper with no candidates left.
n_rounds = svd_df.shape[1]

# One slot per selection round for the score returned by svd_selection_rf
svd_selection_cv_scores = np.zeros(n_rounds)

# Variable selection and iterative cross-validation
for i in range(1, n_rounds + 1):
    # Identify the SVD variable with the lowest MAE (helper from utils_python).
    # NOTE(review): the second return value is presumably that candidate's
    # CV MAE -- confirm against utils_python.
    best_svd_var, svd_selection_cv_scores[i-1] = svd_selection_rf(X, svd_df, Y)

    # Append the chosen column to the feature matrix as a new last column
    X = np.concatenate([X, svd_df[[best_svd_var]].values], axis=1)

    # Remove the chosen column from the remaining candidates
    svd_df = svd_df.drop(columns=best_svd_var)

    # Rerun 5-fold CV on the enlarged feature matrix; the scorer returns
    # negated MAE per fold, hence the sign flip after averaging
    regr = RandomForestRegressor(criterion='absolute_error', random_state=420)
    cv_results = cross_validate(regr, X, Y, cv=5, scoring='neg_mean_absolute_error')
    mae_scores[best_svd_var] = -np.mean(cv_results['test_score'])
    print(f"Iteration: {i}, Best SVD Var: {best_svd_var}, MAE: {mae_scores[best_svd_var]}")

### Plots

In [None]:
# Tabulate mae_scores: one row per entry, in the dict's iteration order
mae_df = pd.DataFrame(
    {
        'Variable': list(mae_scores.keys()),
        'MAE': list(mae_scores.values()),
    }
)

# Line plot of MAE per entry, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=mae_df,
    x='Variable',
    y='MAE',
    marker='o',
    color='darkgreen',
    markersize=8,
    ax=ax,
)

# Tilt the x tick labels so the variable names stay readable
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('MAE Across Models with SVD Variable Inclusion (RandomForest Regressor)', fontsize=14)
ax.set_xlabel('SVD Variable Included', fontsize=12)
ax.set_ylabel('Mean Absolute Error (MAE)', fontsize=12)
sns.despine(ax=ax)  # drop the top and right spines
ax.grid(True, which='major', linestyle='--', linewidth='0.5', color='gray')  # dashed major grid
fig.tight_layout()  # fit labels and title inside the figure area

plt.show()