### Feature Visualizations

In [None]:
from helper import dataset

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the training split through the project's dataset helper.
d = dataset()
df = d.get_training()

# features related to sunlight
sunlight = ['Aspect', 'Slope', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']

# features related to distance
distance = ['Elevation', 'H_Dist_Roadways', 'H_Dist_Fire_Points', 'H_Dist_Hydrology',
            'V_Dist_Hydrology']

# features related to wilderness area
wilderness = ['Wilderness_Area_0', 'Wilderness_Area_1', 'Wilderness_Area_2',
              'Wilderness_Area_3']

# features related to soil type (positional columns 14-53)
soil = df.columns[14:54]

# Copy of the full dataframe so the original data is preserved
df_trimmed = df.copy()

# Numerical features that get trimmed, class by class
numerical_features = sunlight + distance

# Half-width of the keep-window, in standard deviations around the class mean
n_std = 1.5

print("=== TRIMMING NUMERICAL FEATURES TO 1.5 STANDARD DEVIATIONS ===")
print("=" * 70)

trim_classes = []     # one trimmed dataframe per class, in class order 1..7
trimming_stats = []   # one dict per (class, feature) describing the bounds used

for class_label in range(1, 8):
    print(f"\nClass {class_label}...")

    # Rows of the current class, re-indexed from 0. reset_index() returns a
    # new frame, so the filtered slice is never modified in place.
    df_class = df[df['Cover_Type'] == class_label].reset_index(drop=True)
    original_count = len(df_class)

    # Running result: every feature's filter is applied on top of the
    # previous ones, while the bounds always come from the untrimmed class.
    df_class_trimmed = df_class.copy()

    for feature in numerical_features:
        values = df_class[feature]
        mean_val = values.mean()
        std_val = values.std()

        # Bounds are mean ± n_std * std, with the lower bound clamped at 0.
        # NOTE(review): the clamp discards every negative value of a feature;
        # confirm that no trimmed feature is expected to go below zero.
        lower_bound = mean_val - (n_std * std_val)
        if lower_bound < 0:
            lower_bound = 0
        upper_bound = mean_val + (n_std * std_val)

        # Per-feature outlier counts, measured on the untrimmed class
        outliers_below = int((values < lower_bound).sum())
        outliers_above = int((values > upper_bound).sum())
        total_outliers = outliers_below + outliers_above

        # Trim the running result
        in_bounds = (
            (df_class_trimmed[feature] >= lower_bound)
            & (df_class_trimmed[feature] <= upper_bound)
        )
        df_class_trimmed = df_class_trimmed[in_bounds]

        # removed_count is cumulative: it includes rows dropped by earlier features
        trimmed_count = len(df_class_trimmed)
        removed_count = original_count - trimmed_count

        # An empty class would otherwise raise ZeroDivisionError
        removed_pct = removed_count / original_count * 100 if original_count else 0.0
        outlier_pct = total_outliers / original_count * 100 if original_count else 0.0

        print(f"{feature:25s}: {lower_bound:8.2f} to {upper_bound:8.2f}")
        print(f"{'':25s}  Removed: {removed_count:,} obs ({removed_pct:.2f}%)")

        # Store statistics; 'Class' tells apart the rows of the seven classes
        trimming_stats.append({
            'Feature': feature,
            'Mean': mean_val,
            'Std': std_val,
            'Lower_Bound': lower_bound,
            'Upper_Bound': upper_bound,
            'Outliers_Below': outliers_below,
            'Outliers_Above': outliers_above,
            'Total_Outliers': total_outliers,
            'Outlier_Percentage': outlier_pct,
            'Class': class_label,
        })

    # Share of this class that survived trimming (the per-class frames are
    # compared; the untouched full-dataset copy would always give 100%).
    retained_pct = len(df_class_trimmed) / original_count * 100 if original_count else 0.0

    print("\n" + "=" * 70)
    print("TRIMMING SUMMARY")
    print("=" * 70)
    print(f"Original dataset size: {original_count:,} observations")
    print(f"Trimmed dataset size:  {len(df_class_trimmed):,} observations")
    print(f"Total removed:         {original_count - len(df_class_trimmed):,} observations")
    print(f"Percentage retained:   {retained_pct:.2f}%")

    trim_classes.append(df_class_trimmed)



### Sunlight Features

In [None]:
# Grid of histograms: one row per class, one column per sunlight feature
fig, axes = plt.subplots(7, 5, figsize=(20, 40))
fig.suptitle('Sunlight Features by Class', fontsize=30)

# subtitle underneath suptitle
fig.text(0.5, 0.96, 'Classes 1-7 displayed top-to-bottom', ha='center', fontsize=18, style='italic')

colors = sns.color_palette("husl", 7)  # one distinct color per class

# (substring of the feature name, x-axis range, x-axis label).
# The Slope label states the 0-60 range that is actually applied to the axis.
axis_specs = [
    ('Hillshade', (0, 255), "Hillshade Index (0-255)"),
    ('Aspect', (0, 360), "Degrees Azimuth (0-360)"),
    ('Slope', (0, 60), "Degrees (0-60)"),
]

# Histogram of every sunlight feature for every trimmed class
for class_idx, trim_class in enumerate(trim_classes):

    for col, feature in enumerate(sunlight):
        ax = axes[class_idx, col]

        sns.histplot(data=trim_class, x=feature, bins=30,
                     ax=ax, color=colors[class_idx])
        ax.set_title(f'{feature}')

        # Fixed x-range per feature family so the rows are directly comparable
        for key, x_range, x_label in axis_specs:
            if key in feature:
                ax.set_xlim(*x_range)
                ax.set_xlabel(x_label)

# formatting
plt.tight_layout()  # plots won't overlap
plt.subplots_adjust(top=0.93)  # make room for suptitle
plt.show()


In [None]:
# Scatterplot grid: two rows of five plots per class (14 rows in total)
fig, axes = plt.subplots(14, 5, figsize=(20, 40))
fig.suptitle('Sunlight Feature Relationships by Class', fontsize=30)

# subtitle underneath suptitle
fig.text(0.5, 0.96, 'Classes 1-7 displayed top-to-bottom', ha='center', fontsize=18, style='italic')
colors = sns.color_palette("husl", 7)  # one distinct color per class

# Every unordered pair of sunlight features, as (x-feature, y-feature)
feature_pairs = [
    ('Aspect', 'Slope'),
    ('Aspect', 'Hillshade_9am'),
    ('Aspect', 'Hillshade_Noon'),
    ('Aspect', 'Hillshade_3pm'),
    ('Slope', 'Hillshade_9am'),
    ('Slope', 'Hillshade_Noon'),
    ('Slope', 'Hillshade_3pm'),
    ('Hillshade_9am', 'Hillshade_Noon'),
    ('Hillshade_9am', 'Hillshade_3pm'),
    ('Hillshade_Noon', 'Hillshade_3pm'),
]

# Draw the ten pair plots of each trimmed class into that class's two rows
for class_idx, class_frame in enumerate(trim_classes):
    first_row = class_idx * 2

    for pair_idx, (x_feature, y_feature) in enumerate(feature_pairs):
        # Pairs 0-4 fill the class's first row, pairs 5-9 its second row
        row_offset, col = divmod(pair_idx, 5)
        ax = axes[first_row + row_offset, col]

        sns.scatterplot(data=class_frame, x=x_feature, y=y_feature,
                        ax=ax, color=colors[class_idx], alpha=0.6, s=10)
        ax.set_title(f'{x_feature} vs {y_feature}')

# formatting
plt.tight_layout()  # plots won't overlap
plt.subplots_adjust(top=0.93)  # make room for suptitle
plt.show()


### Distance Features

In [None]:
# Grid of histograms: one row per class, one column per distance feature
fig, axes = plt.subplots(7, 5, figsize=(20, 40))
fig.suptitle('Distance Features by Class', fontsize=30)

# subtitle underneath suptitle
fig.text(0.5, 0.96, 'Classes 1-7 displayed top-to-bottom', ha='center', fontsize=18, style='italic')

# Defined here so the cell does not depend on a palette left over from
# another cell having been run first.
colors = sns.color_palette("husl", 7)  # one distinct color per class

# Fixed x-axis range per distance feature, keyed by a substring of its name.
# NOTE(review): the 0 lower limit for V_Dist_Hydrology hides any negative
# values - confirm that this column cannot go below zero.
x_limits = {
    'Elevation': (1500, 3800),
    'Roadways': (0, 7000),
    'Fire': (0, 7000),
    'H_Dist_Hydrology': (0, 1400),
    'V_Dist_Hydrology': (0, 500),
}

for class_idx in range(7):
    # Filter once per class rather than once per subplot
    data_covertype = df[df['Cover_Type'] == class_idx + 1]

    for col, feature in enumerate(distance):
        ax = axes[class_idx, col]

        sns.histplot(data=data_covertype, x=feature, bins=30,
                     ax=ax, color=colors[class_idx])
        ax.set_title(f'{feature}')

        # Same x-range in every row so the classes can be compared visually
        for key, x_range in x_limits.items():
            if key in feature:
                ax.set_xlim(*x_range)
                ax.set_xlabel("Meters (m)")

plt.tight_layout()
plt.subplots_adjust(top=0.93)  # make room for suptitle
plt.show()


In [None]:
# Scatterplot grid: two rows of five plots per class (14 rows in total)
fig, axes = plt.subplots(14, 5, figsize=(20, 40))
fig.suptitle('Distance Feature Relationships by Class', fontsize=30)

# subtitle underneath suptitle
fig.text(0.5, 0.96, 'Classes 1-7 displayed top-to-bottom', ha='center', fontsize=18, style='italic')
colors = sns.color_palette("husl", 7)  # one distinct color per class

# Every unordered pair of distance features, as (x-feature, y-feature)
feature_pairs = [
    ('Elevation', 'H_Dist_Roadways'),
    ('Elevation', 'H_Dist_Fire_Points'),
    ('Elevation', 'H_Dist_Hydrology'),
    ('Elevation', 'V_Dist_Hydrology'),
    ('H_Dist_Roadways', 'H_Dist_Fire_Points'),
    ('H_Dist_Roadways', 'H_Dist_Hydrology'),
    ('H_Dist_Roadways', 'V_Dist_Hydrology'),
    ('H_Dist_Fire_Points', 'H_Dist_Hydrology'),
    ('H_Dist_Fire_Points', 'V_Dist_Hydrology'),
    ('H_Dist_Hydrology', 'V_Dist_Hydrology'),
]

# Draw the ten pair plots of each class (untrimmed data) into its two rows
for class_idx in range(7):
    cover_data = df[df['Cover_Type'] == class_idx + 1]
    first_row = class_idx * 2

    for pair_idx, (x_feature, y_feature) in enumerate(feature_pairs):
        # Pairs 0-4 fill the class's first row, pairs 5-9 its second row
        row_offset, col = divmod(pair_idx, 5)
        ax = axes[first_row + row_offset, col]

        sns.scatterplot(data=cover_data, x=x_feature, y=y_feature,
                        ax=ax, color=colors[class_idx], alpha=0.6, s=10)
        ax.set_title(f'{x_feature} vs {y_feature}')

# formatting
plt.tight_layout()  # plots won't overlap
plt.subplots_adjust(top=0.93)  # make room for suptitle
plt.show()


### Sunlight and Distance Features

In [None]:
# Scatterplot grid: one row per class, one column per distance feature
fig, axes = plt.subplots(7, 5, figsize=(20, 40))
fig.suptitle('Aspect and Distance Feature Relationships by Class', fontsize=30)

# subtitle underneath suptitle
fig.text(0.5, 0.96, 'Classes 1-7 displayed top-to-bottom', ha='center', fontsize=18, style='italic')
colors = sns.color_palette("husl", 7)  # one distinct color per class

# Aspect on the x-axis against each distance feature on the y-axis
feature_pairs = [
    ('Aspect', 'Elevation'),
    ('Aspect', 'H_Dist_Roadways'),
    ('Aspect', 'H_Dist_Fire_Points'),
    ('Aspect', 'H_Dist_Hydrology'),
    ('Aspect', 'V_Dist_Hydrology'),
]

# Fixed axis ranges keyed by a substring of the feature name. The first
# matching key wins, exactly like an if/elif chain in the same order.
x_limits = {
    'Hillshade': (0, 255),
    'Aspect': (0, 360),
    'Slope': (0, 60),
}
y_limits = {
    'Elevation': (1500, 3800),
    'Roadways': (0, 7000),
    'Fire': (0, 7000),
    'H_Dist_Hydrology': (0, 1400),
    'V_Dist_Hydrology': (0, 500),
}

for class_idx in range(7):
    cover_data = df[df['Cover_Type'] == class_idx + 1]

    # Five pairs fill exactly one row, so pair index == column index
    for col, (x_feature, y_feature) in enumerate(feature_pairs):
        ax = axes[class_idx, col]

        sns.scatterplot(data=cover_data, x=x_feature, y=y_feature,
                        ax=ax, color=colors[class_idx], alpha=0.6, s=10)
        ax.set_title(f'{x_feature} vs {y_feature}')

        # Consistent limits in every row so classes can be compared visually
        x_range = next((lim for key, lim in x_limits.items() if key in x_feature), None)
        if x_range is not None:
            ax.set_xlim(*x_range)

        y_range = next((lim for key, lim in y_limits.items() if key in y_feature), None)
        if y_range is not None:
            ax.set_ylim(*y_range)

# formatting
plt.tight_layout()  # plots won't overlap
plt.subplots_adjust(top=0.93)  # make room for suptitle
plt.show()


In [None]:
# Scatterplot grid: one row per class, one column per distance feature
fig, axes = plt.subplots(7, 5, figsize=(20, 40))
# The title names Slope, the feature that is actually on the x-axis here
fig.suptitle('Slope and Distance Feature Relationships by Class', fontsize=30)

# subtitle underneath suptitle
fig.text(0.5, 0.96, 'Classes 1-7 displayed top-to-bottom', ha='center', fontsize=18, style='italic')
colors = sns.color_palette("husl", 7)  # one distinct color per class

# Slope on the x-axis against each distance feature on the y-axis
feature_pairs = [
    ('Slope', 'Elevation'),
    ('Slope', 'H_Dist_Roadways'),
    ('Slope', 'H_Dist_Fire_Points'),
    ('Slope', 'H_Dist_Hydrology'),
    ('Slope', 'V_Dist_Hydrology'),
]

# Fixed axis ranges keyed by a substring of the feature name; the first
# matching key wins.
x_limits = {
    'Hillshade': (0, 255),
    'Aspect': (0, 360),
    'Slope': (0, 60),
}
y_limits = {
    'Elevation': (1500, 3800),
    'Roadways': (0, 7000),
    'Fire': (0, 7000),
    'H_Dist_Hydrology': (0, 1400),
    'V_Dist_Hydrology': (0, 500),
}

for class_idx in range(7):
    # Filter once per class rather than once per subplot
    cover_data = df[df['Cover_Type'] == class_idx + 1]

    # Five pairs fill exactly one row, so pair index == column index
    for col, (x_feature, y_feature) in enumerate(feature_pairs):
        ax = axes[class_idx, col]

        sns.scatterplot(data=cover_data, x=x_feature, y=y_feature,
                        ax=ax, color=colors[class_idx], alpha=0.6, s=10)
        ax.set_title(f'{x_feature} vs {y_feature}')

        # Limits are set for the pair just drawn, so a stale pair from a
        # previous iteration can never be applied to the wrong subplot.
        x_range = next((lim for key, lim in x_limits.items() if key in x_feature), None)
        if x_range is not None:
            ax.set_xlim(*x_range)

        y_range = next((lim for key, lim in y_limits.items() if key in y_feature), None)
        if y_range is not None:
            ax.set_ylim(*y_range)

# formatting
plt.tight_layout()  # plots won't overlap
plt.subplots_adjust(top=0.93)  # make room for suptitle
plt.show()


In [None]:
# Scatterplot grid: one row per class, one column per distance feature
fig, axes = plt.subplots(7, 5, figsize=(20, 40))
# The title names Hillshade 9am, the feature that is actually on the x-axis here
fig.suptitle('Hillshade 9am and Distance Feature Relationships by Class', fontsize=30)

# subtitle underneath suptitle
fig.text(0.5, 0.96, 'Classes 1-7 displayed top-to-bottom', ha='center', fontsize=18, style='italic')
colors = sns.color_palette("husl", 7)  # one distinct color per class

# Hillshade_9am on the x-axis against each distance feature on the y-axis
feature_pairs = [
    ('Hillshade_9am', 'Elevation'),
    ('Hillshade_9am', 'H_Dist_Roadways'),
    ('Hillshade_9am', 'H_Dist_Fire_Points'),
    ('Hillshade_9am', 'H_Dist_Hydrology'),
    ('Hillshade_9am', 'V_Dist_Hydrology'),
]

# Fixed axis ranges keyed by a substring of the feature name; the first
# matching key wins.
x_limits = {
    'Hillshade': (0, 255),
    'Aspect': (0, 360),
    'Slope': (0, 60),
}
y_limits = {
    'Elevation': (1500, 3800),
    'Roadways': (0, 7000),
    'Fire': (0, 7000),
    'H_Dist_Hydrology': (0, 1400),
    'V_Dist_Hydrology': (0, 500),
}

for class_idx in range(7):
    # Filter once per class rather than once per subplot
    cover_data = df[df['Cover_Type'] == class_idx + 1]

    # Five pairs fill exactly one row, so pair index == column index
    for col, (x_feature, y_feature) in enumerate(feature_pairs):
        ax = axes[class_idx, col]

        sns.scatterplot(data=cover_data, x=x_feature, y=y_feature,
                        ax=ax, color=colors[class_idx], alpha=0.6, s=10)
        ax.set_title(f'{x_feature} vs {y_feature}')

        # Limits are set for the pair just drawn, so a stale pair from a
        # previous iteration can never be applied to the wrong subplot.
        x_range = next((lim for key, lim in x_limits.items() if key in x_feature), None)
        if x_range is not None:
            ax.set_xlim(*x_range)

        y_range = next((lim for key, lim in y_limits.items() if key in y_feature), None)
        if y_range is not None:
            ax.set_ylim(*y_range)

# formatting
plt.tight_layout()  # plots won't overlap
plt.subplots_adjust(top=0.93)  # make room for suptitle
plt.show()


In [None]:
# Scatterplot grid: one row per class, one column per distance feature
fig, axes = plt.subplots(7, 5, figsize=(20, 40))
# The title names Hillshade Noon, the feature that is actually on the x-axis here
fig.suptitle('Hillshade Noon and Distance Feature Relationships by Class', fontsize=30)

# subtitle underneath suptitle
fig.text(0.5, 0.96, 'Classes 1-7 displayed top-to-bottom', ha='center', fontsize=18, style='italic')
colors = sns.color_palette("husl", 7)  # one distinct color per class

# Hillshade_Noon on the x-axis against each distance feature on the y-axis
feature_pairs = [
    ('Hillshade_Noon', 'Elevation'),
    ('Hillshade_Noon', 'H_Dist_Roadways'),
    ('Hillshade_Noon', 'H_Dist_Fire_Points'),
    ('Hillshade_Noon', 'H_Dist_Hydrology'),
    ('Hillshade_Noon', 'V_Dist_Hydrology'),
]

# Fixed axis ranges keyed by a substring of the feature name; the first
# matching key wins.
x_limits = {
    'Hillshade': (0, 255),
    'Aspect': (0, 360),
    'Slope': (0, 60),
}
y_limits = {
    'Elevation': (1500, 3800),
    'Roadways': (0, 7000),
    'Fire': (0, 7000),
    'H_Dist_Hydrology': (0, 1400),
    'V_Dist_Hydrology': (0, 500),
}

for class_idx in range(7):
    # Filter once per class rather than once per subplot
    cover_data = df[df['Cover_Type'] == class_idx + 1]

    # Five pairs fill exactly one row, so pair index == column index
    for col, (x_feature, y_feature) in enumerate(feature_pairs):
        ax = axes[class_idx, col]

        sns.scatterplot(data=cover_data, x=x_feature, y=y_feature,
                        ax=ax, color=colors[class_idx], alpha=0.6, s=10)
        ax.set_title(f'{x_feature} vs {y_feature}')

        # Limits are set for the pair just drawn, so a stale pair from a
        # previous iteration can never be applied to the wrong subplot.
        x_range = next((lim for key, lim in x_limits.items() if key in x_feature), None)
        if x_range is not None:
            ax.set_xlim(*x_range)

        y_range = next((lim for key, lim in y_limits.items() if key in y_feature), None)
        if y_range is not None:
            ax.set_ylim(*y_range)

# formatting
plt.tight_layout()  # plots won't overlap
plt.subplots_adjust(top=0.93)  # make room for suptitle
plt.show()


In [None]:
# Scatterplot grid: one row per class, one column per distance feature
fig, axes = plt.subplots(7, 5, figsize=(20, 40))
# The title names Hillshade 3pm, the feature that is actually on the x-axis here
fig.suptitle('Hillshade 3pm and Distance Feature Relationships by Class', fontsize=30)

# subtitle underneath suptitle
fig.text(0.5, 0.96, 'Classes 1-7 displayed top-to-bottom', ha='center', fontsize=18, style='italic')
colors = sns.color_palette("husl", 7)  # one distinct color per class

# Hillshade_3pm on the x-axis against each distance feature on the y-axis
feature_pairs = [
    ('Hillshade_3pm', 'Elevation'),
    ('Hillshade_3pm', 'H_Dist_Roadways'),
    ('Hillshade_3pm', 'H_Dist_Fire_Points'),
    ('Hillshade_3pm', 'H_Dist_Hydrology'),
    ('Hillshade_3pm', 'V_Dist_Hydrology'),
]

# Fixed axis ranges keyed by a substring of the feature name; the first
# matching key wins.
x_limits = {
    'Hillshade': (0, 255),
    'Aspect': (0, 360),
    'Slope': (0, 60),
}
y_limits = {
    'Elevation': (1500, 3800),
    'Roadways': (0, 7000),
    'Fire': (0, 7000),
    'H_Dist_Hydrology': (0, 1400),
    'V_Dist_Hydrology': (0, 500),
}

for class_idx in range(7):
    # Filter once per class rather than once per subplot
    cover_data = df[df['Cover_Type'] == class_idx + 1]

    # Five pairs fill exactly one row, so pair index == column index
    for col, (x_feature, y_feature) in enumerate(feature_pairs):
        ax = axes[class_idx, col]

        sns.scatterplot(data=cover_data, x=x_feature, y=y_feature,
                        ax=ax, color=colors[class_idx], alpha=0.6, s=10)
        ax.set_title(f'{x_feature} vs {y_feature}')

        # Limits are set for the pair just drawn, so a stale pair from a
        # previous iteration can never be applied to the wrong subplot.
        x_range = next((lim for key, lim in x_limits.items() if key in x_feature), None)
        if x_range is not None:
            ax.set_xlim(*x_range)

        y_range = next((lim for key, lim in y_limits.items() if key in y_feature), None)
        if y_range is not None:
            ax.set_ylim(*y_range)

# formatting
plt.tight_layout()  # plots won't overlap
plt.subplots_adjust(top=0.93)  # make room for suptitle
plt.show()


### Wilderness Area Features

In [None]:
# Grid of count plots: one row per class, one column per wilderness area flag
fig, axes = plt.subplots(7, 4, figsize=(20, 40))
fig.suptitle('Wilderness Area Features by Class', fontsize=30)

# subtitle underneath suptitle
fig.text(0.5, 0.96, 'Classes 1-7 displayed top-to-bottom', ha='center', fontsize=18, style='italic')

# Defined here so the cell does not depend on a palette left over from
# another cell having been run first.
colors = sns.color_palette("husl", 7)  # one distinct color per class

# dictionary assigning representative labels for categorical values
categorical_map = {'binary': {0: 'No', 1: 'Yes'}}

for class_idx in range(7):
    # Section off the current class once, not once per subplot
    class_data = df[df['Cover_Type'] == class_idx + 1]

    for col, feature in enumerate(wilderness):
        ax = axes[class_idx, col]

        # Map the 0/1 flag to 'No'/'Yes' so the bars get readable tick labels
        renamed_data = class_data[feature].map(categorical_map['binary'])

        # order= keeps both bars in a fixed position even when one count is zero
        sns.countplot(x=renamed_data, ax=ax, color=colors[class_idx], order=['No', 'Yes'])
        ax.set_title(f'{feature}')
        ax.set_xlabel("Wilderness Status")

plt.tight_layout()
plt.subplots_adjust(top=0.95)  # make room for suptitle
plt.show()


### Soil Type Features

In [None]:
# Layout: 40 soil flags per class at 10 plots per row -> 4 rows per class,
# 7 classes -> 28 rows in total.
soil_cols_per_row = 10
soil_rows_per_class = 4

fig, axes = plt.subplots(28, 10, figsize=(20, 80))  # 28 rows x 10 columns of soil type plots
fig.suptitle('Distribution of Soil Type Features', fontsize=30)

# subtitle underneath suptitle
fig.text(0.5, 0.96, 'Classes 1-7 displayed top-to-bottom', ha='center', fontsize=18, style='italic')

# Defined here so the cell does not depend on names left over from other
# cells having been run first.
colors = sns.color_palette("husl", 7)  # one distinct color per class
categorical_map = {'binary': {0: 'No', 1: 'Yes'}}

for class_idx in range(7):  # for each class type
    # Section off the current class once, not once per subplot
    class_data = df[df['Cover_Type'] == class_idx + 1]
    first_row = class_idx * soil_rows_per_class

    for soil_idx, feature in enumerate(soil):
        # Position of this soil type inside the class's block of rows
        row_offset, col = divmod(soil_idx, soil_cols_per_row)
        ax = axes[first_row + row_offset, col]

        # Map the 0/1 flag to 'No'/'Yes' so the bars get readable tick labels
        renamed_data = class_data[feature].map(categorical_map['binary'])

        # order= keeps both bars in a fixed position even when one count is zero
        sns.countplot(x=renamed_data, ax=ax, color=colors[class_idx], order=['No', 'Yes'])

        ax.set_title(f'{feature}')
        ax.set_xlabel("Soil Type Status")

plt.tight_layout()
plt.subplots_adjust(top=0.95)  # make room for suptitle
plt.show()


### Removing Soil Types

In [None]:
# Trimmed per-class dataframes with every soil type column dropped
trim_classes_no_soil = []

for class_number, class_frame in enumerate(trim_classes, start=1):
    feature_count_before = class_frame.shape[1]
    print(f"\nClass {class_number}:")
    print(f"  Before removal: {feature_count_before} features")

    # drop() returns a new frame; the trimmed class itself is left untouched
    without_soil = class_frame.drop(columns=soil)
    n_rows, n_features = without_soil.shape

    print(f"  After removal:  {n_features} features")
    print(f"  Observations:   {n_rows:,}")

    trim_classes_no_soil.append(without_soil)

# List the columns that survived, using the first class as the sample
print(f"\nVerification - Remaining features in Class 1:")
remaining_features = trim_classes_no_soil[0].columns.tolist()
for column_name in remaining_features:
    print(f"  • {column_name}")
