### Random Forest Crocodile Matcher (RF_croc_matcher.ipynb) 

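This notebook builds a Random Forest model to classify the **Conservation Status** of crocodile observations from `crocodile_dataset.csv`. Columns that directly identify the species are removed, hyperparameters are tuned with a cross-validated grid search, and the best model is evaluated on a held-out 20% test set.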

In [None]:
# Core data tools
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Model and training utilities
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score, log_loss, classification_report, confusion_matrix, roc_auc_score
)

# Model
from sklearn.ensemble import RandomForestClassifier

#### 1. Dataset Overview & Missing Values

In [None]:
# Read the raw observations from disk
croc_df = pd.read_csv("crocodile_dataset.csv")

# Rich preview of the first records
display(croc_df.head())

# Column dtypes and non-null counts
print("\nDataset Info:")
croc_df.info()

# Number of missing entries per column
missing_per_column = croc_df.isna().sum()
print("\nMissing values:")
print(missing_per_column)

#### 2. Target Distribution

In [None]:
# Tabulate how many observations fall into each conservation category
status_counts = croc_df["Conservation Status"].value_counts()
print("Conservation Status counts:")
print(status_counts)

# Same distribution as a bar chart, drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=croc_df, x="Conservation Status", ax=ax)
ax.set_title("Distribution of Conservation Status")
ax.tick_params(axis="x", labelrotation=45)  # tilt category labels so they do not overlap
fig.tight_layout()
plt.show()

#### 3. Data Cleaning

In [None]:
# Columns excluded from modelling: identifiers, free text, and anything
# that gives away the species directly
drop_cols = [
    "Observation ID",     # arbitrary identifier, carries no signal
    "Observer Name",      # unrelated to conservation status
    "Notes",              # free text, not used in this project
    "Scientific Name",    # species giveaway
    "Common Name",        # species giveaway
    "Family",             # tied to species identity
    "Genus"               # tied to species identity
]

# Filter out the rows belonging to the disputed species
is_disputed = croc_df["Common Name"] == "Borneo Crocodile (disputed)"
croc_df = croc_df[~is_disputed]

# Working frame without the excluded columns
clean_df = croc_df.drop(drop_cols, axis=1)

clean_df.head()

#### 4. Process Date & Define Features/Target

In [None]:
# Derive the observation year and discard the raw date string.
#
# The guard makes this cell safe to re-run: once "Date of Observation" has
# been replaced by "Year", a second execution skips the conversion instead of
# failing with a KeyError on the already-removed column.
if "Date of Observation" in clean_df.columns:
    # Dates are stored as day-month-year strings
    observed_on = pd.to_datetime(clean_df["Date of Observation"], format="%d-%m-%Y")

    clean_df = (
        clean_df
        # Keep the year only (day/month are not used for this prediction task)
        .assign(Year=observed_on.dt.year)
        # "Date" is listed as well so a helper column left behind by an
        # earlier, partially executed run is removed too
        .drop(columns=["Date", "Date of Observation"], errors="ignore")
    )

# Define target (y) and features (X)
target = "Conservation Status"
y = clean_df[target]
X = clean_df.drop(columns=[target])

clean_df.head()

#### 5. Train / Test Split & Preprocessing

In [None]:
# Feature groups; each group is routed to its own preprocessing step
num_cols = ["Observed Length (m)", "Observed Weight (kg)", "Year"]
cat_cols = ["Age Class", "Sex", "Country/Region", "Habitat Type"]

# Numeric features are standardised
numeric_step = ("num", StandardScaler(), num_cols)
# Categorical features are one-hot encoded; handle_unknown="ignore" keeps
# prediction from failing on categories that were not seen during fitting
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

# Bundle both steps so they are fitted together inside the model pipeline
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Hold out 20% of the rows for testing, keeping class proportions equal
# in both partitions via stratification on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

#### 6. Random Forest Pipeline & Tuning Grid

In [None]:
# Classifier that sits at the end of the pipeline (seeded for reproducibility)
forest = RandomForestClassifier(random_state=42)

# Preprocessing runs first, then the forest is fitted on the transformed features
rf_model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", forest),
])

# Candidate values explored by the grid search; the "model__" prefix routes
# each setting to the pipeline step named "model"
param_grid = {
    "model__n_estimators": [100, 200, 300],   # size of the ensemble
    "model__max_depth": [None, 5, 10, 20],    # depth limit per tree (None = unlimited)
    "model__max_features": ["sqrt", "log2"],  # features sampled at each split
    "model__min_samples_split": [2, 5, 10],   # samples required to split a node
    "model__min_samples_leaf": [1, 2, 4]      # samples required in a leaf
}

#### 7. Hyperparameter Tuning with GridSearchCV

In [None]:
# Search settings: 5-fold cross-validation scored by (negated) log-loss,
# using every available core and printing brief progress
search_options = dict(
    cv=5,
    scoring="neg_log_loss",
    n_jobs=-1,
    verbose=1,
)

rf_grid = GridSearchCV(rf_model, param_grid, **search_options)

# Run the search on the training partition only
rf_grid.fit(X_train, y_train)

# Best hyperparameter combination found by the search
rf_grid.best_params_

#### 8. Model Evaluation

In [None]:
# Retrieve best model from grid search
best_rf = rf_grid.best_estimator_

# The columns of predict_proba follow best_rf.classes_. Passing that same
# order to the probabilistic metrics keeps labels and probability columns
# aligned explicitly instead of relying on the labels present in y_test.
class_labels = best_rf.classes_

# Predict classes and prediction probabilities
y_pred = best_rf.predict(X_test)
y_proba = best_rf.predict_proba(X_test)

# Accuracy, ROC AUC, and log-loss metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "ROC-AUC (OVR):",
    roc_auc_score(y_test, y_proba, multi_class="ovr", labels=class_labels),
)
# Without `labels`, log_loss infers the classes from y_test and raises when a
# rare class is absent from the test split.
print("Log Loss:", log_loss(y_test, y_proba, labels=class_labels))

# Detailed per-class performance
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

#### 9. Confusion Matrix

In [None]:
# Use the model's own class order so the matrix always has one row/column per
# class, even if a class is missing from the test labels or predictions
class_labels = best_rf.classes_

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot heatmap for easier interpretation; ticks show class names rather than
# positional indices
plt.figure(figsize=(8,6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.xticks(rotation=45, ha="right")  # long status names would overlap otherwise
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

#### 10. Feature Importance Analysis

In [None]:
# Extract trained RF model and fitted one-hot encoder
rf_clf = best_rf.named_steps["model"]
ohe = best_rf.named_steps["preprocessor"].named_transformers_["cat"]

# Get names of encoded categorical features
cat_features = ohe.get_feature_names_out(cat_cols)

# Combine numeric + encoded categorical names. This mirrors the transformer
# order in the ColumnTransformer ("num" first, then "cat").
all_features = num_cols + list(cat_features)

# Extract feature importance scores from Random Forest
importances = rf_clf.feature_importances_

# The names above are assembled by hand, so verify they line up with the
# importance vector before reporting anything
assert len(all_features) == len(importances), (
    "feature names and importances are misaligned; "
    "check the ColumnTransformer column order"
)

# Number of features to report (never more than the model actually has)
top_n = min(15, len(importances))

# Indices of the most important features, highest first
indices = np.argsort(importances)[::-1][:top_n]

print(f"Top {top_n} Important Features:\n")
for idx in indices:
    print(all_features[idx], ":", importances[idx])

# Plot the same top features as a horizontal bar chart
top_names = [all_features[i] for i in indices]

plt.figure(figsize=(10,5))
plt.barh(top_names, importances[indices])
plt.xlabel("Feature Importance")
plt.title(f"Top {top_n} Most Important Features")
plt.gca().invert_yaxis()   # highest at top
plt.show()