In [None]:
import pandas as pd

In [None]:
# Location of the DIDSON fish survey CSV (path looks like a Colab Google Drive mount — adjust when running elsewhere).
DATA_CSV_PATH = '/content/drive/MyDrive/dataset/SI_DIDSON_fish_data_2015_2016.csv'
df = pd.read_csv(DATA_CSV_PATH)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,id,date,location,site,habitat,transect_no,size_class,area_m2,number
0,bocas del toro_almirante_mangrove_3,2016-01-27,bocas,almirante,mangrove,3,112.5,195.45,4
1,bocas del toro_almirante_mangrove_3,2016-01-27,bocas,almirante,mangrove,3,137.5,195.45,1
2,bocas del toro_almirante_mangrove_3,2016-01-27,bocas,almirante,mangrove,3,162.5,195.45,1
3,bocas del toro_almirante_mangrove_3,2016-01-27,bocas,almirante,mangrove,3,187.5,195.45,0
4,bocas del toro_almirante_mangrove_3,2016-01-27,bocas,almirante,mangrove,3,212.5,195.45,0


In [None]:
# Same five-row preview as the previous cell (5 is the default row count of head()).
df.head(5)

Unnamed: 0,id,date,location,site,habitat,transect_no,size_class,area_m2,number
0,bocas del toro_almirante_mangrove_3,2016-01-27,bocas,almirante,mangrove,3,112.5,195.45,4
1,bocas del toro_almirante_mangrove_3,2016-01-27,bocas,almirante,mangrove,3,137.5,195.45,1
2,bocas del toro_almirante_mangrove_3,2016-01-27,bocas,almirante,mangrove,3,162.5,195.45,1
3,bocas del toro_almirante_mangrove_3,2016-01-27,bocas,almirante,mangrove,3,187.5,195.45,0
4,bocas del toro_almirante_mangrove_3,2016-01-27,bocas,almirante,mangrove,3,212.5,195.45,0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from datetime import datetime

# Feature engineering (assumes `df` has already been loaded by an earlier cell).
# Parsing the date and deriving day-of-year are both safe to re-run.
df['date'] = pd.to_datetime(df['date'])
df['day_of_year'] = df['date'].dt.dayofyear

# Encode categorical variables using dictionary mappings (label -> integer code,
# numbered in order of first appearance).
#
# The encoded values overwrite df[col] because the modelling code below selects
# those column names. To keep this cell idempotent, the original labels are
# preserved once in "<col>_raw" and the mapping is always rebuilt from that copy.
# Without this, re-running the cell would build the mapping from the
# already-encoded integers and yield a useless identity mapping such as
# {0: 0, 1: 1, ...}, losing the string labels.
label_mappings = {}
for col in ['location', 'site', 'habitat']:
    raw_col = f"{col}_raw"
    if raw_col not in df.columns:
        df[raw_col] = df[col]
    unique_values = df[raw_col].unique()
    label_mappings[col] = {value: idx for idx, value in enumerate(unique_values)}
    df[col] = df[raw_col].map(label_mappings[col])

# **Model 1: Predict "site"**
# Random-forest classifier that predicts the encoded site from the encoded
# location and the day of year; 20% of the rows are held out for evaluation.
site_features = ['location', 'day_of_year']
X_site, y_site = df[site_features], df['site']
X_train_site, X_test_site, y_train_site, y_test_site = train_test_split(
    X_site,
    y_site,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator itself, so construction and training chain.
model_site = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_site, y_train_site
)

# Accuracy on the held-out rows.
y_pred_site = model_site.predict(X_test_site)
site_accuracy = accuracy_score(y_test_site, y_pred_site)
print("Site Prediction Accuracy:", site_accuracy)

# **Model 2: Predict "habitat"**
# Random-forest classifier for the encoded habitat. It is trained on the true
# encoded `site` column from df; predict_fish_population() later feeds it the
# site predicted by Model 1 instead.
habitat_features = ['location', 'day_of_year', 'site']
X_habitat, y_habitat = df[habitat_features], df['habitat']
X_train_habitat, X_test_habitat, y_train_habitat, y_test_habitat = train_test_split(
    X_habitat,
    y_habitat,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator itself, so construction and training chain.
model_habitat = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_habitat, y_train_habitat
)

# Accuracy on the held-out rows.
y_pred_habitat = model_habitat.predict(X_test_habitat)
habitat_accuracy = accuracy_score(y_test_habitat, y_pred_habitat)
print("Habitat Prediction Accuracy:", habitat_accuracy)

# **Model 3: Predict "number" using XGBoost**
# Gradient-boosted regressor for the `number` column, tuned with a small grid
# search scored by (negative) mean absolute error over 3 CV folds.
number_features = ['location', 'day_of_year', 'site', 'habitat']
X_number, y_number = df[number_features], df['number']
X_train_number, X_test_number, y_train_number, y_test_number = train_test_split(
    X_number,
    y_number,
    test_size=0.2,
    random_state=42,
)

xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# 2 x 2 x 2 x 2 = 16 hyper-parameter combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5],
    learning_rate=[0.01, 0.05],
    subsample=[0.8, 1.0],
)

grid_search = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train_number, y_train_number)

# Evaluate the best estimator found by the search on the held-out rows.
best_xgb_reg = grid_search.best_estimator_
y_pred_number = best_xgb_reg.predict(X_test_number)

mae = mean_absolute_error(y_test_number, y_pred_number)
print(f"Optimized Fish Shoal Prediction MAE: {mae}")

# **Unified Prediction Function**
def predict_fish_population(location, date):
    """Predict site, habitat and fish-shoal count for a location on a date.

    The three fitted models are chained: the predicted site is fed to the
    habitat model, and both predictions are fed to the shoal-count regressor.
    Reads the module-level ``model_site``, ``model_habitat``, ``best_xgb_reg``
    and ``label_mappings``.

    Args:
        location: Location label, looked up in ``label_mappings['location']``.
        date: Date string in ``YYYY-MM-DD`` format; only its day of year is used.

    Returns:
        A dict with keys ``location``, ``date``, ``predicted_site``,
        ``predicted_habitat`` and ``predicted_number_of_fish_shoals`` (a plain
        Python float rounded to two decimals), or ``None`` when anything fails.
        Failures (e.g. a malformed date) are printed, not raised.
    """
    try:
        day_of_year = datetime.strptime(date, "%Y-%m-%d").timetuple().tm_yday

        # Handle unseen locations
        location_codes = label_mappings['location']
        if location in location_codes:
            encoded_location = location_codes[location]
        else:
            print(f"Warning: Location '{location}' not seen before. Using most common site/habitat.")
            try:
                # Most frequent encoded location in the training frame, when it is loaded.
                encoded_location = df['location'].mode()[0]
            except NameError:
                # No training frame in this session (e.g. only the saved models
                # were loaded): fall back to the lowest known location code
                # instead of failing the whole prediction.
                encoded_location = min(location_codes.values())

        # Reverse lookups (integer code -> original label) for the outputs.
        site_labels = {code: label for label, code in label_mappings['site'].items()}
        habitat_labels = {code: label for label, code in label_mappings['habitat'].items()}

        # Predict Site
        site_features = pd.DataFrame([[encoded_location, day_of_year]],
                                     columns=['location', 'day_of_year'])
        site_pred = model_site.predict(site_features)[0]
        site_label = site_labels.get(site_pred, "Unknown Site")

        # Predict Habitat (uses the site predicted above)
        habitat_features = pd.DataFrame([[encoded_location, day_of_year, site_pred]],
                                        columns=['location', 'day_of_year', 'site'])
        habitat_pred = model_habitat.predict(habitat_features)[0]
        habitat_label = habitat_labels.get(habitat_pred, "Unknown Habitat")

        # Predict Number of Fish Shoals (uses both predictions above)
        number_features = pd.DataFrame([[encoded_location, day_of_year, site_pred, habitat_pred]],
                                       columns=['location', 'day_of_year', 'site', 'habitat'])
        fish_shoal_pred = best_xgb_reg.predict(number_features)[0]

        return {
            "location": location,
            "date": date,
            "predicted_site": site_label,
            "predicted_habitat": habitat_label,
            # The regressor yields a NumPy scalar; convert it so the result is a
            # plain float that prints cleanly and can be JSON-serialised.
            "predicted_number_of_fish_shoals": round(float(fish_shoal_pred), 2)
        }
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with None.
        print("Error during prediction:", e)
        return None


# **Example Prediction**
# Run the chained models once for a known location and an arbitrary date.
example_location, example_date = "bocas", "2025-03-06"
example_prediction = predict_fish_population(example_location, example_date)
print(example_prediction)


Site Prediction Accuracy: 0.48951048951048953
Habitat Prediction Accuracy: 0.7797202797202797
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Optimized Fish Shoal Prediction MAE: 5.429888725280762
{'location': 'bocas', 'date': '2025-03-06', 'predicted_site': 'stri', 'predicted_habitat': 'dock', 'predicted_number_of_fish_shoals': 2.77}


In [None]:
# Inspect the label -> code mapping that was built for the location column.
location_mapping = label_mappings['location']
print(location_mapping)


{0: 0, 1: 1, 2: 2, 3: 3}


In [None]:
import joblib

# Persist the three fitted models and the label mappings, one file each, in the
# current working directory. Dict order fixes the order the files are written in.
artifacts = {
    "random_forest_site.pkl": model_site,
    "random_forest_habitat.pkl": model_habitat,
    "xgboost_fish_shoal.pkl": best_xgb_reg,
    "label_mappings.pkl": label_mappings,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("Models and encoders saved successfully!")


Models and encoders saved successfully!


In [None]:
import joblib
import pandas as pd
from datetime import datetime

# Reload the fitted models and the label mappings written by the save cell.
# NOTE: joblib.load unpickles its input, so only load files you trust.
SITE_MODEL_PATH = "random_forest_site.pkl"
HABITAT_MODEL_PATH = "random_forest_habitat.pkl"
SHOAL_MODEL_PATH = "xgboost_fish_shoal.pkl"
LABEL_MAPPINGS_PATH = "label_mappings.pkl"

model_site = joblib.load(SITE_MODEL_PATH)
model_habitat = joblib.load(HABITAT_MODEL_PATH)
best_xgb_reg = joblib.load(SHOAL_MODEL_PATH)
label_mappings = joblib.load(LABEL_MAPPINGS_PATH)

def predict_fish_population(location, date):
    """Predict site, habitat and fish-shoal count for a location on a date.

    The three reloaded models are chained: the predicted site is fed to the
    habitat model, and both predictions are fed to the shoal-count regressor.
    Reads the module-level ``model_site``, ``model_habitat``, ``best_xgb_reg``
    and ``label_mappings``. The training frame ``df`` is optional: it is used
    for the unseen-location fallback only when it exists in the session.

    Args:
        location: Location label, looked up in ``label_mappings['location']``.
        date: Date string in ``YYYY-MM-DD`` format; only its day of year is used.

    Returns:
        A dict with keys ``location``, ``date``, ``predicted_site``,
        ``predicted_habitat`` and ``predicted_number_of_fish_shoals`` (a plain
        Python float rounded to two decimals), or ``None`` when anything fails.
        Failures (e.g. a malformed date) are printed, not raised.
    """
    try:
        day_of_year = datetime.strptime(date, "%Y-%m-%d").timetuple().tm_yday

        # Handle unseen locations
        location_codes = label_mappings['location']
        if location in location_codes:
            encoded_location = location_codes[location]
        else:
            print(f"Warning: Location '{location}' not seen before. Using most common site/habitat.")
            try:
                # Most frequent encoded location in the training frame, when it is loaded.
                encoded_location = df['location'].mode()[0]
            except NameError:
                # This cell only reloads the saved models and mappings, so `df`
                # may not exist (fresh kernel / deployment). Fall back to the
                # lowest known location code instead of failing the prediction.
                encoded_location = min(location_codes.values())

        # Reverse lookups (integer code -> original label) for the outputs.
        site_labels = {code: label for label, code in label_mappings['site'].items()}
        habitat_labels = {code: label for label, code in label_mappings['habitat'].items()}

        # Predict Site
        site_features = pd.DataFrame([[encoded_location, day_of_year]],
                                     columns=['location', 'day_of_year'])
        site_pred = model_site.predict(site_features)[0]
        site_label = site_labels.get(site_pred, "Unknown Site")

        # Predict Habitat (uses the site predicted above)
        habitat_features = pd.DataFrame([[encoded_location, day_of_year, site_pred]],
                                        columns=['location', 'day_of_year', 'site'])
        habitat_pred = model_habitat.predict(habitat_features)[0]
        habitat_label = habitat_labels.get(habitat_pred, "Unknown Habitat")

        # Predict Number of Fish Shoals (uses both predictions above)
        number_features = pd.DataFrame([[encoded_location, day_of_year, site_pred, habitat_pred]],
                                       columns=['location', 'day_of_year', 'site', 'habitat'])
        fish_shoal_pred = best_xgb_reg.predict(number_features)[0]

        return {
            "location": location,
            "date": date,
            "predicted_site": site_label,
            "predicted_habitat": habitat_label,
            # The regressor yields a NumPy scalar; convert it so the result is a
            # plain float that prints cleanly and can be JSON-serialised.
            "predicted_number_of_fish_shoals": round(float(fish_shoal_pred), 2)
        }
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with None.
        print("Error during prediction:", e)
        return None


# **Example Prediction**
# Same call as in the training cell, this time served by the reloaded models.
example_location, example_date = "bocas", "2025-03-06"
example_prediction = predict_fish_population(example_location, example_date)
print(example_prediction)


{'location': 'bocas', 'date': '2025-03-06', 'predicted_site': 'stri', 'predicted_habitat': 'dock', 'predicted_number_of_fish_shoals': 2.77}
