In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [9]:
# Name of the label column; every other column is used as a feature.
target_col = 'Species'

# Load the dataset and replace the species labels with integer codes.
# `le` is kept so later cells can map codes back to species names.
df = pd.read_csv('main_data.csv')
le = LabelEncoder()
df[target_col] = le.fit_transform(df[target_col])

# Target vector (y) and feature matrix (X)
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Hyperparameter values to sweep: 3 * 4 * 3 * 3 = 108 combinations in total
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base classifier; the fixed seed makes each fit repeatable
rf = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation, using all available cores (n_jobs=-1)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [13]:
# Take the best estimator found by the grid search and predict the held-out split
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Overall accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1, labelled with the original species names
report = classification_report(y_test, y_pred, target_names=le.classes_)
print("Classification Report:\n", report)

# Confusion matrix: rows are true classes, columns are predicted classes
matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", matrix)

Accuracy: 0.9100
Classification Report:
                  precision    recall  f1-score   support

Indian Mackerel       0.93      0.89      0.91       143
       Sardines       0.90      1.00      0.95       130
 Yellowfin Tuna       0.90      0.84      0.87       127

       accuracy                           0.91       400
      macro avg       0.91      0.91      0.91       400
   weighted avg       0.91      0.91      0.91       400

Confusion Matrix:
 [[127   4  12]
 [  0 130   0]
 [  9  11 107]]


In [17]:
# Example of predicting the species for a new data point.
# The model was fitted on a DataFrame, so the sample is wrapped in a DataFrame
# with the same feature columns (in training order). This keeps the column
# names attached to the values and avoids scikit-learn's
# "X does not have valid feature names" warning that a bare ndarray triggers.
new_data = pd.DataFrame(
    [[20.5, 80.0, 26.5, 0.7, 33.7]],  # Example input, one value per feature column of X
    columns=X.columns,
)
predicted_probabilities = best_rf.predict_proba(new_data)

# predict_proba orders its columns by best_rf.classes_ (the encoded labels the
# model saw during fit), so decode exactly those codes back to species names
# rather than assuming they line up with le.classes_.
species_names = le.inverse_transform(best_rf.classes_)

# Output the probabilities for each species
for species, prob in zip(species_names, predicted_probabilities[0]):
    print(f"Species: {species}, Probability: {prob:.4f}")

# To get only the single most likely species instead of the full distribution,
# call best_rf.predict(new_data) and decode the result with le.inverse_transform.


Species: Indian Mackerel, Probability: 0.2350
Species: Sardines, Probability: 0.0357
Species: Yellowfin Tuna, Probability: 0.7293


