In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE

In [2]:
# Load the resampled dataset
file_path = r'C:\Users\kevzm\Desktop\Projects\2024\August\Dissertations\Haritha_Animal Diseases Classification Using Big Data Techniques\Code\Data\data.csv'
df_resampled = pd.read_csv(file_path)


In [3]:
# Separate features and target
X_resampled = df_resampled.drop(columns=['Dangerous'])
y_resampled = df_resampled['Dangerous'].map({'Yes': 1, 'No': 0})  # Convert 'Yes'/'No' to 1/0

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [4]:
# Train and evaluate models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(f"--- {model_name} ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))
    print("\n")


--- Logistic Regression ---
Accuracy: 0.5550
Precision: 0.5657
Recall: 0.5283
F1 Score: 0.5463
              precision    recall  f1-score   support

           0       0.55      0.58      0.56       103
           1       0.57      0.53      0.55       106

    accuracy                           0.56       209
   macro avg       0.56      0.56      0.55       209
weighted avg       0.56      0.56      0.55       209



--- Random Forest ---
Accuracy: 0.5455
Precision: 0.5647
Recall: 0.4528
F1 Score: 0.5026
              precision    recall  f1-score   support

           0       0.53      0.64      0.58       103
           1       0.56      0.45      0.50       106

    accuracy                           0.55       209
   macro avg       0.55      0.55      0.54       209
weighted avg       0.55      0.55      0.54       209



--- Decision Tree ---
Accuracy: 0.5311
Precision: 0.5417
Recall: 0.4906
F1 Score: 0.5149
              precision    recall  f1-score   support

           0  

In [5]:
from sklearn.model_selection import RandomizedSearchCV

# Example: Tuning Random Forest
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)

# Evaluate the best model
best_rf = rf_random.best_estimator_
y_pred = best_rf.predict(X_test)
print(f"Best Random Forest Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Random Forest Accuracy: 0.5502


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Expanded parameter grid
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'bootstrap': [True, False]
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier()

# RandomizedSearchCV with more iterations
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=200, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fit the model
rf_random.fit(X_train, y_train)

# Evaluate the best model
best_rf = rf_random.best_estimator_
y_pred = best_rf.predict(X_test)

# Print the best accuracy
print(f"Best Random Forest Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Fitting 3 folds for each of 200 candidates, totalling 600 fits
Best Random Forest Accuracy: 0.5694


In [7]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Initialize XGBClassifier
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# RandomizedSearchCV
xgb_random = RandomizedSearchCV(estimator=xgb, param_distributions=param_dist, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
xgb_random.fit(X_train, y_train)

# Evaluate the best model
best_xgb = xgb_random.best_estimator_
y_pred = best_xgb.predict(X_test)
print(f"Best XGBoost Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits


Parameters: { "use_label_encoder" } are not used.



Best XGBoost Accuracy: 0.4833
