In [None]:
# ================== Imports ==================
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# ================== Load dataset ==================
# Read the Excel workbook into the working frame `df`.
# NOTE(review): the path is absolute and hard-coded, so this cell only works
# where that exact file exists (looks like a Colab upload location — confirm).
df = pd.read_excel(io="/content/Rabies__Weather__War_Combined_1.4.25.xlsx")

In [None]:
# ================== Preprocessing ==================
# NOTE: this cell rewrites `df` in place, so it is meant to run exactly once
# after the load cell. Re-running it without reloading fails because the
# dropped columns no longer exist.

# Remove the columns that are used neither as features nor as targets.
df = df.drop(columns=['Date', 'War Name', 'Event Per Year', 'Index Event ID'])
# 'Yes' -> 1, 'No' -> 0; any other value is mapped to NaN by Series.map.
df['War in Israel'] = df['War in Israel'].map({'Yes': 1, 'No': 0})

# Convert month number to month name
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June',
    7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'
}
df['Month'] = df['Month'].map(month_names)

# Encode categorical features.
# A separate fitted encoder is kept per column in `label_encoders`, so integer
# predictions (e.g. for 'Region' / 'Month') can be turned back into labels with
# `label_encoders[col].inverse_transform(...)`. LabelEncoder assigns codes in
# sorted label order, so the 'Month' codes are alphabetical, not calendar order.
label_cols = ['Animal Species', 'Rabies Species', 'Region', 'Settlement', 'Region_Weather', 'Month']
label_encoders = {}
for col in label_cols:
    # `encode` is rebound on every iteration; after the loop it refers to the
    # encoder of the last column, as it did before.
    encode = LabelEncoder()
    df[col] = encode.fit_transform(df[col].astype(str))
    label_encoders[col] = encode

# Standardize numeric features.
# The fitted scaler is kept so that new rows can be transformed with the same
# mean / variance at prediction time.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# cross-validation split is made.
num_cols = ['x', 'y', 'Avg Temperature', 'Monthly Precipitation (mm)', 'Rainy Days']
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Define features and targets
X = df.drop(columns=['Region', 'Month'])
y = df[['Region', 'Month']]

In [None]:
# ================== Base models ==================
# Shared seed for the stochastic estimators so that scores are repeatable
# across kernel restarts.
RANDOM_STATE = 42

base_models = {
    # max_iter raised from 100: the solver was stopping at the iteration limit
    # on this data (see the "ITERATIONS REACHED LIMIT" messages it printed).
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # `n_estimators` is the scikit-learn-API name for LightGBM's `max_iter`
    # alias; verbose=-1 silences the per-fit info log that flooded the output.
    'LightGBM': lgb.LGBMClassifier(n_estimators=100, verbose=-1, random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'SVC': SVC(),
    'Naive Bayes': GaussianNB(),
    'XGBoost': xgb.XGBClassifier(eval_metric='mlogloss', random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
}

# Wrap with MultiOutputClassifier so each estimator is trained once per target column.
models = {name: MultiOutputClassifier(model) for name, model in base_models.items()}

In [None]:
# Preview the first rows of `df` as it stands after the preprocessing cell.
df.head()

Unnamed: 0,Year,Month,Animal Species,Rabies Species,Region,Settlement,x,y,Region_Weather,Avg Temperature,Monthly Precipitation (mm),Rainy Days,War in Israel
0,2025,7,8,3,1,71,-1.206104,0.531453,1,-0.117704,-0.408285,0.144979,1
1,2025,7,4,3,0,233,-0.784158,-0.321362,1,-0.117704,-0.408285,0.144979,1
2,2025,7,2,3,5,196,0.264226,-1.077241,0,0.066881,-0.569357,-0.255895,1
3,2025,7,8,3,1,151,-0.773981,0.552108,1,-0.117704,-0.408285,0.144979,1
4,2025,7,4,3,3,115,-2.248555,-0.827479,0,0.066881,-0.569357,-0.255895,1


In [None]:
# ================== Evaluation functions ==================
def evaluate_multioutput_models(X, y, models, n_splits=5, n_runs=5):
    """Score multi-output classifiers with repeated, shuffled K-fold cross-validation.

    For every model, ``n_runs`` KFold partitions are drawn (seeded ``42 + run``).
    The model is refitted on each training fold and scored on the held-out fold.
    Fold accuracies are averaged per run, and run averages are averaged per model.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; sliced positionally with ``.iloc``.
    y : pandas.DataFrame
        Targets; must contain the columns ``'Region'`` and ``'Month'``.
    models : dict
        Maps a display name to an estimator whose ``fit(X, y)`` accepts the
        multi-column ``y`` and whose ``predict`` returns a 2-D array with one
        column per target, in the column order of ``y``.
    n_splits : int, default 5
        Number of folds per run.
    n_runs : int, default 5
        Number of differently-seeded repetitions.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Model'``, ``'Average Region Accuracy'`` and
        ``'Average Month Accuracy'``, sorted by region accuracy, best first.
        The frame is empty (but still has these columns) when ``models`` is empty.

    Raises
    ------
    KeyError
        If ``y`` lacks a ``'Region'`` or ``'Month'`` column.

    Notes
    -----
    Estimators are fitted in place: after the call each one holds the fit from
    the last fold of the last run, not a fit on the full data.
    """
    result_columns = ['Model', 'Average Region Accuracy', 'Average Month Accuracy']

    # Look the prediction columns up by name rather than assuming that
    # 'Region' is column 0 and 'Month' is column 1 of `y`.
    region_pos = y.columns.get_loc('Region')
    month_pos = y.columns.get_loc('Month')

    results = []

    for name, model in models.items():
        print(f"\nEvaluating {name}...")
        region_accuracies = []
        month_accuracies = []

        for run in range(n_runs):
            # A different seed per run gives a different shuffle of the rows.
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=42 + run)

            run_region_accuracies = []
            run_month_accuracies = []

            for train_index, test_index in kf.split(X):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                model.fit(X_train, y_train)
                y_pred = np.asarray(model.predict(X_test))

                run_region_accuracies.append(
                    accuracy_score(y_test['Region'], y_pred[:, region_pos])
                )
                run_month_accuracies.append(
                    accuracy_score(y_test['Month'], y_pred[:, month_pos])
                )

            region_accuracies.append(np.mean(run_region_accuracies))
            month_accuracies.append(np.mean(run_month_accuracies))

        results.append({
            'Model': name,
            'Average Region Accuracy': np.mean(region_accuracies),
            'Average Month Accuracy': np.mean(month_accuracies)
        })

    # Passing `columns` keeps the sort key present even when no model was
    # evaluated; otherwise sort_values raises KeyError on an empty frame.
    summary = pd.DataFrame(results, columns=result_columns)
    return summary.sort_values(by='Average Region Accuracy', ascending=False)

def evaluate_catboost_multi_target(X, y, n_splits=5, n_runs=5):
    """Cross-validate CatBoost with one independent classifier per target.

    Runs ``n_runs`` shuffled KFold partitions (seeded ``42 + run``). In every
    fold a fresh ``CatBoostClassifier(verbose=0)`` is trained for ``'Region'``
    and another for ``'Month'``, and each is scored on the held-out rows.
    Fold accuracies are averaged per run, then across runs.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; sliced positionally with ``.iloc``.
    y : pandas.DataFrame
        Must provide the ``'Region'`` and ``'Month'`` columns.
    n_splits : int, default 5
        Number of folds per run.
    n_runs : int, default 5
        Number of differently-seeded repetitions.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``'CatBoost (Separate)'`` with the columns
        ``'Model'``, ``'Average Region Accuracy'`` and ``'Average Month Accuracy'``.
    """
    print("\nEvaluating CatBoost (separate models for Region & Month)...")

    targets = ('Region', 'Month')
    # target name -> list holding one mean accuracy per run
    per_run_means = {target: [] for target in targets}

    for run in range(n_runs):
        splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42 + run)
        # target name -> list holding one accuracy per fold of this run
        fold_scores = {target: [] for target in targets}

        for fit_rows, eval_rows in splitter.split(X):
            X_fit = X.iloc[fit_rows]
            X_eval = X.iloc[eval_rows]

            # Slice both targets before any training starts.
            truth = {
                target: (y[target].iloc[fit_rows], y[target].iloc[eval_rows])
                for target in targets
            }

            # Fresh, unfitted classifiers for every fold.
            classifiers = {target: CatBoostClassifier(verbose=0) for target in targets}
            for target in targets:
                classifiers[target].fit(X_fit, truth[target][0])

            predictions = {target: classifiers[target].predict(X_eval) for target in targets}
            for target in targets:
                fold_scores[target].append(
                    accuracy_score(truth[target][1], predictions[target])
                )

        for target in targets:
            per_run_means[target].append(np.mean(fold_scores[target]))

    avg_region_accuracy = np.mean(per_run_means['Region'])
    avg_month_accuracy = np.mean(per_run_means['Month'])

    print(f"\nCatBoost Accuracy - Region: {avg_region_accuracy:.4f}, Month: {avg_month_accuracy:.4f}")

    summary_row = {
        'Model': 'CatBoost (Separate)',
        'Average Region Accuracy': avg_region_accuracy,
        'Average Month Accuracy': avg_month_accuracy
    }
    return pd.DataFrame([summary_row])

In [None]:
# ================== Run evaluations ==================
# Score every entry of `models` (5 runs x 5 folds each).
results_df = evaluate_multioutput_models(X, y, models=models, n_splits=5, n_runs=5)
# CatBoost is scored by its own routine, which trains one classifier per target.
catboost_df = evaluate_catboost_multi_target(X, y, n_splits=5, n_runs=5)

# Stack both score tables row-wise and renumber the rows from 0.
final_results = pd.concat(objs=[results_df, catboost_df], axis=0, ignore_index=True)


Evaluating Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000094 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 760
[LightGBM] [Info] Number of data points in the train set: 570, number of used features: 11
[LightGBM] [Info] Start training from score -2.734718
[LightGBM] [Info] Start training from score -2.819276
[LightGBM] [Info] Start training from score -2.055177
[LightGBM] [Info] Start training from score -2.268099
[LightGBM] [Info] Start training from score -2.202502
[LightGBM] [Info] Start training from score -3.126761
[LightGBM] [Info] Start training from score -3.049799
[LightGBM] [Info] Start training from score -2.561447
[LightGBM] [Info] Start training from score -2.790288
[LightGBM] [Info] Start training from score -2.097141
[LightGBM] [Info] Start training from score -

In [None]:
# ================== Final Output ==================
print("\nFinal Evaluation Results:")
# Combined score = arithmetic mean of the Region and Month accuracies.
# NOTE(review): the column label is spelled 'Averagge'; it is reproduced
# verbatim because the displayed results table uses exactly this label.
final_results['Averagge Accuracy Targets (Region & Month)'] = (
    final_results['Average Region Accuracy']
    .add(final_results['Average Month Accuracy'])
    .div(2)
)
# Rank the models by the combined score, best first.
final_results = final_results.sort_values(
    by='Averagge Accuracy Targets (Region & Month)',
    ascending=False,
)
final_result_frame = pd.DataFrame(data=final_results)


Final Evaluation Results:


In [None]:
# Display the comparison table, ordered by the combined Region/Month accuracy (highest first).
final_result_frame

Unnamed: 0,Model,Average Region Accuracy,Average Month Accuracy,Averagge Accuracy Targets (Region & Month)
1,Gradient Boosting,0.958129,0.871122,0.914625
3,LightGBM,0.957847,0.850862,0.904354
2,XGBoost,0.957847,0.840485,0.899166
4,Extra Trees,0.946337,0.849189,0.897763
0,Random Forest,0.95897,0.814356,0.886663
10,CatBoost (Separate),0.959815,0.802007,0.880911
5,Decision Tree,0.938767,0.788797,0.863782
6,Naive Bayes,0.886193,0.350346,0.618269
7,K-Nearest Neighbors,0.655627,0.180327,0.417977
8,Logistic Regression,0.617157,0.182634,0.399896


In [None]:
# ================== Persist the selected model ==================
# The cross-validation loop fits every estimator in place, so at this point the
# Gradient Boosting wrapper only holds the fit from the last fold it saw, i.e.
# it was trained on a subset of the rows. Refit on the complete dataset so the
# saved artifact has learned from every sample.
GB_model_IMAN = models['Gradient Boosting']
GB_model_IMAN.fit(X, y)

# NOTE(review): the file name starts with "DB" while the variable says "GB";
# it is kept unchanged because other code may load the file under this name.
joblib.dump(GB_model_IMAN, "DB_model_IMAN.pkl")
print("✅ המודל נשמר כ- DB_model_IMAN.pkl")


✅ המודל נשמר כ- DB_model_IMAN.pkl
