In [None]:
!pip install --upgrade scikit-learn imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
import sys

# Decision Tree pipeline for the cow-health dataset: load the spreadsheet,
# encode labels, split and scale, grid-search a DecisionTreeClassifier, report
# metrics, and save a plot of the fitted tree.
# The names created here (df, le_health, le_breed, features, scaler,
# best_dt_model, ...) are cell-level globals; the prediction cell further down
# relies on le_breed, le_health, scaler, features and best_dt_model.

# Check scikit-learn version
print(f"scikit-learn version: {sklearn.__version__}")

# Load dataset with error handling
# The path is absolute (filesystem root), so it must be adapted per environment.
try:
    df = pd.read_excel('/cow_health_dataset.xlsx')  # Verify path is correct
except FileNotFoundError:
    print("Error: Dataset file not found. Please check the path (e.g., '/kaggle/input/cow-health-prediction/cow_health_dataset.xlsx').")
    # sys.exit raises SystemExit.
    # NOTE(review): inside a notebook kernel this presumably aborts only the
    # current cell, not the kernel -- confirm this is the intended failure mode.
    sys.exit(1)

# Handle missing values
# Only numeric columns receive a fill value (median(numeric_only=True));
# missing values in non-numeric columns such as Breed or Health are not
# filled here. The medians are computed on the full dataset, i.e. before the
# train/test split below.
if df.isnull().sum().any():
    print("Missing values detected. Filling with median for numeric columns...")
    df.fillna(df.median(numeric_only=True), inplace=True)

# Encode target variable and Breed
# Separate encoders are kept for the target and for Breed so that predictions
# can be decoded and new breed strings encoded later on.
le_health = LabelEncoder()
le_breed = LabelEncoder()
try:
    df['health_status_encoded'] = le_health.fit_transform(df['Health'])
    df['Breed_encoded'] = le_breed.fit_transform(df['Breed'])
except KeyError as e:
    print(f"Error: Column {e} not found in dataset.")
    sys.exit(1)

# Feature selection
# Breed enters the model through its encoded column 'Breed_encoded', not the
# raw 'Breed' column; any sample built for prediction must use the same names.
features = ['Age', 'Breed_encoded', 'Heart_Rate', 'Body_Temperature', 'Respiratory_Rate']
try:
    X = df[features]
except KeyError as e:
    print(f"Error: Feature {e} not found in dataset.")
    sys.exit(1)
y = df['health_status_encoded']

# Split data with stratification
# 80/20 split with a fixed seed; stratify=y keeps the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
# The scaler is fitted on the training split only and then applied to the
# test split (and later to individual samples at prediction time).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning for Decision Tree
# 5 * 3 * 3 * 2 = 90 parameter combinations, each evaluated with 5-fold CV
# and ranked by weighted F1.
dt_param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}
# class_weight='balanced' re-weights the classes; the seed makes the tree reproducible.
dt_model = DecisionTreeClassifier(random_state=42, class_weight='balanced')
grid_search_dt = GridSearchCV(dt_model, dt_param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search_dt.fit(X_train_scaled, y_train)

# Best Decision Tree model
best_dt_model = grid_search_dt.best_estimator_
print("Best Decision Tree Parameters:", grid_search_dt.best_params_)

# Evaluate Decision Tree
# Evaluation uses the held-out test split; class names come from le_health.
y_pred_dt = best_dt_model.predict(X_test_scaled)
print("\nDecision Tree Classification Report:\n")
print(classification_report(y_test, y_pred_dt, target_names=le_health.classes_))
# ROC-AUC is only reported for a two-class target; column 1 of predict_proba
# is used as the score (the class encoded as 1).
if len(le_health.classes_) == 2:
    y_pred_proba_dt = best_dt_model.predict_proba(X_test_scaled)[:, 1]
    print("Decision Tree ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba_dt))

# Cross-validation score
# NOTE(review): this re-scores the selected estimator on the same training
# data GridSearchCV used to choose its hyperparameters, so these scores are
# likely optimistic compared with the test-set report above.
cv_scores_dt = cross_val_score(best_dt_model, X_train_scaled, y_train, cv=5, scoring='f1_weighted')
print("\nDecision Tree Cross-Validation F1-Weighted Scores:", cv_scores_dt)
print("Mean CV F1-Weighted Score:", cv_scores_dt.mean())

# Feature importance
# One row per feature, sorted from most to least important.
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': best_dt_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("\nFeature Importance (Decision Tree):\n", feature_importance)

# Visualize and save Decision Tree
# The figure is written to 'decision_tree_cow_health.png' (relative path) and
# then closed, so it is saved rather than shown.
plt.figure(figsize=(12, 8))
# NOTE(review): this import would conventionally sit with the other imports
# at the top of the cell.
from sklearn.tree import plot_tree
plot_tree(best_dt_model, filled=True, feature_names=features, class_names=le_health.classes_, rounded=True, fontsize=10)
plt.title("Optimized Decision Tree for Cow Health Classification")
plt.savefig('decision_tree_cow_health.png')
plt.close()

# Function to predict cow health
def predict_cow_health(data_point, model, scaler, feature_names, le_health, le_breed):
    """Predict the health label and class probabilities for a single cow.

    Args:
        data_point: Mapping of feature name -> value. The breed may be given
            under 'Breed' (raw string label or an already-encoded value) or
            directly under the encoded column name listed in
            ``feature_names``. The mapping is not modified.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        scaler: Fitted transformer exposing ``transform``.
        feature_names: Column names in the order used for training.
        le_health: Fitted encoder used to decode the predicted label.
        le_breed: Fitted encoder used to encode a string breed.

    Returns:
        Tuple ``(label, probabilities)``: the decoded health label and the
        probability row returned by ``model.predict_proba`` for this sample.

    Raises:
        KeyError: If a name in ``feature_names`` cannot be found in
            ``data_point`` (after mapping 'Breed' to its encoded column).
        Exception: Whatever ``le_breed.transform`` raises for a breed it was
            not fitted on is propagated unchanged.
    """
    # Work on a copy so the caller's dict is never mutated.
    row = dict(data_point)

    # The model is trained on the encoded breed column. Building the frame by
    # column name from a dict that only has 'Breed' would leave that column
    # NaN, i.e. the breed would be silently ignored. Store the encoded value
    # under the name the model actually expects.
    breed_column = 'Breed' if 'Breed' in feature_names else 'Breed_encoded'
    if 'Breed' in row:
        breed = row['Breed']
        if isinstance(breed, str):
            breed = le_breed.transform([breed])[0]
        # An explicitly supplied encoded value takes precedence.
        if breed_column == 'Breed' or breed_column not in row:
            row[breed_column] = breed

    # Fail loudly instead of feeding NaN features to the model.
    missing = [name for name in feature_names if name not in row]
    if missing:
        raise KeyError(f"data_point is missing feature(s): {missing}")

    data_point_df = pd.DataFrame([row], columns=feature_names)
    data_point_scaled = scaler.transform(data_point_df)
    prediction = model.predict(data_point_scaled)
    probability = model.predict_proba(data_point_scaled)
    return le_health.inverse_transform(prediction)[0], probability[0]



scikit-learn version: 1.6.1
Best Decision Tree Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}

Decision Tree Classification Report:

              precision    recall  f1-score   support

     Healthy       0.72      0.76      0.74        34
   Unhealthy       0.95      0.94      0.95       166

    accuracy                           0.91       200
   macro avg       0.84      0.85      0.84       200
weighted avg       0.91      0.91      0.91       200

Decision Tree ROC-AUC Score: 0.8522324592487598

Decision Tree Cross-Validation F1-Weighted Scores: [0.92383754 0.92255556 0.89845006 0.9375     0.92129565]
Mean CV F1-Weighted Score: 0.9207277615276637

Feature Importance (Decision Tree):
             Feature  Importance
3  Body_Temperature    0.361505
4  Respiratory_Rate    0.332467
2        Heart_Rate    0.249249
0               Age    0.039692
1     Breed_encoded    0.017088


In [None]:
# Test predictions
# The model was trained on the column 'Breed_encoded' (see `features`), so each
# sample carries the encoded breed under that exact name. Without it the breed
# column of the feature row is empty (NaN) and the breed has no influence on
# the prediction. The raw 'Breed' label is kept as well because
# predict_cow_health looks that key up.
# Breed labels must match the spelling used in the dataset (e.g. 'Diary').
healthy_cow = {
    'Age': 4,
    'Breed': 'Beef',  # Example breed
    'Breed_encoded': le_breed.transform(['Beef'])[0],
    'Heart_Rate': 60,
    'Body_Temperature': 102,
    'Respiratory_Rate': 30
}
unhealthy_cow = {
    'Age': 5,
    'Breed': 'Diary',  # Example breed
    'Breed_encoded': le_breed.transform(['Diary'])[0],
    'Heart_Rate': 100,
    'Body_Temperature': 103,
    'Respiratory_Rate': 59
}

# Decision Tree predictions
pred_healthy_dt, prob_healthy_dt = predict_cow_health(healthy_cow, best_dt_model, scaler, features, le_health, le_breed)
pred_unhealthy_dt, prob_unhealthy_dt = predict_cow_health(unhealthy_cow, best_dt_model, scaler, features, le_health, le_breed)

# Label each probability with the class name taken from the encoder instead of
# hard-coding the class order in the message.
for case_name, predicted, probabilities in (
    ("Healthy", pred_healthy_dt, prob_healthy_dt),
    ("Unhealthy", pred_unhealthy_dt, prob_unhealthy_dt),
):
    breakdown = ", ".join(
        f"{class_name}={share:.2%}"
        for class_name, share in zip(le_health.classes_, probabilities)
    )
    print(f"\nDecision Tree Prediction for {case_name} Cow: {predicted}")
    print(f"Probability: {breakdown}")


Decision Tree Prediction for Healthy Cow: Healthy
Probability: Healthy=100.00%, Unhealthy=0.00%

Decision Tree Prediction for Unhealthy Cow: Healthy
Probability: Healthy=100.00%, Unhealthy=0.00%


# Random Forest

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Load and preprocess the dataset
def load_and_preprocess_data(file_path):
    """Read the cow-health workbook and build the model inputs.

    Args:
        file_path: Location of the Excel file passed to ``pd.read_excel``.

    Returns:
        Tuple ``(X, y, breed_le, health_le, df)``: the feature frame, the
        encoded target column, the encoder fitted on 'Breed', the encoder
        fitted on 'Health', and the full frame including both encoded columns.
    """
    feature_columns = ['Age', 'Breed_encoded', 'Heart_Rate', 'Body_Temperature', 'Respiratory_Rate']

    df = pd.read_excel(file_path)

    # One encoder per categorical column, so each mapping can be reused later.
    breed_le, health_le = LabelEncoder(), LabelEncoder()
    df['Breed_encoded'] = breed_le.fit_transform(df['Breed'])
    df['Health_encoded'] = health_le.fit_transform(df['Health'])

    return df[feature_columns], df['Health_encoded'], breed_le, health_le, df

# Train the Random Forest model
def train_model(X, y, health_le):
    """Fit a Random Forest on an 80/20 split and print its test metrics.

    Args:
        X: Feature matrix.
        y: Encoded target labels aligned with ``X``.
        health_le: Fitted encoder for the target; its ``classes_`` supply the
            display names in the classification report.

    Returns:
        The fitted ``RandomForestClassifier``.

    Side effects:
        Prints the test accuracy and a classification report.
    """
    # Stratify so both splits keep the class ratio of the (imbalanced) target.
    # If stratification is impossible (e.g. a class with a single sample),
    # fall back to the plain random split used previously.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    except ValueError:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

    # Initialize and train the model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.2f}")
    print("\nClassification Report:")
    # Pass the full list of encoded labels explicitly: without `labels`, the
    # report raises when a class is absent from the test split because the
    # number of observed classes no longer matches `target_names`.
    class_ids = list(range(len(health_le.classes_)))
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=health_le.classes_))

    return model

# Function to predict health for new IoT data
def predict_cow_health(model, breed_le, health_le, age, breed, heart_rate, body_temperature, respiratory_rate):
    """Predict the health status of one cow from raw sensor readings.

    NOTE: an earlier cell of this notebook defines a function with the same
    name but a different signature; running this cell replaces it.

    Args:
        model: Fitted classifier exposing ``predict``.
        breed_le: Fitted encoder used to turn ``breed`` into its integer code.
        health_le: Fitted encoder used to decode the predicted label.
        age, breed, heart_rate, body_temperature, respiratory_rate: The five
            inputs, placed in this order in the feature row.

    Returns:
        The decoded health status for the sample.

    Raises:
        ValueError: If ``breed_le`` rejects ``breed``; the encoder's original
            error is attached as the cause.
    """
    # Prepare input data
    try:
        breed_encoded = breed_le.transform([breed])[0]
    except ValueError as err:
        # Chain the original error so the encoder's own message is not lost.
        raise ValueError(f"Breed '{breed}' not found in training data.") from err

    input_data = np.array([[age, breed_encoded, heart_rate, body_temperature, respiratory_rate]])

    # When the model was fitted on a DataFrame it records the column names;
    # hand the sample back with the same names so fit and predict inputs are
    # consistent (a bare array triggers a feature-name warning in sklearn).
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is not None and len(feature_names) == input_data.shape[1]:
        input_data = pd.DataFrame(input_data, columns=list(feature_names))

    # Predict
    prediction = model.predict(input_data)
    health_status = health_le.inverse_transform(prediction)[0]

    return health_status

In [None]:
# Main execution
if __name__ == "__main__":
    # Path to the Excel workbook with the dataset (replace with your actual file path)
    file_path = "/cow_health_dataset.xlsx"

    try:
        # Load and preprocess data
        X, y, breed_le, health_le, df = load_and_preprocess_data(file_path)

        # Print the label -> code mapping of each encoder for inspection.
        # A fitted LabelEncoder assigns code i to classes_[i], so the mapping
        # is read from the encoder directly instead of zipping every data row.
        print("Encoded Breed values:")
        print({str(label): code for code, label in enumerate(breed_le.classes_)})
        print("\nEncoded Health values:")
        print({str(label): code for code, label in enumerate(health_le.classes_)})

        # Train the model
        model = train_model(X, y, health_le)

        # Example prediction with IoT-like input
        sample_data = {
            'age': 5,
            'breed': 'Diary',  # Must match a breed in the training data
            'heart_rate': 54,
            'body_temperature': 103,
            'respiratory_rate': 25
        }

        health_status = predict_cow_health(
            model,
            breed_le,
            health_le,
            sample_data['age'],
            sample_data['breed'],
            sample_data['heart_rate'],
            sample_data['body_temperature'],
            sample_data['respiratory_rate']
        )
        print(f"\nPredicted Health Status for cow (Age: {sample_data['age']}, Breed: {sample_data['breed']}): {health_status}")

    except FileNotFoundError:
        # The loader reads an Excel workbook, so the message names that format.
        print(f"Error: File '{file_path}' not found. Please provide a valid Excel (.xlsx) file with columns: Age, Breed, Heart_Rate, Body_Temperature, Respiratory_Rate, Health.")
    except Exception as e:
        # Best-effort demo cell: report the problem instead of raising.
        print(f"An error occurred: {str(e)}")

Error: File '/cow_health_dataset.xlsx' not found. Please provide a valid CSV file with columns: Age, Breed, Heart_Rate, Body_Temperature, Respiratory_Rate, Health.
