# Workshop 03: Machine Learning Models for Industrial Safety
This notebook loads the Kaggle dataset 'Industrial Safety and Health Analytics Database', preprocesses the data, and applies two classification algorithms (Random Forest and Logistic Regression) to predict the accident level.

In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# Suppress every Python warning so cell output stays uncluttered.
# NOTE(review): this blanket filter also hides library deprecation warnings.
warnings.filterwarnings('ignore')

## 1. Load Data
We can load the dataset directly using `kagglehub`.

In [None]:
# Install kagglehub if not present
!pip install -q kagglehub
import kagglehub

# Download latest version of the dataset
path = kagglehub.dataset_download("ihmstefanini/industrial-safety-and-health-analytics-database")
print("Path to dataset files:", path)

# Load the dataset
import os
csv_file = os.path.join(path, "IHMStefanini_industrial_safety_and_health_database.csv")
df = pd.read_csv(csv_file)

df.head()

## 2. Preprocessing
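Select the categorical features used as predictors, separate them into the feature matrix `X` and the target `y` (`Accident Level`), and create a stratified 80/20 train/test split.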

In [None]:
# Predictor columns (X) and the label column (y) used for modelling
features = [
    'Countries',
    'Local',
    'Industry Sector',
    'Genre',
    'Employee ou Terceiro',
    'Risco Critico',
]
target = 'Accident Level'

# Keep only rows where every modelling column (features and target) is present
df = df.dropna(subset=[*features, target])

X, y = df[features], df[target]

# Hold out 20% of the rows for testing; stratify on y so both splits
# keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training dimensions: {X_train.shape}")
print(f"Test dimensions: {X_test.shape}")

## 3. Modeling Pipeline
Build a preprocessing step that one-hot encodes the categorical features, then train and evaluate each model in its own pipeline.

In [None]:
# Preprocessor: one-hot encode every feature column; categories not seen
# while fitting are ignored instead of raising (handle_unknown='ignore')
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), features),
])

# --- Model 1: Random Forest Classifier ---
# Encoding and classification are chained in a single pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Fit on the training split, then predict the held-out test split
rf_predictions = rf_pipeline.fit(X_train, y_train).predict(X_test)

# Report overall accuracy followed by the per-class metrics
print("--- Random Forest Results ---")
print(f"Accuracy: {accuracy_score(y_test, rf_predictions):.4f}")
print("Classification Report:")
print(classification_report(y_test, rf_predictions, zero_division=0))

In [None]:
# --- Model 2: Logistic Regression (Multinomial) ---
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5 and scheduled for removal, and the default lbfgs solver
# already fits a multinomial model when the target has more than two classes.
# clone() gives this pipeline its own unfitted copy of the preprocessor, so
# fitting it does not refit the instance shared with other pipelines.
lr_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Train Logistic Regression on the training split and predict the test split
lr_pipeline.fit(X_train, y_train)
lr_predictions = lr_pipeline.predict(X_test)

# Report overall accuracy followed by the per-class metrics
print("--- Logistic Regression Results ---")
print(f"Accuracy: {accuracy_score(y_test, lr_predictions):.4f}")
print("Classification Report:")
print(classification_report(y_test, lr_predictions, zero_division=0))