# Desafio: Prever ferimentos em acidente de carro

#### Bibliotecas utilizadas ↓

In [0]:
%pip install xlrd>=2.0.1
%pip install openpyxl
%restart_python

In [0]:

import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, sklearn as sklearn, statistics as st
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, RocCurveDisplay
from pathlib import Path


#### Importação da planilha para análise ↓

In [0]:

# Location of the source spreadsheet.
# NOTE(review): absolute, user-specific path - consider a configurable DATA_DIR.
data_path = Path("/Workspace/Users/ahmad.jmazloum@gmail.com/Data__1_.xlsx")
df = pd.read_excel(data_path)

# Normalise the headers step by step:
#   1. trim surrounding whitespace,
#   2. turn embedded line breaks into spaces,
#   3. collapse every run of non-alphanumeric characters into one underscore,
#   4. drop leading/trailing underscores left over by step 3.
clean_names = df.columns.str.strip()
clean_names = clean_names.str.replace("\n", " ", regex=False)
clean_names = clean_names.str.replace(r"[^0-9a-zA-Z]+", "_", regex=True)
df.columns = clean_names.str.strip("_")

df.head(3)


In [0]:

# Locate the target: the first column whose name contains "injur" (case-insensitive).
target_candidates = [c for c in df.columns if "injur" in c.lower()]
if not target_candidates:
    raise KeyError("No column containing 'injur' was found; cannot build the target.")
target_col = target_candidates[0]

# Non-numeric entries become NaN. Rows without a usable target cannot be
# labelled, so they are removed from both y and X. Keeping them would either
# crash `astype(int)` in the 0/1 branch or silently label them as class 0.
y_raw = pd.to_numeric(df[target_col], errors="coerce")
labelled = y_raw.notna()
y_raw = y_raw[labelled]
if y_raw.empty:
    raise ValueError(f"Column {target_col!r} has no numeric values to use as target.")

vals = set(y_raw.unique().tolist())
if {1, 2}.issubset(vals):
    # Both 1 and 2 are present: only 1 is the positive class.
    # NOTE(review): presumably 1 = injury, 2 = no injury - confirm against the data dictionary.
    y = (y_raw == 1).astype(int)
elif vals.issubset({0, 1}):
    # Already binary.
    y = y_raw.astype(int)
else:
    # Any other coding: strictly positive values count as the positive class.
    y = (y_raw > 0).astype(int)

# Features: every remaining column, restricted to the labelled rows so that
# X and y stay aligned.
X = df.loc[labelled].drop(columns=[target_col])
X.head()


In [0]:

# The vehicle-count column is looked up under two spellings.
# NOTE(review): presumably "Vehical" is a typo present in some exports - confirm.
misspelled = "Number_of_Vehical_Involved"
corrected = "Number_of_Vehicle_Involved"

# Predictors we would like to use, in the order they should appear.
expected_cols = [
    "Rush_Hour", "Alcohol_Involved", "Work_Zone", "Align", "Weekday",
    "Accident_at_Intersection", "Accident_at_Roadway", "Speed_Limit",
    misspelled, corrected, "Weather",
]

# Keep only the expected columns that actually exist in X.
feature_cols = [name for name in expected_cols if name in X.columns]

# When both spellings exist, fill gaps in the corrected column from the
# misspelled one and keep a single column.
if {misspelled, corrected} <= set(feature_cols):
    X[corrected] = X[corrected].fillna(X[misspelled])
    feature_cols.remove(misspelled)

# Fall back to every column when none of the expected ones is present.
feature_cols = feature_cols or X.columns.tolist()
X = X[feature_cols]

# Split the predictors by dtype: numeric columns vs. everything else.
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = [name for name in X.columns if name not in num_features]
num_features, cat_features


In [0]:

# Numeric branch: median imputation followed by standardisation.
numeric_pre = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_pre = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Preprocessor with scaled numeric features.
pre_glm = ColumnTransformer(transformers=[
    ("num", numeric_pre, num_features),
    ("cat", categorical_pre, cat_features),
])

# Preprocessor without scaling: numeric columns are only imputed and
# categorical columns are one-hot encoded directly (no categorical imputer).
pre_tree = ColumnTransformer(transformers=[
    ("num", SimpleImputer(strategy="median"), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
])

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [0]:

# --- Model definitions -------------------------------------------------------
# Linear regression fitted on the 0/1 target; its continuous output is
# thresholded below to obtain class predictions.
linreg = Pipeline([
    ("prep", pre_glm),
    ("model", LinearRegression()),
])
logreg = Pipeline([
    ("prep", pre_glm),
    ("model", LogisticRegression(max_iter=300, class_weight="balanced")),
])
rf = Pipeline([
    ("prep", pre_tree),
    ("model", RandomForestClassifier(n_estimators=300, random_state=42, class_weight="balanced_subsample")),
])

# --- Training (same order as the definitions) --------------------------------
linreg.fit(X_train, y_train)
logreg.fit(X_train, y_train)
rf.fit(X_train, y_train)

# --- Test-set scores and hard predictions (cut-off at 0.5) -------------------
lin_score = linreg.predict(X_test)
lin_pred = (lin_score >= 0.5).astype(int)

# Column 1 of predict_proba is taken as the score for the classifiers.
log_proba = logreg.predict_proba(X_test)[:, 1]
log_pred = (log_proba >= 0.5).astype(int)

rf_proba = rf.predict_proba(X_test)[:, 1]
rf_pred = (rf_proba >= 0.5).astype(int)

import pandas as pd
def metrics_row(name, y_true, y_pred, y_score):
    """Build one row of evaluation metrics for a binary classifier.

    Parameters
    ----------
    name : str
        Label stored under the ``model`` key.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions; used for accuracy, precision, recall and F1.
    y_score : array-like
        Continuous scores; used for ROC AUC.

    Returns
    -------
    dict
        Keys: ``model``, ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc``.
    """
    # The metric functions are the ones imported in the notebook's import
    # cell, so no function-local import is needed.
    # zero_division=0 is passed to precision, recall and F1 alike so that a
    # degenerate case (e.g. no positive predictions) yields 0 consistently
    # instead of emitting a warning for some metrics only.
    return dict(
        model=name,
        accuracy=accuracy_score(y_true, y_pred),
        precision=precision_score(y_true, y_pred, zero_division=0),
        recall=recall_score(y_true, y_pred, zero_division=0),
        f1=f1_score(y_true, y_pred, zero_division=0),
        roc_auc=roc_auc_score(y_true, y_score),
    )
# (label, hard predictions, continuous scores) for each fitted model, in the
# order the rows should appear in the comparison table.
evaluations = [
    ("LinearRegression", lin_pred, lin_score),
    ("LogisticRegression", log_pred, log_proba),
    ("RandomForest", rf_pred, rf_proba),
]

# One metrics row per model, all evaluated against the same test labels.
results_df = pd.DataFrame(
    [metrics_row(label, y_test, pred, score) for label, pred, score in evaluations]
)
results_df


#### Matriz de confusão e ROC - Regressão Linear ↓
• Abaixo, as linhas de código referentes ao teste de regressão linear, em formato de curva ROC e matriz de confusão.

• Foi observado um aumento em 4 setores de acidentes; um deles envolve o uso de álcool e alta velocidade, podendo os dois ocorrer simultaneamente.

In [0]:
def plot_conf(y_true, y_pred, title):
    """Draw a confusion matrix as an annotated heat map.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    title : str
        Figure title.

    The matrix returned by ``confusion_matrix`` has true classes on the rows
    and predicted classes on the columns, so the y-axis is the real class and
    the x-axis is the predicted class.
    """
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    ax.imshow(cm)
    ax.set_title(title)
    # x-axis = matrix columns = predicted class (previously mislabelled "Injury").
    ax.set_xlabel("Previsto")
    ax.set_ylabel("Real")
    # One tick per matrix row/column so the axes do not show fractional positions.
    ax.set_xticks(np.arange(cm.shape[1]))
    ax.set_yticks(np.arange(cm.shape[0]))
    # Write each count in the centre of its cell (x = column j, y = row i).
    for (i, j), v in np.ndenumerate(cm):
        ax.text(j, i, str(v), ha="center", va="center")
    plt.show()

def plot_roc(y_true, y_score, title):
    """Plot a ROC curve from true labels and continuous scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_score : array-like
        Scores passed to ``RocCurveDisplay.from_predictions``.
    title : str
        Title placed on the axes.
    """
    _, axis = plt.subplots()
    RocCurveDisplay.from_predictions(y_true, y_score, ax=axis)
    axis.set_title(title)
    plt.show()

# For each model: confusion matrix first, then its ROC curve.
for y_hat, y_scores, conf_title, roc_title in [
    (lin_pred, lin_score, "Matriz de confusão - Linear Regression", "ROC - Linear Regression"),
    (log_pred, log_proba, "Matriz de confusão - Logistic Regression", "ROC - Logistic Regression"),
]:
    plot_conf(y_test, y_hat, conf_title)
    plot_roc(y_test, y_scores, roc_title)