In [0]:
%pip install xlrd>=2.0.1 
%pip install openpyxl 
%restart_python 

#### Coleta e tratamento dos dados ↓

In [0]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    mean_squared_error, r2_score, accuracy_score,
    precision_score, recall_score, f1_score, confusion_matrix
)

# Load the workbook, encode yes/no answers as 1/0, recode the target column,
# drop incomplete rows and show summary statistics plus a correlation heatmap.

# Location of the workbook inside the Databricks workspace.
file_path = "/Workspace/Users/ahmad.jmazloum@gmail.com/Data__1_.xlsx"

# Raw table as read from the workbook.
df = pd.read_excel(file_path)

# First look at the data: dimensions and leading rows.
print("Dimens√£o:", df.shape)
print(df.head())

# Binary encoding for yes/no answers written in English or Portuguese.
# The accented key must be the real "não"; a mis-encoded key never matches a cell.
map_dict = {"yes":1, "no":0, "sim":1, "não":0, "nao":0}

for col in df.columns:
    # Text columns may be stored as generic objects or as a dedicated string
    # dtype depending on the pandas version, so both are accepted here.
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
        # Strip surrounding whitespace and lowercase before the lookup so that
        # values such as " Yes" are still recognised; anything that is not in
        # map_dict becomes NaN and is removed by the dropna() below.
        df[col] = df[col].str.strip().str.lower().map(map_dict)

# Recode the target (Injury: 1 = injured, 2 = not injured -> 0).
df["Injury"] = df["Injury"].replace({2:0})

# Drop every row that still has a missing value and renumber the index.
df = df.dropna().reset_index(drop=True)

# Fail early with a clear message instead of an obscure error further down.
if df.empty:
    raise ValueError(
        "No rows left after cleaning; check the yes/no mapping and the "
        "missing values in the workbook."
    )

# Summary statistics and class balance of the target.
print(df.describe())
print("\nDistribui√ß√£o de Injury:\n", df["Injury"].value_counts())

# Pairwise correlation between the numeric columns, drawn as a heatmap.
plt.figure(figsize=(10,6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="YlGnBu")
plt.title("Correla√ß√£o entre Vari√°veis")
plt.show()


#### Modelagem dos gráficos ↓

In [0]:
# Definir vari√°veis independentes (X) e dependente (y)
X = df.drop(columns=["Injury"])
y = df["Injury"]

# Divis√£o para inicio dos teste de treinamento
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


#### Modelagem de regressão linear ↓

In [0]:
# Fit a linear regression on the training split (fit() returns the estimator).
lr = LinearRegression().fit(X_train, y_train)

# Continuous predictions for the held-out rows.
y_pred_lr = lr.predict(X_test)

# Turn the continuous output into class labels: 1 when the prediction is at
# least 0.5, otherwise 0.
y_pred_lr_class = np.where(y_pred_lr >= 0.5, 1, 0)

# Classification metrics use the thresholded labels ...
acc_lr = accuracy_score(y_test, y_pred_lr_class)
f1_lr = f1_score(y_test, y_pred_lr_class)
# ... while the regression metrics use the raw continuous predictions.
r2 = r2_score(y_test, y_pred_lr)
mse = mean_squared_error(y_test, y_pred_lr)

print("üîπ Regress√£o Linear:")
print(f"MSE: {mse:.4f} | R¬≤: {r2:.4f}")
print(f"Acur√°cia: {acc_lr:.4f} | F1: {f1_lr:.4f}")

#### Aleatoriedades ↓

In [0]:
# Train a 200-tree random forest with a fixed seed (fit() returns the estimator).
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Class labels predicted for the held-out rows.
y_pred_rf = rf.predict(X_test)

# Score the predictions with each metric, in the order they are reported below.
acc_rf, prec_rf, rec_rf, f1_rf = (
    metric(y_test, y_pred_rf)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nüîπ Random Forest:")
print(f"Acur√°cia: {acc_rf:.4f} | Precis√£o: {prec_rf:.4f} | Recall: {rec_rf:.4f} | F1: {f1_rf:.4f}")


#### Matriz de confusão - Random Forest ↓

In [0]:
# Confusion matrix of the random forest on the test split, drawn as an
# annotated heatmap with integer cell labels.
cm = confusion_matrix(y_test, y_pred_rf)
fig_cm, ax_cm = plt.subplots(figsize=(4, 3))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax_cm)
ax_cm.set_title("Matriz de Confus√£o - Random Forest")
ax_cm.set_xlabel("Predito")
ax_cm.set_ylabel("Real")
plt.show()

# Feature importances reported by the fitted forest, labelled with the feature
# names and sorted ascending so the largest bar ends up at the top of the chart.
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values()
fig_imp, ax_imp = plt.subplots(figsize=(8, 5))
importances.plot.barh(color="teal", ax=ax_imp)
ax_imp.set_title("Import√¢ncia das Vari√°veis - Random Forest")
ax_imp.set_xlabel("Import√¢ncia")
plt.show()
