<a href="https://colab.research.google.com/github/Marcelpmf/Workshop-dados-25.2/blob/main/Desafio10_09.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:

"""Projeto ML - Classificação de Carros"""

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import numpy as np

np.random.seed(42)

marcas_premium = ["BMW", "Audi", "Mercedes"]
marcas_populares = ["Fiat", "Volkswagen", "Hyundai", "Toyota", "Honda", "Chevrolet", "Ford"]

n = 60
data = {"marca": [], "ano": [], "potencia": [], "consumo": [], "preco": [], "quilometragem": [], "caro": []}

for i in range(n):
    if np.random.rand() > 0.5:
        # Carro premium, mas com chance de preço baixo (~30%)
        marca = np.random.choice(marcas_premium)
        ano = np.random.randint(2018, 2023)
        potencia = np.random.randint(180, 400)
        consumo = round(np.random.uniform(7, 12), 1)
        preco = np.random.randint(90000, 250000)  # inclui preços mais baixos
        km = np.random.randint(5000, 60000)
        caro = int(preco >= 120000)
    else:
        # Carro popular, mas com chance de preço alto (~30%)
        marca = np.random.choice(marcas_populares)
        ano = np.random.randint(2010, 2022)
        potencia = np.random.randint(70, 150)
        consumo = round(np.random.uniform(12, 18), 1)
        preco = np.random.randint(40000, 150000)  # inclui preços mais altos
        km = np.random.randint(20000, 120000)
        caro = int(preco >= 120000)

    data["marca"].append(marca)
    data["ano"].append(ano)
    data["potencia"].append(potencia)
    data["consumo"].append(consumo)
    data["preco"].append(preco)
    data["quilometragem"].append(km)
    data["caro"].append(caro)

df = pd.DataFrame(data)
print(df.head())

# =================== Features / Target =================== #
features = ["marca", "ano", "potencia", "consumo", "quilometragem"]
X = df[features]
y = df["caro"]

num_features = ["ano", "potencia", "consumo", "quilometragem"]
cat_features = ["marca"]

# =================== Pipelines =================== #
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features)
])

# =================== Treino / Teste =================== #
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
print("\nDimensões treino/teste:", X_train.shape, X_test.shape)

# =================== Modelos =================== #
models = {
    "LogisticRegression": LogisticRegression(max_iter=500),
    "RandomForest": RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=3)
}

pipelines = {name: Pipeline([("preprocessor", preprocessor), ("model", model)]) for name, model in models.items()}

# =================== Treinamento / Avaliação =================== #
results = {}
for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, y_pred) * 100
    print(f"\n=== {name} ===")
    print("Acurácia: {:.2f}%".format(acc))
    print(classification_report(y_test, y_pred, digits=4))

    cv = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
    print("CV Mean: {:.2f}% ± {:.2f}%".format(cv.mean()*100, cv.std()*100))

    results[name] = acc

best_model_name = max(results, key=results.get)
best_model = pipelines[best_model_name]
print("\nMelhor modelo:", best_model_name)

joblib.dump(best_model, "best_model_carros.pkl")
print("Modelo salvo em best_model_carros.pkl")

# =================== Teste Novos Carros =================== #
novos_carros = pd.DataFrame({
    "marca": ["Toyota", "BMW", "Fiat", "Mercedes"],
    "ano": [2020, 2022, 2015, 2021],
    "potencia": [110, 320, 85, 280],
    "consumo": [14, 9, 16, 8],
    "quilometragem": [50000, 15000, 80000, 12000]
})

previsoes = best_model.predict(novos_carros)
print("\nPrevisão novos carros (0=Barato, 1=Caro):", previsoes)


       marca   ano  potencia  consumo   preco  quilometragem  caro
0      Honda  2020       141     15.6   94886          26265     0
1    Hyundai  2020        93     15.9  104820          20769     0
2       Audi  2019       371     12.0  175305          33693     1
3       Fiat  2010       128     14.4   58431          22747     0
4  Chevrolet  2012       124     17.9   51394          89092     0

Dimensões treino/teste: (42, 5) (18, 5)

=== LogisticRegression ===
Acurácia: 66.67%
              precision    recall  f1-score   support

           0     0.7500    0.6000    0.6667        10
           1     0.6000    0.7500    0.6667         8

    accuracy                         0.6667        18
   macro avg     0.6750    0.6750    0.6667        18
weighted avg     0.6833    0.6667    0.6667        18

CV Mean: 65.00% ± 11.06%

=== RandomForest ===
Acurácia: 77.78%
              precision    recall  f1-score   support

           0     0.8750    0.7000    0.7778        10
           1