# Trabalho Prático 1 – Aprendizagem Automática
## Previsão do Preço de Carros Usados

Este trabalho aborda um problema de aprendizagem automática do tipo regressão,
onde o objetivo é prever o preço de carros usados com base nas suas características.


In [3]:
# Bibliotecas para manipulação de dados
import pandas as pd
import numpy as np


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Ferramentas de aprendizagem automática
from sklearn.model_selection import train_test_split


In [None]:
# Load the training set from the CSV file in the working directory.
train_df = pd.read_csv(filepath_or_buffer="train.csv")


In [None]:
# Preview the first rows of the training data.
train_df.head()


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [None]:
# Dimensions of the training set as (rows, columns).
train_df.shape


(188533, 13)

In [None]:
# Per-column dtype and non-null count, to spot columns with missing values.
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB


In [None]:
# Split the frame into the regression target (`price`) and the predictors.
# NOTE(review): `id` remains in X and is later picked up as a numeric
# feature -- confirm that a row identifier should be fed to the models.
y = train_df["price"]
X = train_df.drop(columns=["price"])


In [None]:
# Sanity check: X and y must have the same number of rows.
X.shape, y.shape


((188533, 12), (188533,))

In [None]:
# Partition the predictor columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numeric.
categorical_features = X.select_dtypes(
    include=["object"],
).columns
numeric_features = X.select_dtypes(
    include=["int64", "float64"],
).columns


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


In [None]:
# Column-wise preprocessing reused by every model pipeline:
#   - "num": standardise the numeric columns;
#   - "cat": one-hot encode the categorical columns, with
#     handle_unknown="ignore" so categories unseen during fit do not raise
#     at transform time.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression


In [None]:
# Baseline model: the shared preprocessing followed by a linear regression.
linreg_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", LinearRegression()),
    ]
)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


In [None]:
# 5-fold cross-validation of the linear baseline. The scorer reports the
# negated RMSE (so that higher is better); the sign is flipped to obtain a
# positive RMSE, averaged over the folds.
scores = cross_val_score(
    estimator=linreg_pipeline,
    X=X,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=5,
)

rmse_linreg = (-scores).mean()
rmse_linreg


74000.80167003523

ESTRATÉGIA KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor


In [None]:
# KNN model: the shared preprocessing followed by a k-nearest-neighbours
# regressor with default settings (tuned later through the "model__" grid).
knn_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", KNeighborsRegressor()),
    ]
)


In [None]:
# Hyper-parameter grid for the KNN pipeline. The "model__" prefix routes each
# entry to the pipeline step named "model".
param_grid_knn = dict(
    model__n_neighbors=[3, 7],
    model__weights=["distance"],
)



In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
# Grid search over the KNN hyper-parameters with 5-fold cross-validation.
# Scored with the negated RMSE -- the same metric used for the linear and
# decision-tree models -- so that `-grid_knn.best_score_` is an RMSE that can
# be compared with theirs (with "neg_mean_squared_error" it would be an MSE).
# error_score=np.nan keeps the search running when a fit fails; the score of
# the failed fit is recorded as NaN.
grid_knn = GridSearchCV(
    knn_pipeline,
    param_grid_knn,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    error_score=np.nan,
)



In [None]:
#grid_knn.fit(X, y)

In [None]:
#best_knn_rmse = -grid_knn.best_score_

In [None]:

#best_knn_rmse = -grid_knn.best_score_

#best_knn_rmse


In [None]:
#grid_knn.best_params_


O modelo K-Nearest Neighbors foi implementado com pipeline completo, incluindo
normalização, one-hot encoding e avaliação através de grid search com cross-validation.
No entanto, devido à elevada dimensionalidade introduzida pelas variáveis categóricas,
o KNN apresentou instabilidade durante a validação cruzada, originando scores não finitos
em vários folds. Por esse motivo, não foi possível obter um valor de RMSE fiável para este
modelo, pelo que não foi considerado para seleção final.


ÁRVORES DE DECISÃO

In [None]:
from sklearn.tree import DecisionTreeRegressor


In [None]:
# Decision-tree model: the shared preprocessing followed by a regression
# tree. random_state is fixed so repeated runs build the same tree.
tree_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", DecisionTreeRegressor(random_state=42)),
    ]
)


DEFINIR O GRID

In [None]:
# Hyper-parameter grid for the decision-tree pipeline. The "model__" prefix
# routes each entry to the pipeline step named "model".
param_grid_tree = dict(
    model__max_depth=[5, 10],
    model__min_samples_leaf=[1, 5],
)


In [None]:
# Grid search over the decision-tree hyper-parameters: 5-fold
# cross-validation, scored with the negated RMSE, using all available cores.
grid_tree = GridSearchCV(
    estimator=tree_pipeline,
    param_grid=param_grid_tree,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)


In [None]:
# Run the cross-validated grid search for the decision tree on the full data.
grid_tree.fit(X, y)


In [None]:
# Best cross-validated result of the tree search: keep the winning
# hyper-parameters and flip the sign of the negated-RMSE score.
best_tree_params = grid_tree.best_params_
best_tree_rmse = -grid_tree.best_score_

(best_tree_rmse, best_tree_params)
 