# Classificação de dados

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
import gc
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.linear_model import LinearRegression

In [2]:
train_path = '../data/raw/train.csv'
df = pd.read_csv(train_path)

In [3]:
df.drop(axis = 1, columns=['id'], inplace=True)

In [4]:

def knn_impute(df, n_neighbors=5):   
    df_encoded = df.copy()
    for col in df_encoded.select_dtypes(include='object').columns:
        df_encoded[col] = df_encoded[col].astype('category').cat.codes
    knn_imputer = KNNImputer(n_neighbors=n_neighbors)
    df_imputed = pd.DataFrame(knn_imputer.fit_transform(df_encoded), columns=df_encoded.columns)
    for col in df.select_dtypes(include='object').columns:
        df_imputed[col] = df_imputed[col].round().astype(int).map(
            dict(enumerate(df[col].astype('category').cat.categories)))
    return df_imputed


df_imput = knn_impute(df, n_neighbors=25)

In [5]:
cat_cols_train = df_imput.select_dtypes(include=['object']).columns
cat_cols_train = cat_cols_train[cat_cols_train != 'class']
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

df_imput[cat_cols_train] = ordinal_encoder.fit_transform(df_imput[cat_cols_train].astype(str))


In [6]:
df_imput

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,31.0,495.0,2007.0,213000.0,2.0,116.0,38.0,312.0,71.0,1.0,0.0,4200.0
1,28.0,930.0,2002.0,143250.0,2.0,366.0,38.0,263.0,10.0,0.0,0.0,4999.0
2,9.0,1575.0,2002.0,136731.0,1.0,640.0,38.0,38.0,71.0,1.0,0.0,13900.0
3,16.0,758.0,2017.0,19500.0,2.0,863.0,49.0,29.0,14.0,1.0,0.0,45000.0
4,36.0,1077.0,2021.0,7388.0,2.0,259.0,23.0,29.0,10.0,1.0,0.0,97500.0
...,...,...,...,...,...,...,...,...,...,...,...,...
188528,8.0,604.0,2017.0,49000.0,2.0,866.0,49.0,304.0,10.0,1.0,0.0,27500.0
188529,36.0,206.0,2018.0,28600.0,2.0,770.0,31.0,304.0,14.0,0.0,0.0,30000.0
188530,36.0,223.0,2021.0,13650.0,2.0,921.0,23.0,304.0,14.0,1.0,0.0,86900.0
188531,3.0,1471.0,2022.0,13895.0,2.0,512.0,1.0,82.0,14.0,1.0,1.0,84900.0


In [7]:
df_imput['engine_transmission'] = df_imput['engine'] * df_imput['transmission']

df_imput['int_ext_color'] = df_imput['int_col']*df_imput['ext_col']

df_imput.drop(columns = ['engine', 'transmission', 'int_col', 'ext_col'], inplace = True)

In [8]:
df_imput

Unnamed: 0,brand,model,model_year,milage,fuel_type,accident,clean_title,price,engine_transmission,int_ext_color
0,31.0,495.0,2007.0,213000.0,2.0,1.0,0.0,4200.0,4408.0,22152.0
1,28.0,930.0,2002.0,143250.0,2.0,0.0,0.0,4999.0,13908.0,2630.0
2,9.0,1575.0,2002.0,136731.0,1.0,1.0,0.0,13900.0,24320.0,2698.0
3,16.0,758.0,2017.0,19500.0,2.0,1.0,0.0,45000.0,42287.0,406.0
4,36.0,1077.0,2021.0,7388.0,2.0,1.0,0.0,97500.0,5957.0,290.0
...,...,...,...,...,...,...,...,...,...,...
188528,8.0,604.0,2017.0,49000.0,2.0,1.0,0.0,27500.0,42434.0,3040.0
188529,36.0,206.0,2018.0,28600.0,2.0,0.0,0.0,30000.0,23870.0,4256.0
188530,36.0,223.0,2021.0,13650.0,2.0,1.0,0.0,86900.0,21183.0,4256.0
188531,3.0,1471.0,2022.0,13895.0,2.0,1.0,1.0,84900.0,512.0,1148.0


In [9]:
def remove_outliers_iqr(df, column):   
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1   
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR    
    df_out = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df_out

df_train_no_outliers = remove_outliers_iqr(df_imput, 'milage')
df_train_no_outliers = remove_outliers_iqr(df_imput, 'price')
df_train_no_outliers.reset_index(drop=True, inplace=True)
df = df_train_no_outliers
df.shape

(177653, 10)

## Classificação

In [10]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.pipeline import Pipeline

# Técnicas de pré-processamento
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import FunctionTransformer

# Técnicas de avaliação/validação
from sklearn.metrics import mean_squared_error


In [11]:
seed = 42

In [12]:
y = df['price'] 
X = df.drop(['price'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [13]:
X

Unnamed: 0,brand,model,model_year,milage,fuel_type,accident,clean_title,engine_transmission,int_ext_color
0,31.0,495.0,2007.0,213000.0,2.0,1.0,0.0,4408.0,22152.0
1,28.0,930.0,2002.0,143250.0,2.0,0.0,0.0,13908.0,2630.0
2,9.0,1575.0,2002.0,136731.0,1.0,1.0,0.0,24320.0,2698.0
3,16.0,758.0,2017.0,19500.0,2.0,1.0,0.0,42287.0,406.0
4,36.0,1077.0,2021.0,7388.0,2.0,1.0,0.0,5957.0,290.0
...,...,...,...,...,...,...,...,...,...
177648,8.0,604.0,2017.0,49000.0,2.0,1.0,0.0,42434.0,3040.0
177649,36.0,206.0,2018.0,28600.0,2.0,0.0,0.0,23870.0,4256.0
177650,36.0,223.0,2021.0,13650.0,2.0,1.0,0.0,21183.0,4256.0
177651,3.0,1471.0,2022.0,13895.0,2.0,1.0,1.0,512.0,1148.0


In [14]:
scaler = StandardScaler()
X_train.loc[:, ['brand','model', 'milage', 'int_ext_color', 'engine_transmission']] = scaler.fit_transform(X_train.loc[:, ['brand','model', 'milage', 'int_ext_color', 'engine_transmission']] )
X_test.loc[:, ['brand','model', 'milage', 'int_ext_color', 'engine_transmission']]  = scaler.transform(X_test.loc[:, ['brand','model', 'milage', 'int_ext_color', 'engine_transmission']] )

In [15]:
for n_neighbors in np.arange(10,15):
    knn_ = KNeighborsRegressor(n_neighbors=n_neighbors)
    knn_.fit(X_train, y_train)
    knn_predict = knn_.predict(X_test)
    rmse = mean_squared_error(knn_predict, y_test)
    print('MSE KNN {} VIZINHOS: '.format(n_neighbors), rmse)

MSE KNN 10 VIZINHOS:  190968445.83270642
MSE KNN 11 VIZINHOS:  189460890.86356854
MSE KNN 12 VIZINHOS:  188322662.68245372
MSE KNN 13 VIZINHOS:  187688594.02755722
MSE KNN 14 VIZINHOS:  187069248.00565088


In [16]:
tree_ = DecisionTreeRegressor(random_state=42)
tree_.fit(X_train, y_train)
tree_predict = tree_.predict(X_test)
rmse = mean_squared_error(tree_predict, y_test)
print('MSE TREE : ', rmse )

MSE TREE :  354303295.5459317


In [17]:
mult_lin = LinearRegression()
mult_lin.fit(X_train, y_train)
lin_predict = mult_lin.predict(X_test)
rmse = mean_squared_error(lin_predict, y_test)
print('MSE LIN : ', rmse )

MSE LIN :  222288526.2740925


In [19]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
print(f"Random Forest RMSE: {rf_rmse:.2f}")

Random Forest RMSE: 13509.94


In [None]:
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
print(xgb_rmse)

13032.30470043588


In [129]:
df_comparativo = pd.DataFrame({'Real':y_test, 'KNN': knn_predict, 'Tree': tree_predict, 'Multi_Linear': lin_predict, 'RandomForest': rf_pred, 'XGB':xgb_pred})

In [130]:
df_comparativo.sample(10)

Unnamed: 0,Real,KNN,Tree,Multi_Linear,RandomForest,XGB
143364,18900.0,21649.214286,14999.0,14960.318717,18111.15,20732.029297
117605,58000.0,29964.714286,20399.0,33800.466641,33274.48,30706.476562
140568,10500.0,41400.642857,33500.0,43279.922137,48338.23,38470.070312
82965,40999.0,48585.142857,53722.0,43580.918395,43679.9,55207.742188
153122,12000.0,10514.285714,13500.0,14676.817154,10348.87,11584.907227
154667,34500.0,45961.071429,73900.0,39261.884964,52625.16,42610.554688
8809,7500.0,7871.071429,8200.0,5047.239839,7799.45,7797.247559
3063,17000.0,38086.0,73000.0,47462.852626,42523.6,42332.375
113335,62450.0,39389.0,35999.0,42198.363322,36511.96,36536.199219
94262,15000.0,35724.214286,12000.0,36253.55346,26939.03,34582.734375


In [20]:
# Adicionar após o treinamento do melhor modelo (XGBoost neste caso)
import joblib

# Salvando o modelo
model_path = '../models/car_price_model.pkl'
joblib.dump(rf_model, model_path)
print(f"Modelo salvo em: {model_path}")

Modelo salvo em: ../models/car_price_model.pkl
