In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors

In [35]:
#Librerias
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error as MAE,r2_score

import lightgbm as lgb

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score, KFold
import xgboost as xgb
from sklearn.metrics import classification_report

In [36]:
url = "https://raw.githubusercontent.com/ITC-Nosotros/Modelo/main/archivo_modelo.csv"
cols = ['car_model','price','year_model','kms','colour','fuel_type','location','url_car']

data = pd.read_csv(url, names=cols , header = 0,encoding='utf-8')

In [37]:
print(data.shape)
data.head()

(9427, 8)


Unnamed: 0,car_model,price,year_model,kms,colour,fuel_type,location,url_car
0,Chevrolet Onix,52.900.000,2021,38500,Gris,Gasolina,Antioquia,https://articulo.tucarro.com.co/MCO-1409021891...
1,Chevrolet Onix,40.000.000,2016,47096,Blanco,Gasolina,Nariño,https://articulo.tucarro.com.co/MCO-2257666376...
2,Chevrolet Onix,65.000.000,2023,4700,Blanco,Gasolina,Antioquia,https://articulo.tucarro.com.co/MCO-1405115767...
3,Chevrolet Onix,69.000.000,2021,17310,Plateado,Gasolina,Valle Del Cauca,https://articulo.tucarro.com.co/MCO-2275306018...
4,Chevrolet Onix,65.000.000,2021,33627,Gris,Gasolina,Risaralda,https://articulo.tucarro.com.co/MCO-1404317531...


In [38]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9427 entries, 0 to 9426
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   car_model   9427 non-null   object
 1   price       9427 non-null   object
 2   year_model  9427 non-null   int64 
 3   kms         9427 non-null   object
 4   colour      9427 non-null   object
 5   fuel_type   9427 non-null   object
 6   location    9416 non-null   object
 7   url_car     9427 non-null   object
dtypes: int64(1), object(7)
memory usage: 589.3+ KB


In [39]:
#Valores nulos y en que medidad
data.isnull().sum()

car_model      0
price          0
year_model     0
kms            0
colour         0
fuel_type      0
location      11
url_car        0
dtype: int64

In [40]:
# eliminar los valores nulos de la columna price
data['price'] = data['price'].dropna()

In [41]:
data['price'] = data['price'].str.replace('.', '')
#data['car_model'] = data['car_model'].str.replace(' ', '_')
data['kms'] = data['kms'].str.replace('.', '')
data['price'] = data['price'].str.replace(',', '')
data['kms'] = data['kms'].str.replace(',', '')

In [42]:
#pd.to_numeric() de pandas para convertir la columna 'price'
data['price'] = pd.to_numeric(data['price'])
data['kms'] = pd.to_numeric(data['kms'])

In [43]:
#Recorre todas las columnas del DataFrame data que tienen un tipo de datos object y las convierte en categorías utilizando el método astype('category').
for col in data.select_dtypes(include=['object']).columns:
  data[col] = data[col].astype('category')

In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9427 entries, 0 to 9426
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   car_model   9427 non-null   category
 1   price       9427 non-null   int64   
 2   year_model  9427 non-null   int64   
 3   kms         9427 non-null   int64   
 4   colour      9427 non-null   category
 5   fuel_type   9427 non-null   category
 6   location    9416 non-null   category
 7   url_car     9427 non-null   category
dtypes: category(5), int64(3)
memory usage: 609.5 KB


In [45]:
#Valores nulos y en que medidad
data.isna().sum()

car_model      0
price          0
year_model     0
kms            0
colour         0
fuel_type      0
location      11
url_car        0
dtype: int64

In [46]:
#eliminará todas las filas que contienen al menos un valor nulo y actualizará el DataFrame original.
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9416 entries, 0 to 9426
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   car_model   9416 non-null   category
 1   price       9416 non-null   int64   
 2   year_model  9416 non-null   int64   
 3   kms         9416 non-null   int64   
 4   colour      9416 non-null   category
 5   fuel_type   9416 non-null   category
 6   location    9416 non-null   category
 7   url_car     9416 non-null   category
dtypes: category(5), int64(3)
memory usage: 682.6 KB


In [47]:
# Preprocessing
categorical_features = ['car_model','location']
numeric_features = ['price','kms','year_model']

selected_columns = categorical_features + numeric_features
new_data = data[selected_columns]

# Display the first few rows of the new dataset
print(new_data.head())

# Standarization (both categorical and numerical variables)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)])

        car_model         location     price    kms  year_model
0  Chevrolet Onix        Antioquia  52900000  38500        2021
1  Chevrolet Onix           Nariño  40000000  47096        2016
2  Chevrolet Onix        Antioquia  65000000   4700        2023
3  Chevrolet Onix  Valle Del Cauca  69000000  17310        2021
4  Chevrolet Onix        Risaralda  65000000  33627        2021


In [48]:
new_data

Unnamed: 0,car_model,location,price,kms,year_model
0,Chevrolet Onix,Antioquia,52900000,38500,2021
1,Chevrolet Onix,Nariño,40000000,47096,2016
2,Chevrolet Onix,Antioquia,65000000,4700,2023
3,Chevrolet Onix,Valle Del Cauca,69000000,17310,2021
4,Chevrolet Onix,Risaralda,65000000,33627,2021
...,...,...,...,...,...
9422,Toyota Corolla,Antioquia,157000000,0,2024
9423,Toyota Corolla,Cundinamarca,68000000,77000,2019
9424,Toyota Corolla,Bogotá D.C.,72500000,69000,2019
9425,Toyota Corolla,Bogotá D.C.,129000000,26000,2022


In [49]:
preprocessor

In [50]:
# Define pipeline for regression model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])


In [51]:
model

In [52]:
# Este bloque de código divide los datos en conjuntos de entrenamiento y prueba para su posterior modelado y evaluación
# Split data: training and testing
# X = new_data.drop('price', axis=1)
X = new_data
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [53]:
X.shape

(9416, 5)

In [54]:
X_train.head()

Unnamed: 0,car_model,location,price,kms,year_model
991,Chevrolet Onix,Valle Del Cauca,73990000,24147,2022
7223,Renault Duster,Antioquia,90900000,21900,2023
6562,Chevrolet Onix,Antioquia,43900000,54950,2020
8194,Renault Duster,Bogotá D.C.,25000000,0,2023
3519,Mazda 2,Antioquia,61900000,71053,2018


In [55]:
# devolverá las primeras filas de datos en y_train
y_train.head()

991     73990000
7223    90900000
6562    43900000
8194    25000000
3519    61900000
Name: price, dtype: int64

In [56]:
# Training the price prediction model (regression)
model.fit(X_train, y_train)

In [57]:
# Evaluating the price prediction model (regression)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5831.080724970928


### Regresión lineal multivariable
La regresión lineal multivariable es una técnica para predecir una variable (dependiente) basada en múltiples variables predictoras. Modela la relación entre estas variables como una función lineal, donde se buscan coeficientes óptimos que minimicen la diferencia entre los valores observados y los predichos. Es ampliamente utilizada en estadísticas y aprendizaje automático para análisis predictivo y comprensión de relaciones entre variables.

In [58]:
# accuracy check
mse = mean_squared_error (y_test, y_pred)
rmse = mean_squared_error (y_test, y_pred, squared=False)
mae = MAE (y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % rmse)
print("MAE: %.2f" % mae)
print("R2: %.2f" % r2)

MSE: 5831.08
RMSE: 76.36
MAE: 12.63
R2: 1.00


### Light GBM (Gradient Boosting Machine)

Es una biblioteca de aprendizaje automático de código abierto y de alta eficiencia que se utiliza para problemas de regresión, clasificación y ranking. Utiliza el algoritmo de refuerzo de gradiente para construir un modelo predictivo mediante la combinación de múltiples árboles de decisión débiles.

In [59]:
# Hyperparameters
params = {
    'task': 'train',
    'boosting': 'gbdt',
    'objective': 'regression',
    'num_leaves': 5,
    'learning_rate': 0.05,
    'metric': {'l2','l1'},
    'header' : 'true',
    'verbose': 0
}

# loading data
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# fitting the model
model2 = lgb.train(params,
                 train_set=lgb_train,
                 valid_sets=lgb_eval)
# Pred
y_pred2 = model2.predict(X_test)

In [60]:
# accuracy check
mse = mean_squared_error(y_test, y_pred2)
rmse = mean_squared_error(y_test, y_pred2, squared=False)
mae = MAE(y_test, y_pred2)
r2 = r2_score(y_test, y_pred2)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % rmse)
print("MAE: %.2f" % mae)
print("R2: %.2f" % r2)

MSE: 78091140504721.44
RMSE: 8836919.17
MAE: 456669.45
R2: 0.92


In [61]:
# Define the function to recommend cars
def recommend_and_predict(car_features_df, data, model):

    # Preprocessing
    categorical_features = ['car_model','location']
    numeric_features = ['kms','year_model']
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(), categorical_features)
        ],
        remainder='drop'
    )

    # Pipeline to transformed data
    pipe = Pipeline(steps=[('preprocessor', preprocessor)])
    transformed_data = pipe.fit_transform(data.drop(['price'], axis=1))
    transformed_query = pipe.transform(car_features_df)

    # Price prediction
    predicted_price = model.predict(car_features_df)
    print(f"Predicted Price: {predicted_price[0]}")

    # Applying K-Nearest Neighbors for searching similar cars
    n_neighbors = 5
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    nn.fit(transformed_data)

    # Finding the nearest neighbors for the input 'car_features_df'
    distances, indices = nn.kneighbors(transformed_query)

    # Obtaining similar cars
    similar_cars = data.iloc[indices[0]]
    return similar_cars

In [62]:
new_data_y = new_data
new_data_y['price'] = data['price']
new_data_y.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data_y['price'] = data['price']


Unnamed: 0,car_model,location,price,kms,year_model
0,Chevrolet Onix,Antioquia,52900000,38500,2021
1,Chevrolet Onix,Nariño,40000000,47096,2016
2,Chevrolet Onix,Antioquia,65000000,4700,2023
3,Chevrolet Onix,Valle Del Cauca,69000000,17310,2021
4,Chevrolet Onix,Risaralda,65000000,33627,2021


In [63]:
new_data_y.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9416 entries, 0 to 9426
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   car_model   9416 non-null   category
 1   location    9416 non-null   category
 2   price       9416 non-null   int64   
 3   kms         9416 non-null   int64   
 4   year_model  9416 non-null   int64   
dtypes: category(2), int64(3)
memory usage: 313.5 KB


In [64]:
# Example use of the function (ensure your DataFrame and model are correctly defined)
test_data_recommended = pd.DataFrame([{
    'car_model': 'Mazda 2', 'location': 'Antioquia', 'price': 40000000,'kms':'20000', 'year_model': 2010}])

similar_cars = recommend_and_predict(test_data_recommended, new_data_y, model)
print("Recommended Similar Cars:")
print(similar_cars)

Predicted Price: 39999991.45971565
Recommended Similar Cars:
     car_model   location     price    kms  year_model
3490   Mazda 2  Antioquia  38900000  72000        2012
1973   Mazda 2  Antioquia  38900000  72000        2012
1107   Mazda 2  Antioquia  38900000  72000        2012
4028   Mazda 2  Antioquia  38900000  72000        2012
216    Mazda 2  Antioquia  38900000  72000        2012


In [65]:
# Example use of the function (ensure your DataFrame and model are correctly defined)
test_data_recommended = pd.DataFrame([{
    'car_model': 'Suzuki Swift', 'location': 'Bogotá D.C.', 'price': 30000000,'kms':'80000', 'year_model': 2020}])

similar_cars = recommend_and_predict(test_data_recommended, new_data_y, model)
print("Recommended Similar Cars:")
print(similar_cars)

Predicted Price: 30000003.90761947
Recommended Similar Cars:
         car_model     location     price    kms  year_model
6419  Suzuki Swift  Bogotá D.C.  43500000  78200        2019
8416  Suzuki Swift  Bogotá D.C.  42000000  78200        2019
9141  Suzuki Swift  Bogotá D.C.  42000000  78200        2019
2968  Suzuki Swift  Bogotá D.C.  42000000  71000        2019
1460  Suzuki Swift  Bogotá D.C.  42000000  71000        2019
