In [24]:
import pandas as pd
import numpy as np
import warnings

# Ignore every warning raised from this point on.
# NOTE(review): this presumably also hides pandas warnings (e.g. chained-assignment
# notices) triggered by the cells below -- confirm that is intended.
warnings.simplefilter("ignore")

In [25]:
# Read the cleaned car dataset from disk and show it as the cell output.
INPUT_CSV = "autos_limpios.csv"
df = pd.read_csv(INPUT_CSV)
df

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,marca
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0,alfa-romero
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0,alfa-romero
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0,alfa-romero
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0,audi
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0,audi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0,volvo
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0,volvo
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0,volvo
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,idi,3.01,3.40,23.0,106,4800,26,27,22470.0,volvo


### Seleccionamos las columnas que vamos a usar para entrenar y evaluar el modelo

In [26]:
# Keep only the columns used for modelling.
# .copy() makes df_ml an independent frame, so the column assignments in the
# following cells modify df_ml itself rather than a slice of df that pandas
# flags with SettingWithCopyWarning.
df_ml = df[["symboling", "fueltype", "aspiration", "doornumber", "carbody", "drivewheel", "enginelocation",
            "wheelbase", "carlength", "carwidth", "carheight", "curbweight", "enginetype", "cylindernumber",
            "enginesize", "fuelsystem", "stroke", "compressionratio", "horsepower", "peakrpm",
            "citympg", "highwaympg", "price", "marca"]].copy()

df_ml.head()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,enginesize,fuelsystem,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,marca
0,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,2.68,9.0,111,5000,21,27,13495.0,alfa-romero
1,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,2.68,9.0,111,5000,21,27,16500.0,alfa-romero
2,1,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,152,mpfi,3.47,9.0,154,5000,19,26,16500.0,alfa-romero
3,2,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,109,mpfi,3.4,10.0,102,5500,24,30,13950.0,audi
4,2,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,136,mpfi,3.4,8.0,115,5500,18,22,17450.0,audi


Columna symboling

In [27]:
# List the distinct values found in the symboling column.
df_ml.loc[:, "symboling"].unique()

array([ 3,  1,  2,  0, -1, -2], dtype=int64)

Columna fueltype

In [28]:
# Encode the fuel type as an integer flag: gas -> 0, diesel -> 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df_ml["fueltype"]: that chained in-place call
# operates on a temporary Series and leaves df_ml unchanged when pandas
# Copy-on-Write is active (the default from pandas 3.0).
# astype("int64") keeps the column numeric and raises if any other label remains.
df_ml["fueltype"] = df_ml["fueltype"].replace({"gas": 0, "diesel": 1}).astype("int64")
df_ml["fueltype"].unique()

array([0, 1], dtype=int64)

Columna aspiration

In [29]:
# List the distinct values found in the aspiration column before encoding it.
df_ml.loc[:, "aspiration"].unique()

array(['std', 'turbo'], dtype=object)

In [30]:
# Encode the aspiration type as an integer flag: std -> 0, turbo -> 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df_ml["aspiration"]: that chained in-place call
# operates on a temporary Series and leaves df_ml unchanged when pandas
# Copy-on-Write is active (the default from pandas 3.0).
# astype("int64") keeps the column numeric and raises if any other label remains.
df_ml["aspiration"] = df_ml["aspiration"].replace({"std": 0, "turbo": 1}).astype("int64")
df_ml["aspiration"].unique()

array([0, 1], dtype=int64)

Columna doornumber

In [31]:
# List the distinct values found in the doornumber column before encoding it.
df_ml.loc[:, "doornumber"].unique()

array(['two', 'four'], dtype=object)

In [32]:
# Turn the spelled-out door count into its numeric value: two -> 2, four -> 4.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df_ml["doornumber"]: that chained in-place call
# operates on a temporary Series and leaves df_ml unchanged when pandas
# Copy-on-Write is active (the default from pandas 3.0).
# astype("int64") keeps the column numeric and raises if any other label remains.
df_ml["doornumber"] = df_ml["doornumber"].replace({"two": 2, "four": 4}).astype("int64")
df_ml["doornumber"].unique()

array([2, 4], dtype=int64)

Para agilizar el proceso, vamos a usar un LabelEncoder para reemplazar las palabras por números

In [33]:
from sklearn.preprocessing import LabelEncoder

# Single encoder instance that is re-fitted for every column below.
label_encoder = LabelEncoder()

# Re-fitting the shared encoder overwrites the classes learned for the previous
# column, so the fitted classes are stored per column here; without this the
# integer codes could not be mapped back to the original labels.
clases_por_columna = {}


def encoder(columna):
    """Replace the labels in ``df_ml[columna]`` with integer codes.

    Fits the shared ``label_encoder`` on the column, overwrites the column in
    ``df_ml`` with the resulting codes and stores the fitted classes in
    ``clases_por_columna[columna]`` (position ``i`` holds the label encoded as ``i``).

    Parameters
    ----------
    columna : str
        Name of the ``df_ml`` column to encode.
    """
    df_ml[columna] = label_encoder.fit_transform(df_ml[columna])
    clases_por_columna[columna] = label_encoder.classes_.copy()


# "carbody" now goes through the same helper as the other categorical columns;
# the encoding order is the same as before.
for columna_categorica in ("carbody", "drivewheel", "enginelocation", "fuelsystem",
                           "marca", "enginetype", "cylindernumber"):
    encoder(columna_categorica)

Creamos la columna objetivo para poder entrenar al modelo a distinguir si un auto es de gama alta o de gama baja

In [46]:
# Helper meant to be applied value by value to the price column.
def mediana_func(price, mediana):
    """Flag a price as high-end (1) or low-end (0) relative to a threshold.

    Parameters
    ----------
    price : number
        Price to classify.
    mediana : number
        Threshold to compare against (the median price in this notebook).

    Returns
    -------
    int
        1 when ``price`` is strictly greater than ``mediana``; 0 otherwise,
        including when both are equal.
    """
    return 1 if price > mediana else 0

# Median of the price column; used as the high-end / low-end threshold.
mediana = df_ml.loc[:, "price"].median()

In [47]:
# Build the gama_alta column by running mediana_func on every price,
# passing the median as the second positional argument.
df_ml["gama_alta"] = df_ml["price"].apply(mediana_func, args=(mediana,))

In [50]:
# Visual check: show the rows priced above the median so their gama_alta
# values can be inspected.
df_ml.loc[df_ml["price"] > mediana]

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,fuelsystem,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,marca,gama_alta
0,3,0,0,2,0,2,0,88.6,168.8,64.1,...,5,2.68,9.0,111,5000,21,27,13495.0,0,1
1,3,0,0,2,0,2,0,88.6,168.8,64.1,...,5,2.68,9.0,111,5000,21,27,16500.0,0,1
2,1,0,0,2,2,2,0,94.5,171.2,65.5,...,5,3.47,9.0,154,5000,19,26,16500.0,0,1
3,2,0,0,4,3,1,0,99.8,176.6,66.2,...,5,3.40,10.0,102,5500,24,30,13950.0,1,1
4,2,0,0,4,3,0,0,99.4,176.6,66.4,...,5,3.40,8.0,115,5500,18,22,17450.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,0,0,4,3,2,0,109.1,188.8,68.9,...,5,3.15,9.5,114,5400,23,28,16845.0,21,1
201,-1,0,1,4,3,2,0,109.1,188.8,68.8,...,5,3.15,8.7,160,5300,19,25,19045.0,21,1
202,-1,0,0,4,3,2,0,109.1,188.8,68.9,...,5,2.87,8.8,134,5500,18,23,21485.0,21,1
203,-1,1,1,4,3,2,0,109.1,188.8,68.9,...,3,3.40,23.0,106,4800,26,27,22470.0,21,1


In [53]:
# Write the encoded table to disk without the index column.
# NOTE(review): the file contains both price and gama_alta, which is derived
# from price -- presumably price should be dropped before training on
# gama_alta; confirm in the training notebook.
df_ml.to_csv(path_or_buf="datos_entrenamiento.csv", index=False)