## *Load Libraries*

In [1]:
import pandas as pd
import numpy as np
import warnings

In [2]:
warnings.filterwarnings('ignore')

### *Load Cleaned Data*

In [3]:
# Location of the cleaned dataset (absolute path on the author's machine).
url = r"C:\Users\PC\Desktop\Ciencias de datos\Proyectos\Criaglist\data\craiglist_clear.parquet"

In [4]:
# Read the parquet file referenced by `url` into a DataFrame.
df = pd.read_parquet(url)

### *Dummy Variables*

In [5]:
# Encode the premium flag as 0/1. Matching both the raw label ('yes') and an
# already-encoded 1 keeps this cell idempotent: the previous
# `np.where(df.premium == 'yes', 1, 0)` compared integers with 'yes' on a
# second run and silently reset every row to 0.
df['premium'] = df['premium'].isin(['yes', 1]).astype(int)

In [6]:
# One-hot encode the object/category-typed columns of `df`. The dtype is pinned
# because pandas >= 2.0 emits boolean dummies by default, whereas the 0/1
# integer indicators shown in the preview output are the pre-2.0 default (uint8).
df_ohe = pd.get_dummies(df, dtype=np.uint8)

In [7]:
# Preview the first rows of the one-hot-encoded frame.
df_ohe.head()

Unnamed: 0,price,year,condition,cylinders,odometer,premium,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_audi,manufacturer_bmw,...,paint_color_brown,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow
65,22500,2001,2.0,8.0,144700.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
539,17500,2001,2.0,6.0,178000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
611,6900,2001,3.0,6.0,144445.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1516,5400,2001,4.0,8.0,198000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1848,5500,2001,2.0,8.0,116000.0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Dummy columns that represent the catch-all 'other' level of each feature.
drops = [
    'model_other',
    'transmission_other',
    'fuel_other',
    'type_other',
]

In [9]:
# Remove the catch-all '*_other' indicator columns. Re-assigning instead of
# using inplace=True, together with errors='ignore', makes the cell safe to
# re-run: the previous form raised KeyError once the columns were already gone.
df_ohe = df_ohe.drop(columns=drops, errors='ignore')

The 'other' category can cause problems. Because the dummy transformation turns each 'other' level into its own column, we can afford to drop these columns without losing important information.

Because this category has already been removed, searching the column names for 'other' no longer returns any indexes.

### *Train/Test/Validation*

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the rows for testing, then split 20% of the remaining rows
# off as the validation set. The same seed is used for both splits.
split_options = {"test_size": 0.2, "random_state": 42}
train, test = train_test_split(df_ohe, **split_options)
train, val = train_test_split(train, **split_options)

In [12]:
# (rows, columns) of the train, test and validation splits, in that order.
tuple(part.shape for part in (train, test, val))

((84934, 6651), (26543, 6651), (21234, 6651))

### *StandardScaler*

We will perform a scale adjustment to speed up the training process of the neural network.

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
# Numeric input features that are standardised before training.
scaler_columns = [
    'year',
    'condition',
    'odometer',
    'cylinders',
]

In [15]:
# Fit the input scaler on the training split only.
scaler = StandardScaler()
scaler = scaler.fit(train[scaler_columns])

In [16]:
# Display the fitted input scaler.
scaler

StandardScaler()

Transform every split using the mean and the standard deviation learned from the training data.

In [17]:
# Apply the training-fitted input scaler to each split in turn.
# NOTE: the splits are overwritten, so running this cell twice rescales
# values that are already scaled.
for split in (train, test, val):
    split[scaler_columns] = scaler.transform(split[scaler_columns])

In [18]:
# Fit a separate scaler for the target column, again on the training split only.
scaler_y = StandardScaler()
scaler_y = scaler_y.fit(train[['price']])

We create another scaler object for the target variable (price).

In [19]:
# Standardise the target column of each split with the training-fitted scaler.
# NOTE: the splits are overwritten, so running this cell twice rescales
# values that are already scaled.
for split in (train, test, val):
    split[['price']] = scaler_y.transform(split[['price']])

### *Save Scaler Objects*

In [20]:
import joblib

In [21]:
# Persist both fitted scalers to files in the current working directory.
# joblib.dump returns the list of file names written; the last call's value is
# what the cell displays.
joblib.dump(scaler,'scaler_inputs.file')
joblib.dump(scaler_y,'scaler_target.file')

['scaler_target.file']

We save both scalers because we need them to preprocess new data; having them on disk makes that preprocessing task much easier.

## *Save JSON File*

In [22]:
import json

In [23]:
# One-hot encode the input features (everything except the target) without a
# column-name prefix, so each dummy column is named after the raw category value.
features = df.drop(columns='price')
X = pd.get_dummies(features, prefix='', prefix_sep='')

In [24]:
# Positions of every column literally named 'other'.
np.flatnonzero(X.columns == 'other')

array([4379, 6616, 6625, 6636], dtype=int64)

In [26]:
# Remove every column named 'other'. Re-assigning instead of using
# inplace=True, together with errors='ignore', makes the cell safe to re-run:
# the previous form raised KeyError once the columns were already gone.
X = X.drop(columns=['other'], errors='ignore')

In [27]:
# Column labels of the encoded feature frame, in their current order.
columns = X.columns

In [28]:
# Wrap the ordered column names in a JSON-serialisable mapping.
columns_dict = {'column': list(columns)}

In [29]:
# Serialise the column-name mapping straight into columns.json.
with open("columns.json", "w") as json_file:
    json.dump(columns_dict, json_file)

We save a JSON file with the names of the columns, to make it easier to process new data.

## Save Transformed Data

In [30]:
# Destination files for the three transformed splits (absolute paths on the
# author's machine).
url_train = r"C:\Users\PC\Desktop\Ciencias de datos\Proyectos\Criaglist\data\train.parquet"
url_test = r"C:\Users\PC\Desktop\Ciencias de datos\Proyectos\Criaglist\data\test.parquet"
url_val = r"C:\Users\PC\Desktop\Ciencias de datos\Proyectos\Criaglist\data\val.parquet"

In [31]:
# Write each transformed split to its parquet destination.
for frame, destination in ((train, url_train), (test, url_test), (val, url_val)):
    frame.to_parquet(destination)