In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import src.preprocessing as prepro
import src.data_handler as data_handler
from src.models import LinealReg

# Instantiate the project's raw-data handler.
# NOTE(review): presumably this loads the raw dataset on construction — confirm in src.data_handler.
raw_data: data_handler.RawData = data_handler.RawData()

In [2]:
# Build and persist four processed-data variants. All of them are created with
# correct_data_types=True and area_units='m2'; they differ only in the
# `standarize` flag and in whether `remove_na_rows=True` is passed.
# Each variant is saved right after it is built, under its own `ext` suffix.

# Variant 1: not standardized, remove_na_rows left at its default.
processed_data: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=True,
    standarize=False,
    area_units='m2',
)
processed_data.save_data()

# Variant 2: not standardized, rows with NA removed.
processed_data_nona: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=True,
    standarize=False,
    area_units='m2',
    remove_na_rows=True,
)
processed_data_nona.save_data(ext='nona')

# Variant 3: standardized, remove_na_rows left at its default.
processed_data_standarized: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=True,
    standarize=True,
    area_units='m2',
)
processed_data_standarized.save_data(ext='standarized')

# Variant 4: holds the standardized data, with corrected types and unified
# units, and with NA rows removed.
processed_data_standarized_nona: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=True,
    standarize=True,
    area_units='m2',
    remove_na_rows=True,
)
processed_data_standarized_nona.save_data(ext='standarized_nona')

4 ) Hago feature engineering a partir de un dataframe con datos procesados no estandarizados

4.1 )

Relleno los datos faltantes con el promedio de los datos

In [3]:
# Start feature engineering from a fresh, non-standardized ProcessedData
# instance, then fill its missing values using the 'mean' method.
engineered_data: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=True,
    standarize=False,
    area_units='m2',
)
engineered_data.fill_missing_values(method='mean')

Nueva feature 'high_latitude': basada en 'lat' y 'lon'. Reemplazo ambas features por un único valor booleano: True cuando la latitud es alta (lat > 0) y False cuando la latitud es baja.

(Vimos en el punto 1 que siempre que la latitud es alta la longitud es baja).

In [4]:
# Replace the raw coordinates with a single boolean feature:
# high_latitude is True when lat > 0 and False otherwise.
#
# The guard makes the cell safe to re-run: once 'lat'/'lon' have been dropped,
# running it again would otherwise raise a KeyError on the missing columns.
if {'lat', 'lon'}.issubset(engineered_data.casas_dev.columns):
    engineered_data.casas_dev = (
        engineered_data.casas_dev
        # The comparison already yields booleans; no np.where(..., True, False) needed.
        .assign(high_latitude=lambda df: df['lat'] > 0)
        .drop(columns=['lon', 'lat'])
    )
engineered_data.casas_dev.head()

Unnamed: 0,area,area_units,is_house,has_pool,age,price,rooms,high_latitude
0,59,m2,True,False,18,546,2,True
1,102,m2,False,False,9,759,3,True
2,62,m2,False,False,7,464,2,True
3,127,m2,True,True,16,251,4,False
4,128,m2,False,False,8,963,4,True


In [5]:
# Split the engineered dataset into train and validation sets
# (train_fraction=0.8, fixed seed for reproducibility).
train: pd.DataFrame
validation: pd.DataFrame
train, validation = data_handler.get_train_and_validation_sets(
    engineered_data.casas_dev, train_fraction=0.8, seed=42
)

# Standardize the numeric columns of each split.
# NOTE(review): 'lat' and 'lon' were dropped from casas_dev in an earlier cell,
# so excluding them here is presumably a no-op — confirm and simplify.
train_standarized: pd.DataFrame = prepro.standarize_numeric_columns(
    train, excluded_columns={'lat', 'lon'}
)
# Fix: this used to be computed from `train`, which made it a duplicate of
# train_standarized and left the validation split unstandardized.
# NOTE(review): for evaluation the validation set should ideally be scaled with
# the *train* statistics; check whether standarize_numeric_columns supports that.
validation_standarized: pd.DataFrame = prepro.standarize_numeric_columns(
    validation, excluded_columns={'lat', 'lon'}
)

# Per-feature mean and standard deviation of the (unstandardized) splits.
other_features = ['area', 'price', 'age', 'rooms', 'high_latitude']
train_means: dict[str, float] = {f: train[f].mean() for f in other_features}
train_stds: dict[str, float] = {f: train[f].std() for f in other_features}
validation_means: dict[str, float] = {f: validation[f].mean() for f in other_features}
validation_stds: dict[str, float] = {f: validation[f].std() for f in other_features}

4.2 )

In [6]:
# For each candidate feature, fit a linear regression on ('area', feature)
# against 'price' using the standardized training data, then print the two
# error metrics exposed by the model.
# NOTE(review): the model is fit on train_standarized and the error methods take
# no arguments, so these look like training errors — confirm in src.models.
# NOTE(review): ':3f' means "minimum width 3", not 3 decimals ('.3f'); it is kept
# as-is so the printed output does not change.
lin: LinealReg
for caract in ('rooms', 'age', 'high_latitude'):
    design_matrix = train_standarized[['area', caract]].to_numpy()
    price_target = train_standarized['price'].to_numpy()
    lin = LinealReg(design_matrix, price_target)
    lin.fit_pseudo_inverse()

    # Quote the feature name up front so the f-string needs no nested quoting.
    quoted_name = f'"{caract}"'
    print(
        f' "area", {quoted_name:15}',
        f"|   Error Cuadrático Medio: {lin.error_cuadratico_medio():3f}",
        f"|   Error Least Square: {lin.error_least_squares_function():3f}",
    )
print("")


 "area", "rooms"         |   Error Cuadrático Medio: 0.873407 |   Error Least Square: 628.853028
 "area", "age"           |   Error Cuadrático Medio: 0.847709 |   Error Least Square: 610.350463
 "area", "high_latitude" |   Error Cuadrático Medio: 0.137116 |   Error Least Square: 98.723250



In [None]:
a = []
# Fix: `{}` builds an empty dict, not a set; `set()` matches the annotation.
seen_combinations: set = set()

# Add polynomial terms (powers 2 through 8) of 'area', 'age' and 'rooms' as new
# columns named '<feature>^<power>'. Columns are added in the order
# area^i, age^i, rooms^i for each power i.
# NOTE(review): if these columns are stored as int64, area ** 8 overflows
# silently once area >= 235; consider casting to float before raising to
# high powers — confirm the dtype and value range.
for i in range(2, 9):
    for base_col in ('area', 'age', 'rooms'):
        engineered_data.casas_dev[f'{base_col}^{i}'] = engineered_data.casas_dev[base_col] ** i
engineered_data.casas_dev.head()

Unnamed: 0,area,area_units,is_house,has_pool,age,price,rooms,high_latitude,area^2,age^2,...,rooms^5,area^6,age^6,rooms^6,area^7,age^7,rooms^7,area^8,age^8,rooms^8
0,59,m2,True,False,18,546,2,True,3481,324,...,32,42180533641,34012224,64,2488651484819,612220032,128,146830437604321,11019960576,256
1,102,m2,False,False,9,759,3,True,10404,81,...,243,1126162419264,531441,729,114868566764928,4782969,2187,11716593810022656,43046721,6561
2,62,m2,False,False,7,464,2,True,3844,49,...,32,56800235584,117649,64,3521614606208,823543,128,218340105584896,5764801,256
3,127,m2,True,True,16,251,4,False,16129,256,...,1024,4195872914689,16777216,4096,532875860165503,268435456,16384,67675234241018881,4294967296,65536
4,128,m2,False,False,8,963,4,True,16384,64,...,1024,4398046511104,262144,4096,562949953421312,2097152,16384,72057594037927936,16777216,65536
