In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import src.preprocessing as prepro
import src.data_handler as data_handler
from src.models import LinealReg

# Handle on the raw dataset, built with RawData's default arguments.
raw_data: data_handler.RawData = data_handler.RawData()

In [2]:
# Build and persist the four processed variants of the dataset. Every variant
# fixes data types and uses 'm2' as the area unit; they differ in whether the
# data is standardized and whether rows with missing values are removed.

# 1) Not standardized; `remove_na_rows` left at its default.
processed_data: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=True,
    standarize=False,
    area_units='m2',
)
processed_data.save_data()

# 2) Not standardized, rows with missing values removed.
processed_data_nona: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=True,
    standarize=False,
    area_units='m2',
    remove_na_rows=True,
)
processed_data_nona.save_data(ext='nona')

# 3) Standardized; `remove_na_rows` left at its default.
processed_data_standarized: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=True,
    standarize=True,
    area_units='m2',
)
processed_data_standarized.save_data(ext='standarized')

# 4) Holds the standardized data, with corrected types and unified units
#    (rows with missing values removed).
processed_data_standarized_nona: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=True,
    standarize=True,
    area_units='m2',
    remove_na_rows=True,
)
processed_data_standarized_nona.save_data(ext='standarized_nona')

4 ) Hago feature engineering a partir de un dataframe con datos procesados no estandarizados

4.1 )

Relleno los datos faltantes con el promedio de los datos

In [3]:
# Separate, non-standardized dataset used as the base for feature engineering.
# Type correction is disabled here, unlike the variants built earlier.
engineered_data: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=False,
    standarize=False,
    area_units='m2',
)

# Fill missing entries using the 'mean' strategy of the data handler
# (presumably the mean of the available values -- confirm in src.data_handler).
engineered_data.fill_missing_values(method='mean')

Nuevas features <br><br>
'high_latitude': Basada en 'lat' y 'lon'; reemplaza ambas features por un único booleano: True si la latitud es alta (lat > 0) y False si es baja.<br>
'rooms_per_area': rooms / area<br>
'log_area': log(area)

(Vimos en el punto 1 que siempre que la latitud es alta la longitud es baja).

In [4]:
# Derive the engineered features on the development frame, then drop the
# columns they replace.
# NOTE: this cell is not re-runnable as-is -- once 'lat'/'lon'/'area_units'
# have been dropped, a second execution raises KeyError on 'lat'.
casas_dev = engineered_data.casas_dev  # same object; columns are added in place

# Boolean flag: True where the latitude is positive, False otherwise.
casas_dev['high_latitude'] = np.where(casas_dev['lat'] > 0, True, False)

# Ratio of rooms to area.
casas_dev['rooms_per_area'] = casas_dev['rooms'] / casas_dev['area']

# Natural logarithm of the area.
casas_dev['log_area'] = np.log(casas_dev['area'])

# 'lon' and 'lat' are summarized by 'high_latitude'; 'area_units' is removed as well.
engineered_data.casas_dev = casas_dev.drop(columns=['lon', 'lat', 'area_units'])
engineered_data.casas_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   area            900 non-null    float64
 1   is_house        900 non-null    int64  
 2   has_pool        900 non-null    int64  
 3   age             900 non-null    float64
 4   price           900 non-null    float64
 5   rooms           900 non-null    float64
 6   high_latitude   900 non-null    bool   
 7   rooms_per_area  900 non-null    float64
 8   log_area        900 non-null    float64
dtypes: bool(1), float64(6), int64(2)
memory usage: 57.3 KB


In [5]:
# Split the engineered frame into train/validation (80% train, fixed seed),
# then standardize the numeric columns of each split.
train: pd.DataFrame
validation: pd.DataFrame

# Columns left untouched by the standardization (flags and categorical-like
# columns). 'lat', 'lon' and 'area_units' are listed although an earlier cell
# drops them from the frame.
non_standardized_columns = ('lat', 'lon', 'is_house', 'has_pool', 'area_units', 'high_latitude')

train, validation = data_handler.get_train_and_validation_sets(
    engineered_data.casas_dev,
    train_fraction=0.8,
    seed=42,
)

# NOTE(review): each split is standardized by its own call; if the helper
# derives mean/std from the frame it receives, validation is scaled with its
# own statistics rather than the training ones -- confirm in src.preprocessing.
train_standarized: pd.DataFrame = prepro.standarize_numeric_columns(
    train,
    excluded_columns=set(non_standardized_columns),
)
validation_standarized: pd.DataFrame = prepro.standarize_numeric_columns(
    validation,
    excluded_columns=set(non_standardized_columns),
)

4.2 )

In [6]:
# Fit one linear regression (pseudo-inverse solution) per feature subset and
# report its errors on the standardized training set.
# A plain string selects a single column (1-D array after .to_numpy());
# a list selects several columns (2-D array).
feature_subsets = [
    'area',
    'rooms',
    'age',
    'high_latitude',
    'rooms_per_area',
    'log_area',
    ['area', 'high_latitude'],
    ['log_area', 'high_latitude'],
    ['rooms_per_area', 'high_latitude'],
    ['area', 'rooms', 'age', 'high_latitude', 'rooms_per_area', 'log_area'],
]

for caracts in feature_subsets:
    lin : LinealReg = LinealReg(train_standarized[caracts].to_numpy(), train_standarized['price'].to_numpy())
    lin.fit_pseudo_inverse()

    # Build the quoted label separately: nesting a same-quote f-string with a
    # backslash inside a replacement field only parses on Python >= 3.12.
    label = f'"{caracts}"'

    # ':3f' is a minimum width of 3 with the default 6 decimals (not '.3f');
    # kept as-is so the printed table stays unchanged.
    print(
        f" {label:75}",
        f"|   Error Cuadrático Medio: {lin.error_cuadratico_medio():3f}",
        f"|   Error Least Square: {lin.error_least_squares_function():3f}",
    )

 "area"                                                                      |   Error Cuadrático Medio: 0.875800 |   Error Least Square: 630.575901
 "rooms"                                                                     |   Error Cuadrático Medio: 0.884956 |   Error Least Square: 637.168002
 "age"                                                                       |   Error Cuadrático Medio: 0.969342 |   Error Least Square: 697.926049
 "high_latitude"                                                             |   Error Cuadrático Medio: 0.325203 |   Error Least Square: 234.146027
 "rooms_per_area"                                                            |   Error Cuadrático Medio: 0.998547 |   Error Least Square: 718.954059
 "log_area"                                                                  |   Error Cuadrático Medio: 0.878543 |   Error Least Square: 632.551070
 "['area', 'high_latitude']"                                                 |   Error Cuadrático Medio: 0

4.3 )

In [7]:
# Expand the engineered frame with integer powers (2..61) of the continuous
# features, to build a deliberately over-parameterized model.
# The resulting raw values span a huge range, so the columns are badly scaled
# before standardization.
MAX_POWER = 61
POWER_BASE_FEATURES = ['area', 'age', 'rooms', 'rooms_per_area', 'log_area']

# Maps each new column name (e.g. 'area^2') to its values (one Series per column).
new_columns : dict[str, pd.Series] = {}
# Names of all numeric features: the original ones followed by every power column.
numeric_features = ['area', 'rooms', 'age', 'high_latitude', 'rooms_per_area', 'log_area']

for i in range(2, MAX_POWER + 1):
    for base_feature in POWER_BASE_FEATURES:
        column_name = f'{base_feature}^{i}'
        numeric_features.append(column_name)
        new_columns[column_name] = engineered_data.casas_dev[base_feature] ** i

# Drop power columns left over from a previous execution of this cell, so that
# re-running it replaces them instead of appending duplicate column names.
base_frame = engineered_data.casas_dev.drop(columns=list(new_columns), errors='ignore')

# Single concat instead of inserting columns one by one.
engineered_data.casas_dev = pd.concat([base_frame, pd.DataFrame(new_columns)], axis=1)
engineered_data.casas_dev.head()

Unnamed: 0,area,is_house,has_pool,age,price,rooms,high_latitude,rooms_per_area,log_area,area^2,...,area^60,age^60,rooms^60,rooms_per_area^60,log_area^60,area^61,age^61,rooms^61,rooms_per_area^61,log_area^61
0,59.0,1,0,18.0,546.0,2.0,True,0.033898,4.077537,3481.0,...,1.782874e+106,2.071812e+75,1.152922e+18,6.466645e-89,4.206079e+36,1.051896e+108,3.729261e+76,2.305843e+18,2.192083e-90,1.7150449999999999e+37
1,102.0,0,0,9.0,759.0,3.0,True,0.029412,4.624973,10404.0,...,3.281031e+120,1.79701e+57,4.239116e+28,1.292007e-92,8.064009999999999e+39,3.346651e+122,1.617309e+58,1.2717349999999998e+29,3.800022e-94,3.7295829999999997e+40
2,62.0,0,0,7.0,464.0,2.0,True,0.032258,4.127134,3844.0,...,3.4954359999999997e+107,5.080219e+50,1.152922e+18,3.298362e-90,8.687936e+36,2.16717e+109,3.5561529999999996e+51,2.305843e+18,1.063988e-91,3.585628e+37
3,127.0,1,1,16.0,251.0,4.0,False,0.031496,4.844187,16129.0,...,1.69131e+126,1.7668469999999999e+72,1.329228e+36,7.859161999999999e-91,1.2979399999999999e+41,2.147964e+128,2.826955e+73,5.316912e+36,2.475327e-92,6.2874619999999996e+41
4,128.0,0,0,8.0,963.0,4.0,True,0.03125,4.85203,16384.0,...,2.707685e+126,1.532496e+54,1.329228e+36,4.9090929999999997e-91,1.430244e+41,3.465837e+128,1.225996e+55,5.316912e+36,1.534092e-92,6.9395859999999995e+41


In [8]:
# Re-split the frame (now including the polynomial columns) into
# train/validation with the same fraction and seed as before, then
# standardize the numeric columns of each split.
train: pd.DataFrame
validation: pd.DataFrame

# Columns left untouched by the standardization. 'lat', 'lon' and 'area_units'
# are listed although an earlier cell drops them from the frame.
non_standardized_columns = ('lat', 'lon', 'is_house', 'has_pool', 'area_units', 'high_latitude')

train, validation = data_handler.get_train_and_validation_sets(
    engineered_data.casas_dev,
    train_fraction=0.8,
    seed=42,
)

# NOTE(review): each split is standardized by its own call; if the helper
# derives mean/std from the frame it receives, validation is scaled with its
# own statistics rather than the training ones -- confirm in src.preprocessing.
train_standarized: pd.DataFrame = prepro.standarize_numeric_columns(
    train,
    excluded_columns=set(non_standardized_columns),
)
validation_standarized: pd.DataFrame = prepro.standarize_numeric_columns(
    validation,
    excluded_columns=set(non_standardized_columns),
)

In [None]:
# Fit the full polynomial model twice -- closed-form pseudo-inverse and
# gradient descent -- and report the mean squared error (ECM) of each on the
# training and validation sets.

def _features_and_target(frame: pd.DataFrame):
    """Return (X, y) as fresh NumPy arrays: every column except 'price', and 'price'."""
    return frame.drop(columns='price').to_numpy(), frame['price'].to_numpy()


# (report title, fitting routine) pairs, evaluated in this order.
fit_strategies = (
    ("PSEUDOINVERSA", lambda model: model.fit_pseudo_inverse()),
    # tolerance=-1: presumably disables early stopping so all 10000 steps run
    # -- confirm in LinealReg.fit_gradient_descent.
    ("GRADIENTE DESCENDENTE", lambda model: model.fit_gradient_descent(step_size=0.000005, tolerance=-1, max_number_of_steps=10000)),
)

for report_title, run_fit in fit_strategies:
    # A new model (with newly extracted arrays) per strategy.
    x_train, y_train = _features_and_target(train_standarized)
    lin : LinealReg = LinealReg(x_train, y_train)
    run_fit(lin)

    print(report_title)
    print("  ECM (train set)      : ", lin.error_cuadratico_medio())
    x_validation, y_validation = _features_and_target(validation_standarized)
    print("  ECM (validation set) : ", lin.error_cuadratico_medio(validation_set_x=x_validation, validation_set_y=y_validation))

PSEUDOINVERSA
  ECM (train set)      :  163.62337658262462
  ECM (validation set) :  1322793643410.2915
GRADIENTE DESCENDENTE
  ECM (train set)      :  0.11797847775720835
  ECM (validation set) :  0.12828815271897054


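Los resultados anteriores muestran una gran discrepancia entre el error en el set de entrenamiento y el error en el set de validación para el modelo ajustado con la pseudoinversa, lo que sugiere que no generaliza (overfitting). Además, como 'price' no está entre las columnas excluidas de la estandarización, predecir siempre 0 daría un ECM de entrenamiento cercano a 1; un ECM de entrenamiento de ~163 indica entonces que la solución obtenida no es el mínimo de cuadrados mínimos, probablemente por la inestabilidad numérica de calcular la pseudoinversa con potencias de hasta grado 61.<br>
En el caso del modelo ajustado con gradiente descendente se logra una mejor generalización con los parámetros utilizados, pero a mayor costo computacional.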