In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import src.preprocessing as prepro
import src.data_handler as data_handler
from src.models import LinealReg

# Handle to the unprocessed dataset, built with RawData's default arguments.
# NOTE(review): presumably this loads the raw files — confirm in src.data_handler.
raw_data: data_handler.RawData = data_handler.RawData()

In [2]:
# Build and persist four processed-data variants. All of them are created with
# correct_data_types=True and area_units='m2'; they differ only in the
# `standarize` and `remove_na_rows` flags and in the suffix passed to save_data().

# Variant 1: standarize=False, default NA handling, default save name.
processed_data: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=True,
    standarize=False,
    area_units='m2',
)
processed_data.save_data()

# Variant 2: standarize=False, remove_na_rows=True, saved with ext='nona'.
processed_data_nona: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=True,
    standarize=False,
    area_units='m2',
    remove_na_rows=True,
)
processed_data_nona.save_data(ext='nona')

# Variant 3: standarize=True, default NA handling, saved with ext='standarized'.
processed_data_standarized: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=True,
    standarize=True,
    area_units='m2',
)
processed_data_standarized.save_data(ext='standarized')

# Variant 4: holds the standardized data, with corrected types and unified
# units; additionally built with remove_na_rows=True.
processed_data_standarized_nona: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=True,
    standarize=True,
    area_units='m2',
    remove_na_rows=True,
)
processed_data_standarized_nona.save_data(ext='standarized_nona')

4 ) Hago feature engineering a partir de un dataframe con datos procesados no estandarizados

4.1 )

Relleno los datos faltantes con el promedio de los datos

In [3]:
# Working copy used for feature engineering: not standardized, and (unlike the
# variants saved earlier) created with correct_data_types=False.
engineered_data: data_handler.ProcessedData = data_handler.ProcessedData(
    correct_data_types=False,
    standarize=False,
    area_units='m2',
)

# Fill missing values using the 'mean' strategy of the data handler.
# NOTE(review): this runs before the train/validation split further down, so
# the imputed values are presumably derived from rows that later land in the
# validation set — confirm whether that leakage is acceptable here.
engineered_data.fill_missing_values(method='mean')

Nueva feature 'high_latitude': basada en 'lat' y 'lon', reemplazo ambas features por una única columna booleana que vale True cuando la latitud es alta (lat > 0) y False cuando la latitud es baja.

(Vimos en el punto 1 que siempre que la latitud es alta la longitud es baja).

In [4]:
# Collapse the two coordinate columns into a single boolean flag: True where
# 'lat' is greater than 0, False otherwise.
# Note: this cell reads 'lat' and then drops it, so running it a second time on
# the same frame raises a KeyError.
is_high_latitude = engineered_data.casas_dev['lat'].gt(0).to_numpy()
engineered_data.casas_dev['high_latitude'] = is_high_latitude

# The raw coordinates are no longer needed once the flag exists.
engineered_data.casas_dev = engineered_data.casas_dev.drop(columns=['lon', 'lat'])

# Show the resulting schema (column names, non-null counts, dtypes).
engineered_data.casas_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   area           900 non-null    float64
 1   area_units     900 non-null    object 
 2   is_house       900 non-null    int64  
 3   has_pool       900 non-null    int64  
 4   age            900 non-null    float64
 5   price          900 non-null    float64
 6   rooms          900 non-null    float64
 7   high_latitude  900 non-null    bool   
dtypes: bool(1), float64(4), int64(2), object(1)
memory usage: 50.2+ KB


In [5]:
# Split the engineered development set into train and validation subsets
# (train_fraction=0.8, fixed seed so the split is reproducible).
train: pd.DataFrame
validation: pd.DataFrame
train, validation = data_handler.get_train_and_validation_sets(
    engineered_data.casas_dev,
    train_fraction=0.8,
    seed=42,
)

# Standardized copies of each split.
# NOTE(review): 'lat' and 'lon' were dropped from casas_dev in an earlier cell,
# so this exclusion set looks stale — confirm whether 'high_latitude' should be
# excluded instead.
train_standarized: pd.DataFrame = prepro.standarize_numeric_columns(
    train, excluded_columns={'lat', 'lon'}
)
# Fix: standardize the validation split itself. The previous code passed
# `train` here, which made `validation_standarized` a second copy of the
# standardized training data.
# NOTE(review): if validation should be scaled with the *training* statistics
# (train_means / train_stds below), this helper call needs to change — confirm
# against prepro.standarize_numeric_columns.
validation_standarized: pd.DataFrame = prepro.standarize_numeric_columns(
    validation, excluded_columns={'lat', 'lon'}
)

# Per-feature mean and standard deviation of the non-standardized splits.
other_features = ['area', 'price', 'age', 'rooms', 'high_latitude']
train_means: dict[str, float] = {f: train[f].mean() for f in other_features}
train_stds: dict[str, float] = {f: train[f].std() for f in other_features}
validation_means: dict[str, float] = {f: validation[f].mean() for f in other_features}
validation_stds: dict[str, float] = {f: validation[f].std() for f in other_features}

4.2 )

In [6]:
# Fit one two-feature linear regression per candidate feature, always paired
# with 'area', and print its error metrics as computed by the model object.
lin: LinealReg
for caract in ('rooms', 'age', 'high_latitude'):
    design_matrix = train_standarized[['area', caract]].to_numpy()
    target = train_standarized['price'].to_numpy()

    lin = LinealReg(design_matrix, target)
    lin.fit_pseudo_inverse()

    # Build the quoted feature label outside the f-string so the format
    # expression does not need nested quotes or backslashes.
    quoted_name = f'"{caract}"'

    # Metrics are requested in the same order as they are printed.
    mse = lin.error_cuadratico_medio()
    least_squares = lin.error_least_squares_function()

    # NOTE(review): the format spec `3f` means "minimum width 3, default
    # precision", not three decimals — confirm whether `.3f` was intended.
    print(
        f' "area", {quoted_name:15}',
        f"|   Error Cuadrático Medio: {mse:3f}",
        f"|   Error Least Square: {least_squares:3f}",
    )


0.8734069833552368
 "area", "rooms"         |   Error Cuadrático Medio: 0.873407 |   Error Least Square: 628.853028
0.8477089758817278
 "area", "age"           |   Error Cuadrático Medio: 0.847709 |   Error Least Square: 610.350463
0.1371156243122655
 "area", "high_latitude" |   Error Cuadrático Medio: 0.137116 |   Error Least Square: 98.723250


In [7]:
# Polynomial feature expansion: for each base feature add the columns
# '<feature>^2' ... '<feature>^MAX_POLY_DEGREE'.
MAX_POLY_DEGREE = 101  # highest exponent generated (inclusive)
POLY_BASE_FEATURES = ('area', 'age', 'rooms')

# Maps new column name -> powered column. (This is a dict, not a set: `{}` is
# an empty dict literal and the loop below assigns by key.)
new_columns: dict[str, pd.Series] = {}
for i in range(2, MAX_POLY_DEGREE + 1):
    # Same per-degree column order as before: area, age, rooms.
    for feature in POLY_BASE_FEATURES:
        new_columns[f'{feature}^{i}'] = engineered_data.casas_dev[feature] ** i

# Drop any polynomial columns left over from a previous run of this cell so
# that re-running it does not append duplicate column names. On a first run
# nothing matches and the frame is unchanged.
base_frame = engineered_data.casas_dev.drop(columns=list(new_columns), errors='ignore')

# Build all new columns first and concatenate once, rather than inserting
# them into the frame one at a time.
engineered_data.casas_dev = pd.concat([base_frame, pd.DataFrame(new_columns)], axis=1)
engineered_data.casas_dev.head()

Unnamed: 0,area,area_units,is_house,has_pool,age,price,rooms,high_latitude,area^2,age^2,...,rooms^98,area^99,age^99,rooms^99,area^100,age^100,rooms^100,area^101,age^101,rooms^101
0,59.0,m2,1,0,18.0,546.0,2.0,True,3481.0,324.0,...,3.169127e+29,2.062287e+175,1.8705869999999999e+124,6.3382529999999996e+29,1.216749e+177,3.367057e+125,1.267651e+30,7.178822000000001e+178,6.060703e+126,2.535301e+30
1,102.0,m2,0,0,9.0,759.0,3.0,True,10404.0,81.0,...,5.726417000000001e+46,7.102594000000001e+198,2.951267e+94,1.717925e+47,7.244646e+200,2.65614e+95,5.153775e+47,7.3895389999999994e+202,2.390526e+96,1.546133e+48
2,62.0,m2,0,0,7.0,464.0,2.0,True,3844.0,49.0,...,3.169127e+29,2.7975459999999997e+177,4.620681e+83,6.3382529999999996e+29,1.734479e+179,3.234477e+84,1.267651e+30,1.075377e+181,2.2641339999999998e+85,2.535301e+30
3,127.0,m2,1,1,16.0,251.0,4.0,False,16129.0,256.0,...,1.004336e+59,1.8904639999999997e+208,1.6139060000000001e+119,4.017345e+59,2.400889e+210,2.58225e+120,1.606938e+60,3.049129e+212,4.1316e+121,6.427752e+60
4,128.0,m2,0,0,8.0,963.0,4.0,True,16384.0,64.0,...,1.004336e+59,4.1094809999999995e+208,2.546295e+89,4.017345e+59,5.260136e+210,2.037036e+90,1.606938e+60,6.732974e+212,1.629629e+91,6.427752e+60
