# Buenos Aires Housing Data

In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder, OrdinalEncoder

# I. Wrangle Data

In [None]:
def wrangle(filepath, max_price=750_000):
    """Load a real-estate listings CSV and return a cleaned DataFrame.

    Steps performed:

    1. Read the CSV with ``created_on`` parsed as dates and used as the index.
    2. Split the ``lat-lon`` string column into float ``lat`` / ``long`` columns.
    3. Derive ``neighborhood`` as the second-to-last ``|``-separated piece of
       ``place_with_parent_names``.
    4. Drop the raw source columns, the alternative price columns (all other
       price columns besides ``price_aprox_usd``) and the free-text / URL columns.
    5. Drop rows with no ``price_aprox_usd`` and rows priced at or above
       ``max_price``.

    Parameters
    ----------
    filepath : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.
    max_price : float, default 750_000
        Exclusive upper bound on ``price_aprox_usd``; rows at or above it
        are removed.

    Returns
    -------
    pandas.DataFrame
        Cleaned listings indexed by ``created_on``.
    """
    df = pd.read_csv(filepath,
                     parse_dates=['created_on'],
                     index_col='created_on')

    # Unpack lat-long
    df[['lat', 'long']] = df['lat-lon'].str.split(',', expand=True).astype(float)

    # Create neighborhood. The `.str[-2]` accessor yields NaN for rows whose
    # place name is missing or has fewer than two pieces, where indexing the
    # split result inside a Python lambda would raise instead.
    df['neighborhood'] = df['place_with_parent_names'].str.split('|').str[-2]

    df = df.drop(columns=['operation', 'lat-lon', 'price',
                          'currency', 'price_aprox_local_currency',
                          'price_usd_per_m2', 'price_per_m2',
                          'properati_url', 'description', 'title',
                          'image_thumbnail', 'place_with_parent_names'])

    # Drop rows with no target
    df = df.dropna(axis=0, subset=['price_aprox_usd'])

    # Keep only listings strictly below the price cut-off.
    return df[df['price_aprox_usd'] < max_price]

In [None]:
# Load and clean the first Buenos Aires listings file; `df` is the working
# DataFrame used by every later cell.
df = wrangle('data/argentina-real-estate/buenos-aires-1.csv')

In [None]:
# Plot the distribution of the target (approximate price in USD) after cleaning.
df['price_aprox_usd'].hist()

# II. Split Data

In [None]:
# Separate the label column from the feature matrix.
target = 'price_aprox_usd'
X = df.drop(columns=[target])
y = df[target]

In [None]:
# Time-based split on the index year: listings from 2015 onwards form the
# test set, everything earlier is used for training.
mask = df.index.year >= 2015
is_train = ~mask

X_train, X_test = X.loc[is_train], X.loc[mask]
y_train, y_test = y.loc[is_train], y.loc[mask]

# III. Establish Baseline

In [None]:
# Naive baseline: predict the mean training price for every training row.
baseline_price = y_train.mean()
y_pred = [baseline_price] * len(y_train)

print('Mean property price:', baseline_price)
print('Baseline MAE:', mean_absolute_error(y_train, y_pred))

# IV. Build Model

In [None]:
# Pipeline stages, in order: one-hot encode categoricals (columns named after
# their category values), fill missing values with the column median, then
# fit a ridge regression.
encoder = OneHotEncoder(use_cat_names=True)
imputer = SimpleImputer(strategy='median')
regressor = Ridge(alpha=40)

model = make_pipeline(encoder, imputer, regressor)

# Trailing semicolon suppresses the notebook's echo of the fitted pipeline.
model.fit(X_train, y_train);

# V. Check Metrics

In [None]:
# Report mean absolute error on the training rows, then on the held-out rows.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print('Training MAE:', train_mae)

test_mae = mean_absolute_error(y_test, model.predict(X_test))
print('Test MAE:', test_mae)