In [32]:
import warnings
from glob import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.utils.validation import check_is_fitted
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder


warnings.simplefilter(action='ignore', category=FutureWarning)

In [33]:
def wrangle(filepath):
    #read csv into dataframe
    df = pd.read_csv(filepath)

    #subset tp properties in capital federal
    mask_ba = df['place_with_parent_names'].str.contains('Capital Federal')
    df =df[mask_ba]
    #subsets to 'apartments'
    mask_apt = df['property_type'] == 'apartment'

    #subbset price_approx usd to less than 400k
    mask_price = df['price_aprox_usd'] < 400000
    df = df[mask_ba & mask_apt & mask_price]
    # Subset Removing outliers on surface area
    low, high =df['surface_covered_in_m2'].quantile([0.1, 0.9])
    mask_area = df['surface_covered_in_m2'].between(low, high)
    df = df[mask_area]
    
    #split lat-lon column
    df[['lat', 'lon']] = df['lat-lon'].str.split(',', expand=True).astype(float)
    df.drop(columns=['lat-lon'], inplace=True)

    #extract neigbourhood from the parent_with_place_names

    df['neighborhood'] = df['place_with_parent_names'].str.split('|', expand=True)[3]
    df.drop(columns='place_with_parent_names', inplace=True)
    return df
    

In [34]:
#glob helps to import mutiple files at the same time

files = glob('data/buenos-aires-real-estate-*.csv')
files

['data\\buenos-aires-real-estate-1.csv',
 'data\\buenos-aires-real-estate-2.csv',
 'data\\buenos-aires-real-estate-3.csv',
 'data\\buenos-aires-real-estate-4.csv',
 'data\\buenos-aires-real-estate-5.csv']

In [41]:
# Check your work
assert len(files) == 5, f"`files` should contain 5 items, not {len(files)}"

In [35]:
#The next step is to read each of the CSVs in files into a DataFrame, and put all of those DataFrames into a list. What's a good way to iterate through files so we can do this? A for loop!
#Use your wrangle function in a for loop to create a list named frames. The list should the cleaned DataFrames created from the CSV filenames your collected in files.

frames = []
for file in files:
    df = wrangle(file)
    frames.append(df)



  df = df[mask_ba & mask_apt & mask_price]
  df = df[mask_ba & mask_apt & mask_price]
  df = df[mask_ba & mask_apt & mask_price]
  df = df[mask_ba & mask_apt & mask_price]
  df = df[mask_ba & mask_apt & mask_price]


In [24]:
len(frames)
frames[4].head()

Unnamed: 0,operation,property_type,place_with_parent_names,price,currency,price_aprox_local_currency,price_aprox_usd,surface_total_in_m2,surface_covered_in_m2,price_usd_per_m2,price_per_m2,floor,rooms,expenses,properati_url,lat,lon
3,sell,apartment,|Argentina|Capital Federal|Belgrano|,112000.0,USD,1698188.8,112000.0,,60.0,,1866.666667,1.0,2.0,,http://belgrano.properati.com.ar/116pr_venta_d...,-34.564676,-58.45572
9,sell,apartment,|Argentina|Capital Federal|Caballito|,76500.0,USD,1159923.6,76500.0,0.0,36.0,,2125.0,,2.0,1000.0,http://caballito.properati.com.ar/12ksf_venta_...,-34.61883,-58.437779
18,sell,apartment,|Argentina|Capital Federal|Belgrano|,90000.0,USD,1364616.0,90000.0,100.0,100.0,900.0,900.0,,3.0,,http://belgrano.properati.com.ar/zouu_venta_de...,-34.577168,-58.538654
20,sell,apartment,|Argentina|Capital Federal|Colegiales|,159900.0,USD,2424467.76,159900.0,82.0,77.0,1950.0,2076.623377,,3.0,,http://colegiales.properati.com.ar/zpcd_venta_...,-34.571526,-58.455637
21,sell,apartment,|Argentina|Capital Federal|Colegiales|,123875.0,USD,1878242.3,123875.0,79.0,50.0,1568.037975,2477.5,,2.0,,http://colegiales.properati.com.ar/12sbf_venta...,-34.578688,-58.457358


In [36]:
#concatinating the dataframes

df = pd.concat(frames, ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6582 entries, 0 to 6581
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   operation                   6582 non-null   object 
 1   property_type               6582 non-null   object 
 2   price                       6582 non-null   float64
 3   currency                    6582 non-null   object 
 4   price_aprox_local_currency  6582 non-null   float64
 5   price_aprox_usd             6582 non-null   float64
 6   surface_total_in_m2         4752 non-null   float64
 7   surface_covered_in_m2       6582 non-null   float64
 8   price_usd_per_m2            4536 non-null   float64
 9   price_per_m2                6582 non-null   float64
 10  floor                       1900 non-null   float64
 11  rooms                       5286 non-null   float64
 12  expenses                    1739 non-null   float64
 13  properati_url               6582 

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6582 entries, 0 to 6581
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   operation                   6582 non-null   object 
 1   property_type               6582 non-null   object 
 2   price                       6582 non-null   float64
 3   currency                    6582 non-null   object 
 4   price_aprox_local_currency  6582 non-null   float64
 5   price_aprox_usd             6582 non-null   float64
 6   surface_total_in_m2         4752 non-null   float64
 7   surface_covered_in_m2       6582 non-null   float64
 8   price_usd_per_m2            4536 non-null   float64
 9   price_per_m2                6582 non-null   float64
 10  floor                       1900 non-null   float64
 11  rooms                       5286 non-null   float64
 12  expenses                    1739 non-null   float64
 13  properati_url               6582 

EXPLORING

create a Neigbourhood feature


Modify your wrangle function to create a new feature "neighborhood". You can find the neighborhood for each property in the "place_with_parent_names" column. For example, a property with the place name "|Argentina|Capital Federal|Palermo|" is located in the neighborhood is "Palermo". Also, your function should drop the "place_with_parent_names" column.

In [45]:
#df['place_with_parent_names'].head()

SPLIT

In [37]:
#we now split our data into feature matrix(X) and target vector(y)
target = 'price_aprox_usd'
features = ['neighborhood']
y_train = df[target]
X_train = df[features]
 


In [52]:
# Check your work
assert X_train.shape == (6582, 1), f"`X_train` is the wrong size: {X_train.shape}."
assert y_train.shape == (6582,), f"`y_train` is the wrong size: {y_train.shape}."

Build Model

In [16]:
#Calculate the baseline mean absolute error for your model.

In [38]:
y_mean = [y_train.mean()]
y_pred_baseline = y_mean * len(y_train)
print('Mean apartment price:', y_mean)
print('Baseline MAE:', mean_absolute_error(y_train, y_pred_baseline))

Mean apartment price: [np.float64(132383.83701458524)]
Baseline MAE: 44860.10834274133


Iterate


In [19]:
#onehotencoder looks onto all the values, identify the unique values and creates a new columns for them
#First, instantiate a OneHotEncoder named ohe. Make sure to set the use_cat_names argument to True. Next, fit your transformer to the feature matrix X_train. Finally, use your encoder to transform the feature matrix X_train, and assign the transformed data to the variable XT_train.


In [39]:
#instatiate the encoder
ohe = OneHotEncoder(use_cat_names=True)
#fit the encoder
ohe.fit(X_train)

#transform
XT_train = ohe.transform(X_train)
print(XT_train.shape)
XT_train.head()

(6582, 57)


Unnamed: 0,neighborhood_Chacarita,neighborhood_Villa Luro,neighborhood_Caballito,neighborhood_Constitución,neighborhood_Once,neighborhood_Almagro,neighborhood_Palermo,neighborhood_Flores,neighborhood_Belgrano,neighborhood_Liniers,...,neighborhood_Puerto Madero,neighborhood_Agronomía,neighborhood_Monte Castro,neighborhood_Tribunales,neighborhood_Villa Santa Rita,neighborhood_Velez Sarsfield,neighborhood_Villa Soldati,neighborhood_Villa Real,neighborhood_Pompeya,neighborhood_Catalinas
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Check your work
assert XT_train.shape == (6582, 57), f"`XT_train` is the wrong shape: {XT_train.shape}"

build model

In [50]:
model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    Ridge()
)
model.fit(X_train, y_train)

In [19]:
# Check your work
check_is_fitted(model[-1])

evaluate the model

In [41]:
#Training mean absolute error
y_pred_training = model.predict(X_train)
mae_training = mean_absolute_error(y_train, y_pred_training)
print('Training MAE:', round(mae_training, 2))

Training MAE: 39350.22


In [53]:

X_test = pd.read_csv("data/buenos-aires-test-features.csv")#[features]
#y_pred_test = pd.Series(model.predict(X_test))
#y_pred_test.head()
X_test



Unnamed: 0,surface_covered_in_m2,lat,lon,neighborhood
0,47,-34.615310,-58.361983,Puerto Madero
1,55,-34.547298,-58.462705,Nuñez
2,34,-34.617762,-58.383662,Monserrat
3,34,-34.639925,-58.438406,Parque Chacabuco
4,42,-34.611495,-58.442359,Caballito
...,...,...,...,...
1481,37,-34.568937,-58.502755,Villa Urquiza
1482,75,-34.595063,-58.447459,Villa Crespo
1483,60,-34.616517,-58.367498,Monserrat
1484,44,-34.595646,-58.427610,Palermo


Communicate the results


In [45]:
intercept = model.named_steps['ridge'].intercept_
coefficients = model.named_steps['ridge'].coef_
print ('coefficient len:', len(coefficients))
print(coefficients[:5])

coefficient len: 57
[-2.89895934e+03 -6.29555347e+00  9.25289088e+03 -4.17487330e+04
 -3.23037446e+03]


In [46]:
# Check your work
assert isinstance(
    intercept, float
), f"`intercept` should be a `float`, not {type(intercept)}."
assert isinstance(
    coefficients, np.ndarray
), f"`coefficients` should be a `float`, not {type(coefficients)}."
assert coefficients.shape == (
    57,
), f"`coefficients` is wrong shape: {coefficients.shape}."

In [47]:
feature_names = model.named_steps['onehotencoder'].get_feature_names()
print("features len:", len(feature_names))
print(feature_names[:5])  # First five feature names

features len: 57
['neighborhood_Chacarita' 'neighborhood_Villa Luro'
 'neighborhood_Caballito' 'neighborhood_Constitución' 'neighborhood_Once']


In [19]:
# Check your work
assert isinstance(
    feature_names, np.ndarray
), f"`features` should be a `list`, not {type(feature_names)}."
assert len(feature_names) == len(
    coefficients
), "You should have the same number of features and coefficients."

In [48]:
#combining the coefficients and the neigborhoods
feat_imp = pd.Series(coefficients, index=feature_names)
feat_imp.head()

neighborhood_Chacarita       -2898.959335
neighborhood_Villa Luro         -6.295553
neighborhood_Caballito        9252.890876
neighborhood_Constitución   -41748.733031
neighborhood_Once            -3230.374461
dtype: float64

In [26]:
# Check your work
assert isinstance(
    feat_imp, pd.Series
), f"`feat_imp` should be a `float`, not {type(feat_imp)}."
assert feat_imp.shape == (57,), f"`feat_imp` is wrong shape: {feat_imp.shape}."
assert all(
    a == b for a, b in zip(sorted(feature_names), sorted(feat_imp.index))
), "The index of `feat_imp` should be identical to `features`."

In [49]:
#print the equation

print(f"price = {intercept.round(2)}")
for f, c in feat_imp.items():
    print(f"+ ({round(c, 2)} * {f})")

price = 118524.65
+ (-2898.96 * neighborhood_Chacarita)
+ (-6.3 * neighborhood_Villa Luro)
+ (9252.89 * neighborhood_Caballito)
+ (-41748.73 * neighborhood_Constitución)
+ (-3230.37 * neighborhood_Once)
+ (2903.34 * neighborhood_Almagro)
+ (45934.41 * neighborhood_Palermo)
+ (-8662.28 * neighborhood_Flores)
+ (46954.21 * neighborhood_Belgrano)
+ (-13729.1 * neighborhood_Liniers)
+ (6277.05 * neighborhood_Villa Crespo)
+ (-10678.63 * neighborhood_San Cristobal)
+ (-7974.66 * neighborhood_Congreso)
+ (14701.16 * neighborhood_Saavedra)
+ (-11172.55 * neighborhood_Balvanera)
+ (-29585.61 * neighborhood_Parque Avellaneda)
+ (72740.78 * neighborhood_Recoleta)
+ (5638.47 * neighborhood_San Telmo)
+ (42831.32 * neighborhood_Nuñez)
+ (55590.93 * neighborhood_Barrio Norte)
+ (-6323.68 * neighborhood_Parque Centenario)
+ (4330.55 * neighborhood_Abasto)
+ (-7905.29 * neighborhood_Centro / Microcentro)
+ (-19370.74 * neighborhood_)
+ (-7108.23 * neighborhood_Paternal)
+ (-21078.78 * neighborhood_Ma

curse of dimentiality

In [30]:
#Scroll up, change the predictor in your model to Ridge, and retrain it. Then evaluate the model's training and test performance. Do you still have an overfitting problem? If not, extract the intercept and coefficients again (you'll need to change your code a little bit) and regenerate the model's equation. Does it look different than before?