In [None]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import wqet_grader
from IPython.display import VimeoVideo
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.utils.validation import check_is_fitted

warnings.simplefilter(action="ignore", category=FutureWarning)
wqet_grader.init("Project 2 Assessment")

In [None]:
def wrangle(filepath):
    """Load a real-estate CSV and return the cleaned subset used for modelling.

    Rows are kept only if they are apartments whose
    ``place_with_parent_names`` contains "Capital Federal" and whose
    approximate USD price is below 400,000. From that subset, rows whose
    covered surface falls outside the 10th-90th percentile range are dropped.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with the original columns and index labels.
    """
    raw = pd.read_csv(filepath)

    # Combine the three row-level criteria into a single boolean mask.
    keep = (
        raw["place_with_parent_names"].str.contains("Capital Federal")
        & (raw["property_type"] == "apartment")
        & (raw["price_aprox_usd"] < 400_000)
    )
    subset = raw[keep]

    # Trim area outliers. The percentiles are computed on the already
    # filtered subset, not on the raw file.
    area = subset["surface_covered_in_m2"]
    bounds = area.quantile([0.1, 0.9])
    lower, upper = bounds.iloc[0], bounds.iloc[1]
    return subset[area.between(lower, upper)]

In [None]:
# Build the working DataFrame from the first Buenos Aires CSV, report its
# dimensions, and preview the first rows.
df = wrangle("data/buenos-aires-real-estate-1.csv")
print("df shape:", df.shape)
df.head()

In [None]:
# Check your work: the wrangled frame may hold at most 8606 rows.
# NOTE(review): the next cell asserts a stricter bound (1781), which makes this
# check redundant — presumably a leftover from an earlier wrangle stage; confirm.
assert (
    len(df) <= 8606
), f"`df` should have no more than 8606 observations, not {len(df)}."

In [None]:
# Check your work: the wrangled frame may hold at most 1781 rows.
assert (
    len(df) <= 1781
), f"`df` should have no more than 1781 observations, not {len(df)}."

In [None]:
# Histogram of covered surface area for the wrangled apartments.
plt.hist(df["surface_covered_in_m2"])
plt.xlabel("Area [sq meters]")
plt.ylabel("Count")  # plt.hist plots raw counts by default
plt.title("Distribution of Apartment Sizes");  # trailing ";" hides the Text repr

In [None]:
# Summary statistics for the area column only. Describing the single Series
# avoids computing statistics for every numeric column of the frame.
df["surface_covered_in_m2"].describe()

In [None]:
# Scatter plot of approximate price against covered area.
plt.scatter(df["surface_covered_in_m2"], df["price_aprox_usd"])
plt.title("Buenos Aires: price vs Area")
plt.ylabel("Price [USD]")
plt.xlabel("Area [sq meters]");

In [None]:
# Feature matrix: selecting with a list of column names yields a DataFrame
# (2-D), even though only one column is used.
features = ["surface_covered_in_m2"]
X_train = df[features]
X_train.head()

In [None]:
# Target vector: selecting with a single column name yields a Series (1-D).
target = "price_aprox_usd"
y_train = df[target]
y_train.shape

In [None]:
# Mean of the training target; reused below as the constant baseline prediction.
y_mean = y_train.mean()
y_mean

In [None]:
# Baseline model: predict the mean price for every training observation.
y_pred_baseline = [y_mean for _ in range(len(y_train))]

In [None]:
# Peek at the first five rows of the feature matrix as a NumPy array.
X_train.to_numpy()[:5]

In [None]:
# Overlay the constant baseline prediction on the training data.
plt.plot(X_train.values, y_pred_baseline, label="Baseline Model", color="orange")
plt.scatter(X_train, y_train)
plt.title("Buenos Aires: Price vs. Area")
plt.ylabel("Price [USD]")
plt.xlabel("Area [sq meters]")
plt.legend();

In [None]:
# Mean absolute error of the constant-mean baseline on the training set.
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

# Report both figures rounded to two decimals.
print("Mean apt price", round(y_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))

In [None]:
# Instantiate the linear regression estimator with its default settings.
model = LinearRegression()

In [None]:
# Fit the model to the training features and target.
model.fit(X_train, y_train)

In [None]:
# Predict prices for the training features and show the first five predictions.
y_pred_training = model.predict(X_train)
y_pred_training[:5]

In [None]:
# Mean absolute error of the fitted model on the training set, for comparison
# with the baseline MAE printed earlier.
mae_training = mean_absolute_error(y_train, y_pred_training)
print("Training MAE:", round(mae_training, 2))

In [None]:
# Load the test features, keeping only the columns the model was trained on,
# then wrap the model's predictions in a Series and preview them.
X_test = pd.read_csv("data/buenos-aires-test-features.csv")[features]
y_pred_test = pd.Series(model.predict(X_test))
y_pred_test.head()

In [None]:
# Intercept of the fitted line, rounded to two decimals for display.
intercept = round(model.intercept_, 2)
print("Model Intercept:", intercept)
# The rounded value must be a plain number (int or float).
assert isinstance(intercept, (int, float))

In [None]:
# Slope for the single feature, rounded to two decimals for display.
coefficient = round(model.coef_[0], 2)
print('Model coefficient for "surface_covered_in_m2":', coefficient)
# The rounded value must be a plain number (int or float).
assert isinstance(coefficient, (int, float))

In [None]:
# Print the fitted model as an equation using the rounded intercept and slope.
print(f"apt_price = {intercept} + {coefficient} * surface_covered")

In [None]:
# Plot the fitted regression line over the training data.
plt.plot(X_train.values, model.predict(X_train), color="magenta", label="Linear Model")
plt.scatter(X_train, y_train)
plt.xlabel("surface covered [sq meters]")
plt.ylabel("price [usd]")
# Title added so the figure stands alone; same title as the baseline plot.
plt.title("Buenos Aires: Price vs. Area")
plt.legend();