# **Predicting Apartment Prices in Mexico City**

In [None]:
# Import libraries here

from glob import glob

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted

## **Prepare Data**

In [None]:
# Build  `wrangle` function

def wrangle(filepath):
    """Load one Mexico City real-estate CSV and return a cleaned DataFrame.

    Cleaning steps, in order:

    1. Keep only rows that are apartments, whose ``place_with_parent_names``
       contains "Distrito Federal", and whose ``price_aprox_usd`` is below
       100,000.
    2. Remove ``surface_covered_in_m2`` outliers by keeping the rows between
       the 10th and 90th percentiles (bounds inclusive) of the filtered data.
    3. Split ``lat-lon`` into float ``lat`` and ``lon`` columns.
    4. Extract ``borough`` from ``place_with_parent_names``.
    5. Drop the high-null, low/high-cardinality, leaky and collinear columns.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows (original index labels kept) with the remaining
        input columns followed by ``lat``, ``lon`` and ``borough``.

    Raises
    ------
    KeyError
        If any of the columns referenced below is missing from the file.
    """
    df = pd.read_csv(filepath)

    # Subset: apartments in Distrito Federal costing less than 100,000 USD.
    # `na=False` makes a missing place name an explicit non-match.
    in_capital = df["place_with_parent_names"].str.contains(
        "Distrito Federal", regex=False, na=False
    )
    is_apartment = df["property_type"] == "apartment"
    is_affordable = df["price_aprox_usd"] < 100000
    # Take an explicit copy: the frame is mutated below, and writing into a
    # filtered slice is what triggers pandas' SettingWithCopyWarning.
    df = df[in_capital & is_affordable & is_apartment].copy()

    # Subset: remove outliers for "surface_covered_in_m2".
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    df = df[df["surface_covered_in_m2"].between(low, high)].copy()

    # Split "lat-lon" into separate float columns.
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)

    # Create the borough column: the second "|"-separated piece of the place
    # string (the first piece is whatever precedes the first "|").
    df["borough"] = df["place_with_parent_names"].str.split("|", expand=True)[1]

    columns_to_drop = [
        # Source columns that have been replaced by lat/lon and borough
        "lat-lon",
        "place_with_parent_names",
        # High-null columns
        "price_usd_per_m2",
        "floor",
        "rooms",
        "expenses",
        # Low- or high-cardinality categorical columns
        "operation",
        "currency",
        "properati_url",
        "property_type",
        # Leaky columns for the target "price_aprox_usd"
        "price",
        "price_aprox_local_currency",
        "price_per_m2",
        # Would create multicollinearity with "surface_covered_in_m2"
        "surface_total_in_m2",
    ]
    return df.drop(columns=columns_to_drop)
    

In [None]:
# Smoke-test `wrangle` on the first file and inspect the cleaned frame
df = wrangle(filepath="data/mexico-city-real-estate-1.csv")
df

In [None]:
# Gather the paths of all training CSVs; sorting keeps the order stable
# between runs.

files = glob("data/mexico-city-real-estate-*.csv")
files.sort()

files

In [None]:
# Combine the wrangle function, a list comprehension, and pd.concat to build
# a single DataFrame `df` from every file in `files`.

df = pd.concat([wrangle(file) for file in files])

# `DataFrame.info()` prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would echo a stray "None".
df.info()
df.head()

## **Explore**

In [None]:
# Plot distribution of price

plt.hist(df["price_aprox_usd"])
# The x-axis carries the plotted variable, which is the price (not the area)
plt.xlabel("Price [USD]")
plt.ylabel("Count")
plt.title("Distribution of Apartment Prices");

In [None]:
# Scatter plot of price against covered area

plt.scatter(df["surface_covered_in_m2"], df["price_aprox_usd"])
plt.ylabel("Price [USD]")
plt.xlabel("Area [sq meters]")
plt.title("Mexico City: Price vs. Area")

In [None]:
# Map every listing by location, colored by its price

fig = px.scatter_mapbox(
    df,
    lat="lat",
    lon="lon",
    color="price_aprox_usd",
    hover_data=["price_aprox_usd"],  # show the price in the hover tooltip
    width=600,
    height=600,
)

# Select the "open-street-map" base-map style
fig.update_layout(mapbox_style="open-street-map")

fig.show()

In [None]:
# Separate the predictors (feature matrix `X_train`) from the value being
# predicted (target vector `y_train`).
feature = [
    "surface_covered_in_m2",
    "lat",
    "lon",
    "borough",
]
target = "price_aprox_usd"

y_train = df[target]
X_train = df[feature]

## **Build Model**

In [None]:
# Baseline
# A naive model that always predicts the mean training price; its mean
# absolute error is the score the real model has to beat.

y_mean = y_train.mean()
y_pred_baseline = [y_mean for _ in range(len(y_train))]
baseline_mae = mean_absolute_error(y_train, y_pred_baseline)
print("Mean apt price:", y_mean)
print("Baseline MAE:", baseline_mae)

In [None]:
# Iterate
# Before building the full pipeline, run the one-hot encoder on its own to
# inspect the encoded feature matrix.
# `OneHotEncoder` (from category_encoders) is imported with the other
# libraries in the notebook's first cell, so later cells do not depend on
# this exploratory cell having been run.

# Instantiate

ohe = OneHotEncoder(use_cat_names=True)

# Fit

ohe.fit(X_train)

# Transform

XT_train = ohe.transform(X_train)
print(XT_train.shape)
XT_train.head()

In [None]:
# Build Model: chain the encoder, the imputer and the ridge regressor into
# one pipeline so they are always fitted and applied together.
model = make_pipeline(
    OneHotEncoder(use_cat_names=True), SimpleImputer(), Ridge()
)

# Fit model
model.fit(X_train, y_train)

## **Evaluate**

In [None]:
# Read the CSV file mexico-city-test-features.csv into the DataFrame X_test

X_test = pd.read_csv("data/mexico-city-test-features.csv")
# `DataFrame.info()` prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would echo a stray "None".
X_test.info()
X_test.head()



In [None]:

# Generate predictions for `X_test` with the fitted model, wrapped in a Series

y_test_pred = pd.Series(data=model.predict(X_test))
y_test_pred.head()

## **Communicate Results**

In [None]:
# Pair each coefficient of the fitted Ridge step with the name of the encoded
# feature it belongs to. `make_pipeline` names steps after their lowercased
# class names, hence "ridge" and "onehotencoder".
coefficients = model.named_steps["ridge"].coef_
# Re-apply the fitted encoder to recover the post-encoding column names in
# the order the regressor saw them.
# NOTE(review): assumes the imputer step removed no columns (it would only
# do so for a feature that is entirely missing) - confirm on the real data.
features = model.named_steps["onehotencoder"].transform(X_train).columns
feat_imp = pd.Series(coefficients, index=features)
feat_imp

In [None]:
# Create horizontal bar chart