In [1]:
import pandas as pd
import numpy as np

# Constants
N = 1000
R_earth = 6371 # km
radius = 2 # km
lat_central = -12.08672
lon_central = -77.03653
min_price = 100000
max_price = 560000
min_age = 0
max_age = 10

# Random generation inside the radius
np.random.seed(0)
random_radius = radius * np.sqrt(np.random.rand(N))
random_angle = 2 * np.pi * np.random.rand(N)

# Latitude and Longitude
df = pd.DataFrame()
df['latitud'] = lat_central + (random_radius / R_earth) * (180 / np.pi)
df['longitud'] = lon_central + (random_radius / R_earth) * (180 / np.pi) / np.cos(lat_central * np.pi/180)

# Random housing price and age
df['precio_vivienda'] = np.random.randint(min_price, max_price, N)
df['antiguedad_vivienda'] = np.random.randint(min_age, max_age, N)

df.head()

Unnamed: 0,latitud,longitud,precio_vivienda,antiguedad_vivienda
0,-12.073395,-77.022903,496843,5
1,-12.071509,-77.020974,141935,6
2,-12.072756,-77.022249,109019,9
3,-12.073443,-77.022952,463797,2
4,-12.075013,-77.024557,238737,9


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Divide data into features (X) and target (y)
X = df[['latitud', 'longitud', 'antiguedad_vivienda']]
y = df['precio_vivienda']

# Split data into train, validation and test set (70%, 15%, 15%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=0) # 0.1765 * 0.85 = 0.15

# Create and train the model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

RandomForestRegressor(random_state=0)

In [3]:
from sklearn.metrics import mean_squared_error

# Make predictions
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate RMSE
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

rmse_train, rmse_val, rmse_test

(55408.20293371725, 145174.356631227, 150875.48254486805)

In [4]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Calculate RMSE from cross-validation scores
cv_rmse = np.sqrt(-cv_scores)

# Train the model with all training data
model.fit(X_temp, y_temp)

cv_rmse.mean()

145980.78936942614

In [5]:
import folium
from folium.plugins import HeatMap

# Convert lat/lon to meters and then to grid coordinates
df['grid_lat'] = (df['latitud'] * 110.574 * 1000 // 100).astype(int)
df['grid_lon'] = (df['longitud'] * 111.32 * 1000 * np.cos(df['latitud']) // 100).astype(int)

# Calculate average prices for each grid cell
heatmap_data = df.groupby(['grid_lat', 'grid_lon'])['precio_vivienda'].mean().reset_index()

# Create base map
map_base = folium.Map(location=[lat_central, lon_central], control_scale=True, zoom_start=14)

# Add heatmap to base map
HeatMap(heatmap_data[['grid_lat', 'grid_lon', 'precio_vivienda']].values.tolist(), radius=8, max_zoom=13).add_to(map_base)

map_base

In [6]:
import folium
from folium.plugins import HeatMap

# Calculate predicted prices for each house
df['predicted_price'] = model.predict(df[['latitud', 'longitud', 'antiguedad_vivienda']])

# Convert lat/lon to meters and then to grid coordinates
df['grid_lat'] = (df['latitud'] * 110.574 * 1000 // 100).astype(int)
df['grid_lon'] = (df['longitud'] * 111.32 * 1000 * np.cos(df['latitud']) // 100).astype(int)

# Calculate average real and predicted prices for each grid cell
heatmap_data_real = df.groupby(['grid_lat', 'grid_lon'])['precio_vivienda'].mean().reset_index()
heatmap_data_predicted = df.groupby(['grid_lat', 'grid_lon'])['predicted_price'].mean().reset_index()

# Create base maps
map_real = folium.Map(location=[lat_central, lon_central], control_scale=True, zoom_start=14)
map_predicted = folium.Map(location=[lat_central, lon_central], control_scale=True, zoom_start=14)

# Add heatmaps to base maps
HeatMap(heatmap_data_real[['grid_lat', 'grid_lon', 'precio_vivienda']].values.tolist(), radius=8, max_zoom=13).add_to(map_real)
HeatMap(heatmap_data_predicted[['grid_lat', 'grid_lon', 'predicted_price']].values.tolist(), radius=8, max_zoom=13).add_to(map_predicted)

map_real.save('heatmap_real.html')
map_predicted.save('heatmap_predicted.html')

In [7]:
import folium
from folium.plugins import HeatMap
from IPython.display import display

# Calculate predicted prices for each house
df['predicted_price'] = model.predict(df[['latitud', 'longitud', 'antiguedad_vivienda']])

# Convert lat/lon to meters and then to grid coordinates
df['grid_lat'] = (df['latitud'] * 110.574 * 1000 // 100).astype(int)
df['grid_lon'] = (df['longitud'] * 111.32 * 1000 * np.cos(df['latitud']) // 100).astype(int)

# Calculate average real and predicted prices for each grid cell
heatmap_data_real = df.groupby(['grid_lat', 'grid_lon'])['precio_vivienda'].mean().reset_index()
heatmap_data_predicted = df.groupby(['grid_lat', 'grid_lon'])['predicted_price'].mean().reset_index()

# Create base maps
map_real = folium.Map(location=[lat_central, lon_central], control_scale=True, zoom_start=14)
map_predicted = folium.Map(location=[lat_central, lon_central], control_scale=True, zoom_start=14)

# Add heatmaps to base maps
HeatMap(heatmap_data_real[['grid_lat', 'grid_lon', 'precio_vivienda']].values.tolist(), radius=8, max_zoom=13).add_to(map_real)
HeatMap(heatmap_data_predicted[['grid_lat', 'grid_lon', 'predicted_price']].values.tolist(), radius=8, max_zoom=13).add_to(map_predicted)

# Display the maps
display(map_real)
display(map_predicted)

In [8]:
import pyproj
from pyproj import Proj, transform

def convert_latlon_to_utm(df, lat_col='latitud', lon_col='longitud'):
    """
    Convert lat/lon to UTM coordinates
    """
    # Create a projection for lat/lon
    proj_latlon = Proj(proj='latlong', datum='WGS84')
    
    # Create a projection for UTM
    # Zone number is determined by the longitude
    # The '+north' indicates that we're in the northern hemisphere
    zone_number = int((df[lon_col].mean() + 180) / 6) + 1
    proj_utm = Proj(proj='utm', zone=zone_number, datum='WGS84')

    # Apply the transformations
    df['easting'], df['northing'] = transform(proj_latlon, proj_utm, df[lon_col].tolist(), df[lat_col].tolist())
    
    return df

df_utm = convert_latlon_to_utm(df, lat_col='latitud', lon_col='longitud')

  df['easting'], df['northing'] = transform(proj_latlon, proj_utm, df[lon_col].tolist(), df[lat_col].tolist())


In [9]:
df_utm.head()

Unnamed: 0,latitud,longitud,precio_vivienda,antiguedad_vivienda,grid_lat,grid_lon,predicted_price,easting,northing
0,-12.073395,-77.022903,496843,5,-13351,-75533,416309.66,279806.322842,-1335483.0
1,-12.071509,-77.020974,141935,6,-13348,-75454,268313.56,280014.833322,-1335273.0
2,-12.072756,-77.022249,109019,9,-13350,-75506,169389.76,279877.023923,-1335412.0
3,-12.073443,-77.022952,463797,2,-13351,-75535,397498.31,279801.039096,-1335488.0
4,-12.075013,-77.024557,238737,9,-13352,-75600,274442.04,279627.514673,-1335663.0


In [10]:
def assign_grid_cells(df, easting_col='easting', northing_col='northing', cell_size=100):
    """
    Assign grid cells to each house
    """
    df['grid_cell_easting'] = df[easting_col] // cell_size
    df['grid_cell_northing'] = df[northing_col] // cell_size
    
    return df

df_grid = assign_grid_cells(df_utm, easting_col='easting', northing_col='northing')

In [11]:
df_grid

Unnamed: 0,latitud,longitud,precio_vivienda,antiguedad_vivienda,grid_lat,grid_lon,predicted_price,easting,northing,grid_cell_easting,grid_cell_northing
0,-12.073395,-77.022903,496843,5,-13351,-75533,416309.66,279806.322842,-1.335483e+06,2798.0,-13355.0
1,-12.071509,-77.020974,141935,6,-13348,-75454,268313.56,280014.833322,-1.335273e+06,2800.0,-13353.0
2,-12.072756,-77.022249,109019,9,-13350,-75506,169389.76,279877.023923,-1.335412e+06,2798.0,-13355.0
3,-12.073443,-77.022952,463797,2,-13351,-75535,397498.31,279801.039096,-1.335488e+06,2798.0,-13355.0
4,-12.075013,-77.024557,238737,9,-13352,-75600,274442.04,279627.514673,-1.335663e+06,2796.0,-13357.0
...,...,...,...,...,...,...,...,...,...,...,...
995,-12.081099,-77.030781,431169,8,-13359,-75851,356918.29,278954.793871,-1.336342e+06,2789.0,-13364.0
996,-12.073813,-77.023331,117056,8,-13351,-75550,277453.02,279760.117985,-1.335529e+06,2797.0,-13356.0
997,-12.069296,-77.018711,297373,8,-13346,-75362,292360.01,280259.452506,-1.335026e+06,2802.0,-13351.0
998,-12.078119,-77.027734,489875,0,-13356,-75729,450453.83,279284.112291,-1.336009e+06,2792.0,-13361.0


In [12]:
def calculate_average_prices(df, grid_cell_easting_col='grid_cell_easting', grid_cell_northing_col='grid_cell_northing', real_price_col='precio_vivienda', estimated_price_col='predicted_price'):
    """
    Calculate average real and estimated prices for each grid cell
    """
    # Group by grid cell
    groups = df.groupby([grid_cell_easting_col, grid_cell_northing_col])
    
    # Calculate average prices
    average_prices = groups[[real_price_col, estimated_price_col]].mean().reset_index()
    
    return average_prices

average_prices = calculate_average_prices(df_grid, grid_cell_easting_col='grid_cell_easting', grid_cell_northing_col='grid_cell_northing', real_price_col='precio_vivienda', estimated_price_col='predicted_price')

In [13]:
average_prices

Unnamed: 0,grid_cell_easting,grid_cell_northing,precio_vivienda,predicted_price
0,2783.0,-13370.0,457302.0,386908.97
1,2784.0,-13369.0,296080.4,292760.758
2,2785.0,-13368.0,341949.8,307242.315
3,2786.0,-13367.0,318860.352941,308281.514118
4,2787.0,-13366.0,352965.071429,343089.276429
5,2788.0,-13365.0,317634.419355,304241.63871
6,2788.0,-13364.0,321679.0,331991.15
7,2789.0,-13364.0,286821.96875,284580.443437
8,2789.0,-13363.0,155553.0,257132.7
9,2790.0,-13363.0,329387.314286,316523.132286


In [14]:
import folium
from folium.plugins import HeatMap

def create_heatmaps(df, grid_cell_easting_col='grid_cell_easting', grid_cell_northing_col='grid_cell_northing', real_price_col='precio_vivienda', estimated_price_col='predicted_price', lat_central=lat_central, lon_central=lon_central):
    """
    Create heatmaps of real and estimated house prices
    """
    # Create base maps
    map_real = folium.Map(location=[lat_central, lon_central], control_scale=True, zoom_start=14)
    map_estimated = folium.Map(location=[lat_central, lon_central], control_scale=True, zoom_start=14)

    # Add heatmaps to base maps
    HeatMap(df[[grid_cell_northing_col, grid_cell_easting_col, real_price_col]].values.tolist(), radius=8, max_zoom=13).add_to(map_real)
    HeatMap(df[[grid_cell_northing_col, grid_cell_easting_col, estimated_price_col]].values.tolist(), radius=8, max_zoom=13).add_to(map_estimated)

    return map_real, map_estimated

In [15]:
map_real, map_estimated = create_heatmaps(average_prices, grid_cell_easting_col='grid_cell_easting', grid_cell_northing_col='grid_cell_northing', real_price_col='precio_vivienda', estimated_price_col='predicted_price')

In [16]:
map_real

In [17]:
import folium
from folium.plugins import HeatMap

def create_heatmaps(df, grid_cell_easting_col='grid_cell_easting', grid_cell_northing_col='grid_cell_northing', real_price_col='precio_vivienda', estimated_price_col='predicted_price', lat_central=lat_central, lon_central=lon_central):
    """
    Create and display heatmaps of real and estimated house prices
    """
    # Create base maps
    map_real = folium.Map(location=[lat_central, lon_central], control_scale=True, zoom_start=14)
    map_estimated = folium.Map(location=[lat_central, lon_central], control_scale=True, zoom_start=14)

    # Add heatmaps to base maps
    HeatMap(df[[grid_cell_easting_col, grid_cell_northing_col, real_price_col]].values.tolist(), radius=8, max_zoom=13).add_to(map_real)
    HeatMap(df[[grid_cell_easting_col, grid_cell_northing_col, estimated_price_col]].values.tolist(), radius=8, max_zoom=13).add_to(map_estimated)

    # Display the maps
    display(map_real)
    display(map_estimated)

In [18]:
create_heatmaps(average_prices, grid_cell_easting_col='grid_cell_easting', grid_cell_northing_col='grid_cell_northing', real_price_col='precio_vivienda', estimated_price_col='predicted_price')

In [19]:
map_real.save('heatmap_real.html')
map_estimated.save('heatmap_estimated.html')

In [20]:
import geopandas as gpd
import matplotlib.pyplot as plt

def create_heatmaps_gpd(df, grid_cell_easting_col='grid_cell_easting', grid_cell_northing_col='grid_cell_northing', real_price_col='precio_vivienda', estimated_price_col='predicted_price', boundary_file='lima_boundary.shp'):
    """
    Create heatmaps of real and estimated house prices using geopandas
    """
    # Convert DataFrame to GeoDataFrame
    df['geometry'] = gpd.points_from_xy(df[grid_cell_easting_col], df[grid_cell_northing_col])
    gdf = gpd.GeoDataFrame(df, geometry='geometry')
    
    # Load boundary file
    boundary = gpd.read_file(boundary_file)

    # Create heatmaps
    fig, ax = plt.subplots(2, 1, figsize=(10, 20))

    boundary.plot(ax=ax[0], color='white', edgecolor='black')
    gdf.plot(column=real_price_col, ax=ax[0], legend=True)
    ax[0].set_title('Real house prices')

    boundary.plot(ax=ax[1], color='white', edgecolor='black')
    gdf.plot(column=estimated_price_col, ax=ax[1], legend=True)
    ax[1].set_title('Estimated house prices')

    plt.show()

In [21]:
create_heatmaps_gpd(average_prices, grid_cell_easting_col='grid_cell_easting', grid_cell_northing_col='grid_cell_northing', real_price_col='precio_vivienda', estimated_price_col='predicted_price', boundary_file='lima_boundary.shp')

DriverError: lima_boundary.shp: No such file or directory

In [22]:
import folium

def create_marker_maps(df, lat_col='latitud', lon_col='longitud', real_price_col='precio_vivienda', estimated_price_col='predicted_price', lat_central=lat_central, lon_central=lon_central):
    """
    Create and display maps with markers for each house. The size of the marker is related to the price.
    """
    # Create base maps
    map_real = folium.Map(location=[lat_central, lon_central], control_scale=True, zoom_start=14)
    map_estimated = folium.Map(location=[lat_central, lon_central], control_scale=True, zoom_start=14)

    # Add markers to the maps
    for idx, row in df.iterrows():
        folium.CircleMarker(
            location=(row[lat_col], row[lon_col]),
            radius=row[real_price_col]/100000,  # Adjust this value to change the marker size
            color='blue',
            fill=True,
            fill_color='blue'
        ).add_to(map_real)

        folium.CircleMarker(
            location=(row[lat_col], row[lon_col]),
            radius=row[estimated_price_col]/100000,  # Adjust this value to change the marker size
            color='red',
            fill=True,
            fill_color='red'
        ).add_to(map_estimated)

    # Display the maps
    display(map_real)
    display(map_estimated)

In [23]:
create_marker_maps(df, lat_col='latitud', lon_col='longitud', real_price_col='precio_vivienda', estimated_price_col='predicted_price')