# Data Filtering

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the per-district dataset and discard any column whose name starts with "Unnamed"
raw = pd.read_csv('datos_analisis_por_comuna.csv')
unnamed_mask = raw.columns.str.contains('^Unnamed')
df_M = raw.loc[:, ~unnamed_mask]
df_M.head()

Unnamed: 0,created_on,end_date,delta_time,lon,lat,bedrooms,bathrooms,surface_total,price,imput,...,MANRIQUE,PALMITAS,POPULAR,ROBLEDO,SAN ANTONIO DE PRADO,SAN CRISTOBAL,SAN JAVIER,SANTA CRUZ,SANTA ELENA,VILLA HERMOSA
0,9/14/2018,9/25/2018,11,-75.577025,6.21136,,,134,19.519293,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9/14/2018,9/25/2018,11,-75.572928,6.23435,,,588,19.701615,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9/14/2018,9/25/2018,11,-75.553186,6.237236,,,160,20.125429,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9/14/2018,9/25/2018,11,-75.594599,6.226078,,,90,19.583832,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9/14/2018,9/25/2018,11,-75.542267,6.205542,,,101,19.781657,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [3]:
# Keep only the rows whose 'imput' flag equals 0
df_real = df_M[df_M['imput'] == 0]
print(df_real.shape)

# Missing-value counts for the bedrooms and bathrooms columns
nan_counts = df_real[['bedrooms', 'bathrooms']].isna().sum()
print('Bedrooms nan values: ', nan_counts['bedrooms'])
print('Bathrooms nan values: ', nan_counts['bathrooms'])

(18775, 40)
Bedrooms nan values:  6199
Bathrooms nan values:  853


In [4]:
# Provisional: keep only the rows that have a value in every column
complete_rows = df_real.notna().all(axis = 1)
df_real = df_real[complete_rows]
df_real.shape

(12537, 40)

In [5]:
# Keep only the listings flagged as apartment or house, then drop the 'imput'
# flag and the property-type indicator columns that are no longer needed
# ('Apartamento' is kept so the two remaining types can still be told apart).
#
# The mask is computed column-wise and by label: a row is kept when at least one
# of the two flags is non-zero. This replaces a row-wise apply that read the
# flags positionally (x[0], x[1]), a lookup style deprecated for label-indexed
# Series in recent pandas releases.
esCasaoApto = (df_real[['Apartamento', 'Casa']] != 0).any(axis = 1)
df_real = df_real.loc[esCasaoApto].drop(columns = ['imput', 'Casa', 'Depósito', 'Finca', 'Local comercial', 'Lote',
                                                   'Parqueadero', 'Oficina', 'Otro'])
df_real.columns

Index(['created_on', 'end_date', 'delta_time', 'lon', 'lat', 'bedrooms',
       'bathrooms', 'surface_total', 'price', 'Apartamento', 'ALTAVISTA',
       'ARANJUEZ', 'BELEN', 'BUENOS AIRES', 'CASTILLA', 'DOCE DE OCTUBRE',
       'EL POBLADO', 'GUAYABAL', 'LA AMERICA', 'LA CANDELARIA', 'LAURELES',
       'MANRIQUE', 'PALMITAS', 'POPULAR', 'ROBLEDO', 'SAN ANTONIO DE PRADO',
       'SAN CRISTOBAL', 'SAN JAVIER', 'SANTA CRUZ', 'SANTA ELENA',
       'VILLA HERMOSA'],
      dtype='object')

In [6]:
# Apply the exponential to the whole 'price' column in one vectorised call
# (presumably the stored prices are log-transformed -- TODO confirm against the
# dataset's preparation step).
# NOTE: the column is overwritten in place, so running this cell a second time
# without re-running the cells above would exponentiate the prices again.
df_real['price'] = np.exp(df_real['price'])

# Comparing basic traits of each district

In [7]:
# District names: every column label from 'ALTAVISTA' onwards, re-indexed from 0
columnas = df_real.columns.to_series()
nombres = columnas.loc['ALTAVISTA':].reset_index(drop = True)
nombres

0                ALTAVISTA
1                 ARANJUEZ
2                    BELEN
3             BUENOS AIRES
4                 CASTILLA
5          DOCE DE OCTUBRE
6               EL POBLADO
7                 GUAYABAL
8               LA AMERICA
9            LA CANDELARIA
10                LAURELES
11                MANRIQUE
12                PALMITAS
13                 POPULAR
14                 ROBLEDO
15    SAN ANTONIO DE PRADO
16           SAN CRISTOBAL
17              SAN JAVIER
18              SANTA CRUZ
19             SANTA ELENA
20           VILLA HERMOSA
dtype: object

In [8]:
# One dataframe per district (columns up to and including 'price'), stored both
# in a list and under a module-level variable named after the district
# (spaces replaced by underscores). `d` maps list position -> variable name.
districts_df = []
d = {}
for position, district in enumerate(nombres):
    name = district.replace(' ', '_')
    district_rows = df_real.loc[df_real[district] == 1, :'price']
    vars()[name] = district_rows
    districts_df.append(district_rows)
    d[position] = name
i = len(districts_df)  # counter ends at the number of districts processed

In [9]:
# Mean of each feature per district, plus the number of listings per district.
# Each result is a plain list aligned with `districts_df` / `nombres`.
#
# The statistics are looked up by column label. Reading them by position
# (df[0], df[1], ...) from a label-indexed Series is deprecated in recent pandas
# releases and silently depends on the column order of describe().
stat_columns = ['delta_time', 'bedrooms', 'bathrooms', 'surface_total', 'price']
district_means = [frame[stat_columns].mean() for frame in districts_df]

delta_time = [means['delta_time'] for means in district_means]
bedrooms = [means['bedrooms'] for means in district_means]
bathrooms = [means['bathrooms'] for means in district_means]
surface_total = [means['surface_total'] for means in district_means]
price = [means['price'] for means in district_means]

# Number of listings in each district
size = [frame.shape[0] for frame in districts_df]

In [10]:
# Assemble the per-district means and listing counts into one summary table
summary_columns = {
    'name': nombres.tolist(),
    'delta_time': delta_time,
    'bedrooms': bedrooms,
    'bathrooms': bathrooms,
    'surface_total': surface_total,
    'price': price,
    'size': size,
}
districts = pd.DataFrame(summary_columns)
districts

Unnamed: 0,name,delta_time,bedrooms,bathrooms,surface_total,price,size
0,ALTAVISTA,68.730769,2.75,2.057692,84.596154,346807700.0,52
1,ARANJUEZ,68.83908,4.068966,2.275862,149.310345,337508100.0,87
2,BELEN,66.717514,3.378531,2.6742,137.879473,436517800.0,1062
3,BUENOS AIRES,87.429213,3.804494,2.997753,278.516854,679572800.0,445
4,CASTILLA,72.098039,3.705882,2.568627,130.647059,383745100.0,51
5,DOCE DE OCTUBRE,36.4,3.6,3.0,136.2,438000000.0,5
6,EL POBLADO,75.208403,3.297059,3.497059,386.276891,826680900.0,2380
7,GUAYABAL,79.72428,3.438272,2.823045,201.62963,593852800.0,486
8,LA AMERICA,69.788686,3.723794,2.607321,143.81198,397851600.0,601
9,LA CANDELARIA,59.983752,3.691285,2.927622,176.42836,550824200.0,677


In [71]:
# Districts ordered from fewest to most listings
ordenados_por_tamano = districts.sort_values(by = 'size')
ordenados_por_tamano

Unnamed: 0,name,delta_time,bedrooms,bathrooms,surface_total,price,size
18,SANTA CRUZ,37.0,3.0,2.0,67.0,240000000.0,2
5,DOCE DE OCTUBRE,36.4,3.6,3.0,136.2,438000000.0,5
12,PALMITAS,119.6,3.4,2.8,118.6,384000000.0,5
13,POPULAR,55.0,4.636364,2.818182,213.727273,427727300.0,11
15,SAN ANTONIO DE PRADO,104.833333,3.75,3.0,255.083333,670000000.0,12
16,SAN CRISTOBAL,115.043478,3.73913,2.869565,148.73913,390347800.0,23
17,SAN JAVIER,72.934783,3.782609,2.673913,282.456522,411913000.0,46
4,CASTILLA,72.098039,3.705882,2.568627,130.647059,383745100.0,51
11,MANRIQUE,74.294118,4.0,2.627451,178.490196,456490200.0,51
0,ALTAVISTA,68.730769,2.75,2.057692,84.596154,346807700.0,52


# Plotting

In [11]:
# Plotly setup for the charts below.
import plotly
from plotly.offline import init_notebook_mode, plot, iplot, download_plotlyjs
# Initialise plotly's offline notebook mode before the iplot calls in the cells that follow.
init_notebook_mode(connected = True)

# `go` provides the Bar / Scatter / Layout / Figure objects used by every chart.
import plotly.graph_objs as go

In [12]:
# Bar chart of the mean price per district (millions of COP), ascending by price
by_price = districts.sort_values(['price'])
trace = go.Bar(x = by_price['name'], y = by_price['price'] / 1000000)
layout = go.Layout(
    title = 'Precio Medio por Comuna',
    xaxis_title = 'Comuna',
    yaxis_title = 'Precio COP en (M)',
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig)

In [13]:
# Bar chart of the mean total surface per district, ascending by surface.
# The frame is sorted once and reused for both axes; the chart title typo
# ('Superfcie') is corrected.
by_surface = districts.sort_values(['surface_total'])
trace = go.Bar(x = by_surface['name'], y = by_surface['surface_total'])
layout = go.Layout(title = 'Superficie Total Media por Comuna', xaxis_title = 'Comuna',
                   yaxis_title = 'Superficie Total (m^2)')
fig = go.Figure(data = [trace], layout = layout)
iplot(fig)

In [14]:
# Grouped bars plus line overlays comparing the mean price (millions of COP)
# and the mean total surface per district.
by_price = districts.sort_values(['price'])
by_surface = districts.sort_values(['surface_total'])

trace1 = go.Bar(x = by_price['name'], y = by_price['price'] / 1000000,
                name = 'Precio COP (M)', marker_color = '#0b31db')
trace = go.Bar(x = by_surface['name'], y = by_surface['surface_total'],
               name = 'Superficie Total', marker_color = '#db700b')
layout = go.Layout(title = 'Precio y Superficie total Media por Comuna', xaxis_title = 'Comuna', barmode = 'group')
fig = go.Figure(data = [trace, trace1], layout = layout)

# Line overlays, both following the surface-sorted district order
overlays = [
    ('Precio COP (M)', by_surface['price'] / 1000000),
    ('Superficie Total', by_surface['surface_total']),
]
for label, values in overlays:
    fig.add_trace(go.Scatter(x = by_surface['name'], y = values,
                             mode = 'lines+markers', name = label))
iplot(fig)

In [15]:
# Bar chart of the mean number of bedrooms per district, ascending
by_bedrooms = districts.sort_values(['bedrooms'])
trace = go.Bar(x = by_bedrooms['name'], y = by_bedrooms['bedrooms'])
layout = go.Layout(
    title = 'Numero Medio de Cuartos por Comuna',
    xaxis_title = 'Comuna',
    yaxis_title = 'Cuartos',
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig)

In [16]:
# Bar chart of the mean number of bathrooms per district, ascending
by_bathrooms = districts.sort_values(['bathrooms'])
trace = go.Bar(x = by_bathrooms['name'], y = by_bathrooms['bathrooms'])
layout = go.Layout(
    title = 'Numero Medio de Baños por Comuna',
    xaxis_title = 'Comuna',
    yaxis_title = 'Baños',
)
fig = go.Figure(data = [trace], layout = layout)
iplot(fig)

In [17]:
# Grouped bars and line overlays of the mean bathrooms and bedrooms per district,
# with the mean price (in units of 100M COP) overlaid as a black line.
# Every series uses the same bathrooms-sorted frame, so the title now states that
# ordering (it previously claimed the districts were ordered by price).
by_bathrooms = districts.sort_values(['bathrooms'])
comunas = by_bathrooms['name']

trace1 = go.Bar(x = comunas, y = by_bathrooms['bathrooms'], name = 'Baños', marker_color = '#db700b')
trace = go.Bar(x = comunas, y = by_bathrooms['bedrooms'], name = 'Cuartos')
layout = go.Layout(title = 'Numero Medio de Baños y Cuartos (Orden por Baños)',
                   xaxis_title = 'Comuna', barmode = 'group')
fig = go.Figure(data = [trace, trace1], layout = layout)
fig.add_trace(go.Scatter(x = comunas, y = by_bathrooms['bathrooms'],
                         mode = 'lines+markers', name = 'Baños'))
fig.add_trace(go.Scatter(x = comunas, y = by_bathrooms['bedrooms'],
                         mode = 'lines+markers', name = 'Cuartos'))
# Price is scaled down by 100,000,000 so it is drawn on the same axis as the room counts
fig.add_trace(go.Scatter(x = comunas, y = by_bathrooms['price'] / 100000000,
                         mode = 'lines+markers', name = 'Precio COP (100M)',
                         marker_color = 'black'))
iplot(fig)

# Observaciones
 
## Superficie total
 - Es probable que las viviendas catalogadas como casas en Santa Elena en realidad sean fincas, debido a su alto precio medio y su alta superficie total media.
 - Hay comunas en las cuales la superficie total no sigue necesariamente la misma tendencia que el precio. Esto puede ocurrir debido a que la zona efectivamente infla los precios de las viviendas que se encuentran en dichos distritos, y los precios en esta zona efectivamente son delimitados por su sector.
 - Los picos de la anomalia mencionada previamente corresponden a las comunas: LAURELES, LA CANDELARIA, GUAYABAL, VILLA HERMOSA, SAN ANTONIO DE PRADO, EL POBLADO y SANTA ELENA.
 - Los bajos corresponden a: ARANJUEZ, MANRIQUE, POPULAR y SAN JAVIER.

## Cuartos y Baños
 - El precio tampoco sigue la misma tendencia del numero medio de cuartos y baños por comuna. 
 - Cuando se organiza con respecto a los baños, hay picos en: VILLA HERMOSA, GUAYABAL, LA CANDELARIA, LAURELES, BUENOS AIRES, SAN ANTONIO DE PRADO, EL POBLADO Y SANTA ELENA.
 - Los bajos corresponden a: PALMITAS, POPULAR, SAN CRISTOBAL, Y EL DOCE DE OCTUBRE. 
 - Cuando se organizan con respecto al numero medio de cuartos el precio no sigue una tendencia clara en ninguna parte de la grafica, a diferencia de las graficas de las superficie total media y el numero medio de baños, donde habia una clara relacion entre estas dos variables al principio de la grafica. 

# Analisis de Picos en Superficie Total

## Santa Elena

In [25]:
# Listings located in SANTA ELENA, with every district indicator column removed
district_flags = [
    'ALTAVISTA', 'ARANJUEZ', 'BELEN', 'BUENOS AIRES', 'CASTILLA',
    'DOCE DE OCTUBRE', 'EL POBLADO', 'GUAYABAL', 'LA AMERICA',
    'LA CANDELARIA', 'LAURELES', 'MANRIQUE', 'PALMITAS', 'POPULAR',
    'ROBLEDO', 'SAN ANTONIO DE PRADO', 'SAN CRISTOBAL', 'SAN JAVIER',
    'SANTA CRUZ', 'SANTA ELENA', 'VILLA HERMOSA',
]
in_district = df_real['SANTA ELENA'] == 1
santa_elena = df_real[in_district].drop(columns = district_flags)
santa_elena.describe()

Unnamed: 0,delta_time,lon,lat,bedrooms,bathrooms,surface_total,price,Apartamento
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0
mean,75.555556,-75.551486,6.202292,3.864198,5.08642,627.209877,1852837000.0,0.123457
std,62.424955,0.007957,0.01698,0.862454,1.797203,600.356909,1140980000.0,0.33101
min,3.0,-75.559286,6.179083,2.0,1.0,96.0,280000000.0,0.0
25%,32.0,-75.555849,6.187518,3.0,4.0,237.0,780000000.0,0.0
50%,76.0,-75.552002,6.197128,4.0,5.0,496.0,1700000000.0,0.0
75%,91.0,-75.547997,6.22432,4.0,7.0,700.0,2700000000.0,0.0
max,364.0,-75.498001,6.226688,6.0,8.0,2700.0,4300000000.0,1.0


In [26]:
# Split the SANTA ELENA listings on the 'Apartamento' flag (1 vs 0) and report each count
apartment_flag = santa_elena['Apartamento']
apto_elena = santa_elena.loc[apartment_flag == 1]
casa_elena = santa_elena.loc[apartment_flag == 0]
print('Numero de apatamentos: ', len(apto_elena))
print('Numero de casas: ', len(casa_elena))

Numero de apatamentos:  10
Numero de casas:  71


In [27]:
# Price against total surface for SANTA ELENA, one marker series per subset
marker_series = [
    ('Apartamentos', apto_elena, '#ffd000'),
    ('Casas', casa_elena, '#1caeb0'),
]
trace1, trace2 = [
    go.Scatter(x = subset['surface_total'], y = subset['price'], mode = 'markers',
               name = label, marker_color = color)
    for label, subset, color in marker_series
]
layout = go.Layout(
    title = 'Superficie Total vs Precio (SANTA ELENA)',
    xaxis_title = 'Superficie Total',
    yaxis_title = 'Precio COP',
)
fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

## Observaciones 

##### Primeras Observaciones:
 - Claramente la superficie total incrementa de manera notoria el precio de las viviendas.
 - Si se compara con la superficie total de EL POBLADO (esto ya no es valido o seguro, debido a que se descubrio que hay un outlier que incrementa el valor de la superficie total media), comuna cuya superficie total es la segunda mayor despues de esta (SANTA ELENA), en general las superficies totales son bastante grandes, lo que puede indicar que efectivamente la mayoria de estas viviendas pueden ser fincas.
 - Es sorprendente que los apartamentos tengan una superficie tan grande, en especial por la zona en la que se esta, puede que estos tambien esten mal catalogados y en realidad sean casas finca.

## Laureles

In [23]:
# Listings located in LAURELES, with every district indicator column removed
district_flags = [
    'ALTAVISTA', 'ARANJUEZ', 'BELEN', 'BUENOS AIRES', 'CASTILLA',
    'DOCE DE OCTUBRE', 'EL POBLADO', 'GUAYABAL', 'LA AMERICA',
    'LA CANDELARIA', 'LAURELES', 'MANRIQUE', 'PALMITAS', 'POPULAR',
    'ROBLEDO', 'SAN ANTONIO DE PRADO', 'SAN CRISTOBAL', 'SAN JAVIER',
    'SANTA CRUZ', 'SANTA ELENA', 'VILLA HERMOSA',
]
in_district = df_real['LAURELES'] == 1
laureles = df_real[in_district].drop(columns = district_flags)
laureles.describe()

Unnamed: 0,delta_time,lon,lat,bedrooms,bathrooms,surface_total,price,Apartamento
count,2349.0,2349.0,2349.0,2349.0,2349.0,2349.0,2349.0,2349.0
mean,62.255002,-75.593307,6.24736,3.632184,2.960409,172.8361,543988400.0,0.56407
std,56.034723,0.007841,0.007116,1.365835,1.207465,211.890149,415147700.0,0.495984
min,0.0,-75.61,6.237556,1.0,1.0,11.0,240000000.0,0.0
25%,18.0,-75.601334,6.243,3.0,2.0,95.0,327000000.0,0.0
50%,56.0,-75.594002,6.244705,3.0,3.0,130.0,425000000.0,1.0
75%,89.0,-75.586998,6.249,4.0,4.0,200.0,580000000.0,1.0
max,460.0,-75.578003,6.27,18.0,10.0,5700.0,4500000000.0,1.0


In [24]:
# Split the LAURELES listings on the 'Apartamento' flag (1 vs 0) and report each count
apartment_flag = laureles['Apartamento']
apto_laureles = laureles.loc[apartment_flag == 1]
casa_laureles = laureles.loc[apartment_flag == 0]
print('Numero de apatamentos: ', len(apto_laureles))
print('Numero de casas: ', len(casa_laureles))

Numero de apatamentos:  1325
Numero de casas:  1024


In [29]:
# Price against total surface for LAURELES, one marker series per subset
marker_series = [
    ('Apartamentos', apto_laureles, '#ffd000'),
    ('Casas', casa_laureles, '#1caeb0'),
]
trace1, trace2 = [
    go.Scatter(x = subset['surface_total'], y = subset['price'], mode = 'markers',
               name = label, marker_color = color)
    for label, subset, color in marker_series
]
layout = go.Layout(
    title = 'Superficie Total vs Precio (LAURELES)',
    xaxis_title = 'Superficie Total',
    yaxis_title = 'Precio COP',
)
fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

## Observaciones

##### Primeras Observaciones:
 - El conjunto de datos es lo suficientemente grande como para ser representativo, y es bastante balanceado con respecto al tipo de vivienda.
 - Existen datos outliers que evidentemente estan moviendo la media de su lugar, los que respectan a la superficie total pueden ser datos mal catalogados o digitados, y los correspondientes al precio pueden ser lo mismo, o por el contrario casas bastante grandes (esto lo creo posible dado el sector). 
 - Hay dos datos outliers con respecto a los apartamentos tambien, uno claramente es un dato mal catalogado, y otro puede ser una mala digitacion, o un apartamento excesivamente caro.

## La Candelaria

In [30]:
# Listings located in LA CANDELARIA, with every district indicator column removed
district_flags = [
    'ALTAVISTA', 'ARANJUEZ', 'BELEN', 'BUENOS AIRES', 'CASTILLA',
    'DOCE DE OCTUBRE', 'EL POBLADO', 'GUAYABAL', 'LA AMERICA',
    'LA CANDELARIA', 'LAURELES', 'MANRIQUE', 'PALMITAS', 'POPULAR',
    'ROBLEDO', 'SAN ANTONIO DE PRADO', 'SAN CRISTOBAL', 'SAN JAVIER',
    'SANTA CRUZ', 'SANTA ELENA', 'VILLA HERMOSA',
]
in_district = df_real['LA CANDELARIA'] == 1
candelaria = df_real[in_district].drop(columns = district_flags)
candelaria.describe()

Unnamed: 0,delta_time,lon,lat,bedrooms,bathrooms,surface_total,price,Apartamento
count,677.0,677.0,677.0,677.0,677.0,677.0,677.0,677.0
mean,59.983752,-75.567574,6.24776,3.691285,2.927622,176.42836,550824200.0,0.621861
std,50.180338,0.005167,0.007869,1.531947,1.249529,217.496738,614006000.0,0.485281
min,0.0,-75.578003,6.224662,1.0,1.0,40.0,237000000.0,0.0
25%,15.0,-75.571999,6.246,3.0,2.0,87.0,295000000.0,0.0
50%,61.0,-75.566002,6.248,3.0,3.0,125.0,380000000.0,1.0
75%,89.0,-75.565,6.253,4.0,3.0,190.0,520000000.0,1.0
max,299.0,-75.556508,6.265,13.0,10.0,2710.0,4600000000.0,1.0


In [35]:
# Split the LA CANDELARIA listings on the 'Apartamento' flag (1 vs 0) and report each count
apartment_flag = candelaria['Apartamento']
apto_candelaria = candelaria.loc[apartment_flag == 1]
casa_candelaria = candelaria.loc[apartment_flag == 0]
print('Numero de apatamentos: ', len(apto_candelaria))
print('Numero de casas: ', len(casa_candelaria))

Numero de apatamentos:  421
Numero de casas:  256


In [38]:
# Price against total surface for LA CANDELARIA, one marker series per subset
marker_series = [
    ('Apartamentos', apto_candelaria, '#ffd000'),
    ('Casas', casa_candelaria, '#1caeb0'),
]
trace1, trace2 = [
    go.Scatter(x = subset['surface_total'], y = subset['price'], mode = 'markers',
               name = label, marker_color = color)
    for label, subset, color in marker_series
]
layout = go.Layout(
    title = 'Superficie Total vs Precio (LA CANDELARIA)',
    xaxis_title = 'Superficie Total',
    yaxis_title = 'Precio COP',
)
fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

## Observaciones

##### Primeras Observaciones:
 - Es un conjunto de datos relativamente pequeño; sin embargo, dada la clara agrupacion en la parte inferior izquierda, puede que estos datos lleguen a ser representativos.
 - Evidentemente hay datos outliers de apartamentos y casas; en el caso de los apartamentos, su superficie indica que estan mal catalogados o digitados. Asi mismo, el caso de las casas puede ser el mismo.

## Villa hermosa

In [33]:
# Listings located in VILLA HERMOSA, with every district indicator column removed
district_flags = [
    'ALTAVISTA', 'ARANJUEZ', 'BELEN', 'BUENOS AIRES', 'CASTILLA',
    'DOCE DE OCTUBRE', 'EL POBLADO', 'GUAYABAL', 'LA AMERICA',
    'LA CANDELARIA', 'LAURELES', 'MANRIQUE', 'PALMITAS', 'POPULAR',
    'ROBLEDO', 'SAN ANTONIO DE PRADO', 'SAN CRISTOBAL', 'SAN JAVIER',
    'SANTA CRUZ', 'SANTA ELENA', 'VILLA HERMOSA',
]
in_district = df_real['VILLA HERMOSA'] == 1
villa_hermosa = df_real[in_district].drop(columns = district_flags)
villa_hermosa.describe()

Unnamed: 0,delta_time,lon,lat,bedrooms,bathrooms,surface_total,price,Apartamento
count,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0
mean,62.736842,-75.550172,6.250035,4.223684,2.815789,208.342105,610105300.0,0.421053
std,58.639092,0.007211,0.00758,2.01725,1.162876,162.330819,797190400.0,0.497009
min,0.0,-75.558998,6.232,2.0,2.0,62.0,240000000.0,0.0
25%,17.75,-75.555,6.245394,3.0,2.0,89.25,277500000.0,0.0
50%,55.5,-75.552002,6.251,4.0,2.0,142.5,320000000.0,0.0
75%,89.0,-75.544692,6.257128,4.0,3.0,255.0,482500000.0,1.0
max,299.0,-75.53,6.26,12.0,6.0,680.0,3500000000.0,1.0


In [37]:
# Split the VILLA HERMOSA listings on the 'Apartamento' flag (1 vs 0) and report each count
apartment_flag = villa_hermosa['Apartamento']
apto_villa = villa_hermosa.loc[apartment_flag == 1]
casa_villa = villa_hermosa.loc[apartment_flag == 0]
print('Numero de apatamentos: ', len(apto_villa))
print('Numero de casas: ', len(casa_villa))

Numero de apatamentos:  32
Numero de casas:  44


In [39]:
# Price against total surface for VILLA HERMOSA, one marker series per subset
marker_series = [
    ('Apartamentos', apto_villa, '#ffd000'),
    ('Casas', casa_villa, '#1caeb0'),
]
trace1, trace2 = [
    go.Scatter(x = subset['surface_total'], y = subset['price'], mode = 'markers',
               name = label, marker_color = color)
    for label, subset, color in marker_series
]
layout = go.Layout(
    title = 'Superficie Total vs Precio (VILLA HERMOSA)',
    xaxis_title = 'Superficie Total',
    yaxis_title = 'Precio COP',
)
fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

## Observaciones

##### Primeras Observaciones:
 - Al ser tan pequeño y estar los datos tan esparcidos, no estoy seguro de si este conjunto de datos es representativo. INVESTIGAR.
 - Conjuntos de tipos de viviendas estan balanceados
 - Dos datos outliers, no estaria seguro de afirmar si son casas fincas o no. 

## San Antonio de Prado

In [40]:
# Listings located in SAN ANTONIO DE PRADO, with every district indicator column removed
district_flags = [
    'ALTAVISTA', 'ARANJUEZ', 'BELEN', 'BUENOS AIRES', 'CASTILLA',
    'DOCE DE OCTUBRE', 'EL POBLADO', 'GUAYABAL', 'LA AMERICA',
    'LA CANDELARIA', 'LAURELES', 'MANRIQUE', 'PALMITAS', 'POPULAR',
    'ROBLEDO', 'SAN ANTONIO DE PRADO', 'SAN CRISTOBAL', 'SAN JAVIER',
    'SANTA CRUZ', 'SANTA ELENA', 'VILLA HERMOSA',
]
in_district = df_real['SAN ANTONIO DE PRADO'] == 1
san_antonio = df_real[in_district].drop(columns = district_flags)
san_antonio.describe()

Unnamed: 0,delta_time,lon,lat,bedrooms,bathrooms,surface_total,price,Apartamento
count,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0
mean,104.833333,-75.661065,6.196801,3.75,3.0,255.083333,670000000.0,0.5
std,79.916813,0.011247,0.011345,1.05529,1.3484,347.258996,658137900.0,0.522233
min,13.0,-75.683998,6.175,2.0,1.0,69.0,280000000.0,0.0
25%,64.5,-75.665459,6.18725,3.75,2.0,96.75,330000000.0,0.0
50%,84.0,-75.665333,6.204028,4.0,3.5,121.0,415000000.0,0.5
75%,125.75,-75.654999,6.204304,4.0,4.0,149.5,555000000.0,1.0
max,299.0,-75.64,6.21,6.0,5.0,1200.0,2500000000.0,1.0


In [41]:
# Split the SAN ANTONIO DE PRADO listings on the 'Apartamento' flag (1 vs 0) and report each count
apartment_flag = san_antonio['Apartamento']
apto_santonio = san_antonio.loc[apartment_flag == 1]
casa_santonio = san_antonio.loc[apartment_flag == 0]
print('Numero de apatamentos: ', len(apto_santonio))
print('Numero de casas: ', len(casa_santonio))

Numero de apatamentos:  6
Numero de casas:  6


In [43]:
# Price against total surface for SAN ANTONIO DE PRADO, one marker series per subset
marker_series = [
    ('Apartamentos', apto_santonio, '#ffd000'),
    ('Casas', casa_santonio, '#1caeb0'),
]
trace1, trace2 = [
    go.Scatter(x = subset['surface_total'], y = subset['price'], mode = 'markers',
               name = label, marker_color = color)
    for label, subset, color in marker_series
]
layout = go.Layout(
    title = 'Superficie Total vs Precio (SAN ANTONIO DE PRADO)',
    xaxis_title = 'Superficie Total',
    yaxis_title = 'Precio COP',
)
fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

## Observaciones

##### Primeras Observaciones:
 - Conjunto de datos con un tamaño nada suficiente para ser representativo, y ademas, con outliers.

## Buenos Aires

In [45]:
# Listings located in BUENOS AIRES, with every district indicator column removed
district_flags = [
    'ALTAVISTA', 'ARANJUEZ', 'BELEN', 'BUENOS AIRES', 'CASTILLA',
    'DOCE DE OCTUBRE', 'EL POBLADO', 'GUAYABAL', 'LA AMERICA',
    'LA CANDELARIA', 'LAURELES', 'MANRIQUE', 'PALMITAS', 'POPULAR',
    'ROBLEDO', 'SAN ANTONIO DE PRADO', 'SAN CRISTOBAL', 'SAN JAVIER',
    'SANTA CRUZ', 'SANTA ELENA', 'VILLA HERMOSA',
]
in_district = df_real['BUENOS AIRES'] == 1
buenos_aires = df_real[in_district].drop(columns = district_flags)
buenos_aires.describe()

Unnamed: 0,delta_time,lon,lat,bedrooms,bathrooms,surface_total,price,Apartamento
count,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0
mean,87.429213,-75.560526,6.223623,3.804494,2.997753,278.516854,679572800.0,0.483146
std,51.156821,0.004819,0.008409,1.57113,1.387685,422.714424,637463600.0,0.500278
min,0.0,-75.566002,6.215319,1.0,1.0,12.0,239000000.0,0.0
25%,49.0,-75.563004,6.217,3.0,2.0,87.0,310000000.0,0.0
50%,85.0,-75.563004,6.219,3.0,3.0,147.0,424165000.0,0.0
75%,119.0,-75.559998,6.228309,4.0,4.0,270.0,780000000.0,1.0
max,368.0,-75.539,6.244,10.0,10.0,4000.0,3500000000.0,1.0


In [46]:
# Split the BUENOS AIRES listings on the 'Apartamento' flag (1 vs 0) and report each count
apartment_flag = buenos_aires['Apartamento']
apto_baires = buenos_aires.loc[apartment_flag == 1]
casa_baires = buenos_aires.loc[apartment_flag == 0]
print('Numero de apatamentos: ', len(apto_baires))
print('Numero de casas: ', len(casa_baires))

Numero de apatamentos:  215
Numero de casas:  230


In [48]:
# Price against total surface for BUENOS AIRES, one marker series per subset
marker_series = [
    ('Apartamentos', apto_baires, '#ffd000'),
    ('Casas', casa_baires, '#1caeb0'),
]
trace1, trace2 = [
    go.Scatter(x = subset['surface_total'], y = subset['price'], mode = 'markers',
               name = label, marker_color = color)
    for label, subset, color in marker_series
]
layout = go.Layout(
    title = 'Superficie Total vs Precio (BUENOS AIRES)',
    xaxis_title = 'Superficie Total',
    yaxis_title = 'Precio COP',
)
fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

## Observaciones

##### Primeras Observaciones:
 - Caso parecido al de LA CANDELARIA. No se si el conjunto de datos es lo suficientemente grande para ser representativo, sin embargo, se logra ver una agrupacion de datos en la parte inferior izquierda.
 - Outliers por posible mala digitacion de datos

## El Poblado

In [49]:
# Listings located in EL POBLADO, with every district indicator column removed
district_flags = [
    'ALTAVISTA', 'ARANJUEZ', 'BELEN', 'BUENOS AIRES', 'CASTILLA',
    'DOCE DE OCTUBRE', 'EL POBLADO', 'GUAYABAL', 'LA AMERICA',
    'LA CANDELARIA', 'LAURELES', 'MANRIQUE', 'PALMITAS', 'POPULAR',
    'ROBLEDO', 'SAN ANTONIO DE PRADO', 'SAN CRISTOBAL', 'SAN JAVIER',
    'SANTA CRUZ', 'SANTA ELENA', 'VILLA HERMOSA',
]
in_district = df_real['EL POBLADO'] == 1
poblado = df_real[in_district].drop(columns = district_flags)
poblado.describe()

Unnamed: 0,delta_time,lon,lat,bedrooms,bathrooms,surface_total,price,Apartamento
count,2380.0,2380.0,2380.0,2380.0,2380.0,2380.0,2380.0,2380.0
mean,75.208403,-75.567305,6.202706,3.297059,3.497059,386.276891,826680900.0,0.639496
std,66.034582,0.007044,0.012351,1.142437,1.336427,5738.033171,600888800.0,0.480248
min,0.0,-75.582,6.177,1.0,1.0,28.0,239000000.0,0.0
25%,28.75,-75.571999,6.193,3.0,2.0,105.0,450000000.0,0.0
50%,65.0,-75.568,6.203,3.0,3.0,155.0,650000000.0,1.0
75%,91.0,-75.563577,6.213,4.0,4.0,250.0,950000000.0,1.0
max,454.0,-75.549,6.23,12.0,10.0,198000.0,4500000000.0,1.0


In [51]:
# Split the EL POBLADO listings on the 'Apartamento' flag (1 vs 0) and report each count
apartment_flag = poblado['Apartamento']
apto_poblado = poblado.loc[apartment_flag == 1]
casa_poblado = poblado.loc[apartment_flag == 0]
print('Numero de apatamentos: ', len(apto_poblado))
print('Numero de casas: ', len(casa_poblado))

Numero de apatamentos:  1522
Numero de casas:  858


In [52]:
# Price against total surface for EL POBLADO, one marker series per subset
marker_series = [
    ('Apartamentos', apto_poblado, '#ffd000'),
    ('Casas', casa_poblado, '#1caeb0'),
]
trace1, trace2 = [
    go.Scatter(x = subset['surface_total'], y = subset['price'], mode = 'markers',
               name = label, marker_color = color)
    for label, subset, color in marker_series
]
layout = go.Layout(
    title = 'Superficie Total vs Precio (EL POBLADO)',
    xaxis_title = 'Superficie Total',
    yaxis_title = 'Precio COP',
)
fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

## Observaciones

##### Primeras Observaciones:
 - Conjunto de datos lo suficientemente grande para ser representativo.
 - No se puede analizar el conjunto de datos debido a un outlier que evidentemente esta mal digitado. 

# Analisis de Bajos de Superficie Total

Comunas analizadas: ARANJUEZ, MANRIQUE, POPULAR y SAN JAVIER.

## Aranjuez

In [54]:
# Listings located in ARANJUEZ, with every district indicator column removed
district_flags = [
    'ALTAVISTA', 'ARANJUEZ', 'BELEN', 'BUENOS AIRES', 'CASTILLA',
    'DOCE DE OCTUBRE', 'EL POBLADO', 'GUAYABAL', 'LA AMERICA',
    'LA CANDELARIA', 'LAURELES', 'MANRIQUE', 'PALMITAS', 'POPULAR',
    'ROBLEDO', 'SAN ANTONIO DE PRADO', 'SAN CRISTOBAL', 'SAN JAVIER',
    'SANTA CRUZ', 'SANTA ELENA', 'VILLA HERMOSA',
]
in_district = df_real['ARANJUEZ'] == 1
aranjuez = df_real[in_district].drop(columns = district_flags)
aranjuez.describe()

Unnamed: 0,delta_time,lon,lat,bedrooms,bathrooms,surface_total,price,Apartamento
count,87.0,87.0,87.0,87.0,87.0,87.0,87.0,87.0
mean,68.83908,-75.560863,6.272873,4.068966,2.275862,149.310345,337508100.0,0.425287
std,43.110694,0.004711,0.00614,1.538597,0.757745,73.49819,101563300.0,0.497253
min,2.0,-75.569,6.263905,1.0,1.0,55.0,240000000.0,0.0
25%,32.0,-75.566002,6.268,3.0,2.0,95.5,270000000.0,0.0
50%,71.0,-75.559,6.271,4.0,2.0,133.0,300000000.0,0.0
75%,92.0,-75.556529,6.277954,5.0,3.0,160.0,385000000.0,1.0
max,189.0,-75.554701,6.286238,9.0,5.0,430.0,900000000.0,1.0


In [55]:
# Split the Aranjuez listings by the 'Apartamento' indicator:
# rows flagged 1 go to the apartment subset, rows flagged 0 to the house subset.
apto_aranjuez = aranjuez.loc[aranjuez['Apartamento'].eq(1)]
casa_aranjuez = aranjuez.loc[aranjuez['Apartamento'].eq(0)]
# Report how many listings fell into each subset.
print('Numero de apatamentos: ', len(apto_aranjuez))
print('Numero de casas: ', len(casa_aranjuez))

Numero de apatamentos:  37
Numero de casas:  50


In [57]:
# Scatter plot of total surface against price for Aranjuez,
# with one marker trace per property type so they can be told apart by color.
trace1 = go.Scatter(
    x = apto_aranjuez['surface_total'],
    y = apto_aranjuez['price'],
    mode = 'markers',
    name = 'Apartamentos',
    marker = dict(color = '#ffd000'),
)
trace2 = go.Scatter(
    x = casa_aranjuez['surface_total'],
    y = casa_aranjuez['price'],
    mode = 'markers',
    name = 'Casas',
    marker = dict(color = '#1caeb0'),
)
layout = go.Layout(
    title = 'Superficie Total vs Precio (ARANJUEZ)',
    xaxis_title = 'Superficie Total',
    yaxis_title = 'Precio COP',
)
fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

## Observaciones

##### Primeras Observaciones:
 - Conjunto de datos pequeño. Presencia de dos outliers; el de precio está mal catalogado. Conjunto de datos medianamente balanceado.

## Manrique

In [59]:
# Filtering
manrique = df_real.loc[df_real['MANRIQUE'] == 1].drop(columns = ['ALTAVISTA', 'ARANJUEZ', 'BELEN', 
                                                                       'BUENOS AIRES','CASTILLA', 'DOCE DE OCTUBRE',
                                                                       'EL POBLADO', 
                                                                       'GUAYABAL', 'LA AMERICA', 'LA CANDELARIA', 
                                                                       'LAURELES', 'PALMITAS', 'POPULAR', 'ROBLEDO', 
                                                                       'MANRIQUE', 'SAN ANTONIO DE PRADO',
                                                                       'SAN CRISTOBAL', 
                                                                       'SAN JAVIER', 'SANTA CRUZ', 'SANTA ELENA',
                                                                       'VILLA HERMOSA'])
manrique.describe()

Unnamed: 0,delta_time,lon,lat,bedrooms,bathrooms,surface_total,price,Apartamento
count,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0
mean,74.294118,-75.549476,6.270787,4.0,2.627451,178.490196,456490200.0,0.411765
std,63.659184,0.003842,0.006087,1.6,0.958348,103.471421,308055500.0,0.49705
min,2.0,-75.554337,6.263,1.0,1.0,62.0,240000000.0,0.0
25%,24.5,-75.553081,6.266535,3.0,2.0,115.5,266000000.0,0.0
50%,78.0,-75.550377,6.269,4.0,2.0,162.0,350000000.0,0.0
75%,90.5,-75.545825,6.274367,5.0,3.0,215.0,572500000.0,1.0
max,299.0,-75.542,6.284,10.0,5.0,670.0,2100000000.0,1.0


In [60]:
# Split the Manrique listings by the 'Apartamento' indicator:
# rows flagged 1 go to the apartment subset, rows flagged 0 to the house subset.
apto_manrique = manrique.loc[manrique['Apartamento'].eq(1)]
casa_manrique = manrique.loc[manrique['Apartamento'].eq(0)]
# Report how many listings fell into each subset.
print('Numero de apatamentos: ', len(apto_manrique))
print('Numero de casas: ', len(casa_manrique))

Numero de apatamentos:  21
Numero de casas:  30


In [62]:
# Scatter plot of total surface against price for Manrique,
# with one marker trace per property type so they can be told apart by color.
trace1 = go.Scatter(
    x = apto_manrique['surface_total'],
    y = apto_manrique['price'],
    mode = 'markers',
    name = 'Apartamentos',
    marker = dict(color = '#ffd000'),
)
trace2 = go.Scatter(
    x = casa_manrique['surface_total'],
    y = casa_manrique['price'],
    mode = 'markers',
    name = 'Casas',
    marker = dict(color = '#1caeb0'),
)
layout = go.Layout(
    title = 'Superficie Total vs Precio (MANRIQUE)',
    xaxis_title = 'Superficie Total',
    yaxis_title = 'Precio COP',
)
fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

## Observaciones

##### Primeras Observaciones:
 - Conjunto de datos bastante pequeño y ligeramente agrupado; tres outliers, mal catalogados o casas finca.

## Popular

In [63]:
# Filtering
popular = df_real.loc[df_real['POPULAR'] == 1].drop(columns = ['ALTAVISTA', 'ARANJUEZ', 'BELEN', 
                                                                       'BUENOS AIRES','CASTILLA', 'DOCE DE OCTUBRE',
                                                                       'EL POBLADO', 
                                                                       'GUAYABAL', 'LA AMERICA', 'LA CANDELARIA', 
                                                                       'LAURELES', 'PALMITAS', 'POPULAR', 'ROBLEDO', 
                                                                       'MANRIQUE', 'SAN ANTONIO DE PRADO',
                                                                       'SAN CRISTOBAL', 
                                                                       'SAN JAVIER', 'SANTA CRUZ', 'SANTA ELENA',
                                                                       'VILLA HERMOSA'])
popular.describe()

Unnamed: 0,delta_time,lon,lat,bedrooms,bathrooms,surface_total,price,Apartamento
count,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
mean,55.0,-75.54775,6.291943,4.636364,2.818182,213.727273,427727300.0,0.181818
std,41.494578,0.005149,0.004847,2.90767,0.603023,121.399416,220730200.0,0.40452
min,8.0,-75.554352,6.285762,1.0,2.0,68.0,265000000.0,0.0
25%,16.0,-75.552224,6.287951,3.0,2.5,113.5,290000000.0,0.0
50%,62.0,-75.548,6.295,3.0,3.0,164.0,335000000.0,0.0
75%,77.0,-75.543,6.295,6.5,3.0,328.5,415000000.0,0.0
max,140.0,-75.542,6.298,9.0,4.0,356.0,860000000.0,1.0


In [64]:
# Split the Popular listings by the 'Apartamento' indicator:
# rows flagged 1 go to the apartment subset, rows flagged 0 to the house subset.
apto_popular = popular.loc[popular['Apartamento'].eq(1)]
casa_popular = popular.loc[popular['Apartamento'].eq(0)]
# Report how many listings fell into each subset.
print('Numero de apatamentos: ', len(apto_popular))
print('Numero de casas: ', len(casa_popular))

Numero de apatamentos:  2
Numero de casas:  9


In [65]:
# Scatter plot of total surface against price for Popular,
# with one marker trace per property type so they can be told apart by color.
trace1 = go.Scatter(
    x = apto_popular['surface_total'],
    y = apto_popular['price'],
    mode = 'markers',
    name = 'Apartamentos',
    marker = dict(color = '#ffd000'),
)
trace2 = go.Scatter(
    x = casa_popular['surface_total'],
    y = casa_popular['price'],
    mode = 'markers',
    name = 'Casas',
    marker = dict(color = '#1caeb0'),
)
layout = go.Layout(
    title = 'Superficie Total vs Precio (POPULAR)',
    xaxis_title = 'Superficie Total',
    yaxis_title = 'Precio COP',
)
fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

## Observaciones

##### Primeras Observaciones:
 - Conjunto de datos para nada representativo, tamaño indudablemente insuficiente y con outliers. NO SIRVE.

## San Javier

In [66]:
# Filtering
san_javier = df_real.loc[df_real['SAN JAVIER'] == 1].drop(columns = ['ALTAVISTA', 'ARANJUEZ', 'BELEN', 
                                                                       'BUENOS AIRES','CASTILLA', 'DOCE DE OCTUBRE',
                                                                       'EL POBLADO', 
                                                                       'GUAYABAL', 'LA AMERICA', 'LA CANDELARIA', 
                                                                       'LAURELES', 'PALMITAS', 'POPULAR', 'ROBLEDO', 
                                                                       'MANRIQUE', 'SAN ANTONIO DE PRADO',
                                                                       'SAN CRISTOBAL', 
                                                                       'SAN JAVIER', 'SANTA CRUZ', 'SANTA ELENA',
                                                                       'VILLA HERMOSA'])
san_javier.describe()

Unnamed: 0,delta_time,lon,lat,bedrooms,bathrooms,surface_total,price,Apartamento
count,46.0,46.0,46.0,46.0,46.0,46.0,46.0,46.0
mean,72.934783,-75.613127,6.256933,3.782609,2.673913,282.456522,411913000.0,0.391304
std,64.144768,0.006725,0.005239,1.45927,1.136356,703.284286,325575600.0,0.493435
min,4.0,-75.631,6.247,1.0,1.0,54.0,240000000.0,0.0
25%,17.5,-75.617996,6.254002,3.0,2.0,82.0,255000000.0,0.0
50%,66.0,-75.615499,6.2565,3.0,2.0,143.0,315000000.0,0.0
75%,97.0,-75.608251,6.261,4.75,3.0,192.75,398750000.0,1.0
max,295.0,-75.601,6.267296,8.0,5.0,4553.0,2300000000.0,1.0


In [67]:
# Split the San Javier listings by the 'Apartamento' indicator:
# rows flagged 1 go to the apartment subset, rows flagged 0 to the house subset.
apto_sjavier = san_javier.loc[san_javier['Apartamento'].eq(1)]
casa_sjavier = san_javier.loc[san_javier['Apartamento'].eq(0)]
# Report how many listings fell into each subset.
print('Numero de apatamentos: ', len(apto_sjavier))
print('Numero de casas: ', len(casa_sjavier))

Numero de apatamentos:  18
Numero de casas:  28


In [68]:
# Scatter plot of total surface against price for San Javier,
# with one marker trace per property type so they can be told apart by color.
trace1 = go.Scatter(
    x = apto_sjavier['surface_total'],
    y = apto_sjavier['price'],
    mode = 'markers',
    name = 'Apartamentos',
    marker = dict(color = '#ffd000'),
)
trace2 = go.Scatter(
    x = casa_sjavier['surface_total'],
    y = casa_sjavier['price'],
    mode = 'markers',
    name = 'Casas',
    marker = dict(color = '#1caeb0'),
)
layout = go.Layout(
    title = 'Superficie Total vs Precio (SAN JAVIER)',
    xaxis_title = 'Superficie Total',
    yaxis_title = 'Precio COP',
)
fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

## Observaciones

##### Primeras Observaciones:
 - Conjunto de datos pequeño, con una clara agrupación y outliers claramente mal digitados. No está balanceado.