In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from properatti.csv; the first CSV column becomes the row index.
data = pd.read_csv("./properatti.csv", index_col=0)
# Show (rows, columns) as a quick sanity check of the load.
data.shape

(121220, 25)

In [3]:
# Configure pandas so that column values are not truncated when displayed
pd.set_option('display.max_colwidth', None)

In [4]:
# Two named column groups so later cells can refer to them more easily
places = [
    'place_name',
    'place_with_parent_names',
    'country_name',
    'state_name',
]
geolocation = ['geonames_id', 'lat-lon', 'lat', 'lon']

# Independent copy holding only the location-related columns
location_data = data.loc[:, [*places, *geolocation]].copy()
location_data.head()

Unnamed: 0,place_name,place_with_parent_names,country_name,state_name,geonames_id,lat-lon,lat,lon
0,Mataderos,|Argentina|Capital Federal|Mataderos|,Argentina,Capital Federal,3430787.0,"-34.6618237,-58.5088387",-34.661824,-58.508839
1,La Plata,|Argentina|Bs.As. G.B.A. Zona Sur|La Plata|,Argentina,Bs.As. G.B.A. Zona Sur,3432039.0,"-34.9038831,-57.9643295",-34.903883,-57.96433
2,Mataderos,|Argentina|Capital Federal|Mataderos|,Argentina,Capital Federal,3430787.0,"-34.6522615,-58.5229825",-34.652262,-58.522982
3,Liniers,|Argentina|Capital Federal|Liniers|,Argentina,Capital Federal,3431333.0,"-34.6477969,-58.5164244",-34.647797,-58.516424
4,Centro,|Argentina|Buenos Aires Costa Atlántica|Mar del Plata|Centro|,Argentina,Buenos Aires Costa Atlántica,3435548.0,"-38.0026256,-57.5494468",-38.002626,-57.549447


## Corroborar si las columnas places* son exactamente iguales 
___

*columnas places = ['place_name','place_with_parent_names', 'country_name','state_name']

In [5]:
# Convert place_with_parent_names ("|a|b|c|") into a list of its levels using "|" as separator.
# The raw strings are read from `data` (not from `location_data`) so this cell is idempotent:
# re-running it never tries to strip/split values that were already turned into lists.
# The vectorized .str accessors also leave missing values as NaN instead of raising.
location_data['place_with_parent_names'] = (
    data['place_with_parent_names'].str.strip("|").str.split("|")
)
location_data['place_with_parent_names'].head()

0                             [Argentina, Capital Federal, Mataderos]
1                       [Argentina, Bs.As. G.B.A. Zona Sur, La Plata]
2                             [Argentina, Capital Federal, Mataderos]
3                               [Argentina, Capital Federal, Liniers]
4    [Argentina, Buenos Aires Costa Atlántica, Mar del Plata, Centro]
Name: place_with_parent_names, dtype: object

In [6]:
def is_location_different(row):
    """Report whether place_with_parent_names disagrees with the individual columns.

    A row is considered matching only when its place_with_parent_names value has
    exactly three elements that equal, in order, country_name, state_name and
    place_name.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'place_with_parent_names', 'country_name', 'state_name' and
            'place_name'.

    Returns:
        False when the three elements match the three columns exactly,
        True otherwise (including when the list does not have 3 elements).
    """
    hierarchy = row['place_with_parent_names']

    # Anything other than exactly three levels cannot be an exact match
    if len(hierarchy) != 3:
        return True

    # Check each level in order and bail out on the first disagreement
    if row['country_name'] != hierarchy[0]:
        return True
    if row['state_name'] != hierarchy[1]:
        return True
    if row['place_name'] != hierarchy[2]:
        return True
    return False

In [7]:
# Build the mismatch mask by applying is_location_different to every row
mask = location_data.apply(is_location_different, axis=1)
print("place_with_parent_names difference with country_name, state_name and place_name:", int(mask.sum()))
# Fixed seed so the displayed sample is the same on every Restart & Run All
location_data.loc[mask, places].sample(10, random_state=42)

place_with_parent_names difference with country_name, state_name and place_name: 45220


Unnamed: 0,place_name,place_with_parent_names,country_name,state_name
34480,Tigre,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre, Tigre]",Argentina,Bs.As. G.B.A. Zona Norte
37773,Barrio La Alameda,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre, Nordelta, Barrio La Alameda]",Argentina,Bs.As. G.B.A. Zona Norte
61264,Villa Madero,"[Argentina, Bs.As. G.B.A. Zona Oeste, La Matanza, Villa Madero]",Argentina,Bs.As. G.B.A. Zona Oeste
78396,La Plata,"[Argentina, Bs.As. G.B.A. Zona Sur, La Plata, La Plata]",Argentina,Bs.As. G.B.A. Zona Sur
111976,Córdoba,"[Argentina, Córdoba]",Argentina,Córdoba
68331,Banfield,"[Argentina, Bs.As. G.B.A. Zona Sur, Lomas de Zamora, Banfield]",Argentina,Bs.As. G.B.A. Zona Sur
12215,Playa Grande,"[Argentina, Buenos Aires Costa Atlántica, Mar del Plata, Playa Grande]",Argentina,Buenos Aires Costa Atlántica
66225,San Carlos de Bariloche,"[Argentina, Río Negro, San Carlos de Bariloche, San Carlos de Bariloche]",Argentina,Río Negro
18757,Belén de Escobar,"[Argentina, Bs.As. G.B.A. Zona Norte, Escobar, Belén de Escobar]",Argentina,Bs.As. G.B.A. Zona Norte
76479,Lanús,"[Argentina, Bs.As. G.B.A. Zona Sur, Lanús, Lanús]",Argentina,Bs.As. G.B.A. Zona Sur


*Conclusión: Existen 45220 registros que no coinciden exactamente.* 

### Análisis de los registros que no coinciden exactamente en las columnas places
___

In [8]:
# Count how many rows have each number of elements in the place_with_parent_names list
(
    location_data['place_with_parent_names']
    .map(len)
    .value_counts()
)

3    76023
4    39869
2     4780
5      548
Name: place_with_parent_names, dtype: int64

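La mayoría de los registros que no coinciden exactamente (45197 de 45220) son aquellos cuyas listas no tienen 3 valores; los 23 restantes tienen 3 valores pero difieren en alguna de las columnas. Revisamos que los valores que están en cada lista tengan su contraparte en las otras columnas.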

In [9]:
# country_name: count nulls, then rows whose country is absent from the hierarchy list
print("null values:", location_data['country_name'].isnull().sum())
mask = location_data.apply(
    lambda row: row['country_name'] not in row['place_with_parent_names'],
    axis=1,
)
print("place_with_parent_names difference with country_name:", len(location_data[mask]))

null values: 0
place_with_parent_names difference with country_name: 0


In [10]:
# state_name: count nulls, then rows whose state is absent from the hierarchy list
print("null values:", location_data['state_name'].isnull().sum())
mask = location_data.apply(
    lambda row: row['state_name'] not in row['place_with_parent_names'],
    axis=1,
)
print("place_with_parent_names difference with state_name:", len(location_data[mask]))

null values: 0
place_with_parent_names difference with state_name: 0


In [11]:
# place_name: count nulls, then rows whose place is absent from the hierarchy list
print("null values:", location_data['place_name'].isnull().sum())
mask = location_data.apply(
    lambda row: row['place_name'] not in row['place_with_parent_names'],
    axis=1,
)
print("place_with_parent_names difference with place_name:", len(location_data[mask]))


null values: 23
place_with_parent_names difference with place_name: 23


*Conclusión: La información que figura en las columnas place_name, country_name y state_name se encuentra en la columna place_with_parent_names aunque no exactamente igual (parece existir información extra) excepto en la columna 'place_name' con sus 23 NaNs.*

#### Análisis de registros de place_with_parent_names con 3 valores
___

In [12]:
# Keep only the rows whose place_with_parent_names list has exactly three elements
mask = location_data['place_with_parent_names'].map(len) == 3
location_data_3_elements = location_data.loc[mask].copy()
location_data_3_elements.shape

(76023, 8)

In [13]:
# Frequency of the third element of the list among the three-element rows
(
    location_data_3_elements['place_with_parent_names']
    .map(lambda levels: levels[2])
    .value_counts()
)

Rosario               8504
Córdoba               6606
Mar del Plata         6534
Belgrano              2992
Palermo               2885
                      ... 
El Dorado                1
Banda del Río Salí       1
Tancacha                 1
Oliveros                 1
Malabrigo                1
Name: place_with_parent_names, Length: 527, dtype: int64

In [14]:
# Third-element frequencies, then only the values that appear more than 600 times
cities_list = (
    location_data_3_elements['place_with_parent_names']
    .map(lambda levels: levels[2])
    .value_counts()
)
cities_list.loc[cities_list.gt(600)]

Rosario             8504
Córdoba             6606
Mar del Plata       6534
Belgrano            2992
Palermo             2885
Tigre               2382
Caballito           2273
Pilar               1860
Villa Urquiza       1632
Recoleta            1547
Flores              1354
Villa Crespo        1331
San Telmo           1216
Almagro             1165
Barrio Norte        1140
Escobar             1037
Ituzaingó            982
Pinamar              932
Boedo                890
Nuñez                778
La Plata             767
Punilla              713
Balvanera            667
Puerto Madero        647
San Isidro           641
San Cristobal        614
Villa Carlos Paz     612
Name: place_with_parent_names, dtype: int64

In [15]:
# Check that the value "Capital Federal" never appears in the place_name column
location_data_3_elements['place_name'].eq("Capital Federal").sum()

0

In [16]:
# Check that the value "Capital Federal" never appears as place_with_parent_names[2]
(
    location_data_3_elements['place_with_parent_names']
    .map(lambda levels: levels[2])
    .eq("Capital Federal")
    .sum()
)

0

In [17]:
# Rows with a non-null place_name and a state other than "Capital Federal",
# i.e. the ones where place_name == municipality
location_data_3_elements_clean_sin_cp = location_data_3_elements.loc[
    location_data_3_elements['place_name'].notna()
    & location_data_3_elements['state_name'].ne("Capital Federal")
].copy()
len(location_data_3_elements_clean_sin_cp)

46179

In [18]:
# Rows whose state_name is "Capital Federal"
location_data_3_elements_capital = location_data_3_elements.loc[
    location_data_3_elements['state_name'].eq("Capital Federal")
].copy()
len(location_data_3_elements_capital)

29821

In [19]:
# Rows whose place_name is not NaN
location_data_3_elements_clean = location_data_3_elements.loc[
    location_data_3_elements['place_name'].notna()
].copy()
len(location_data_3_elements_clean)

76000

*Conclusión: En los registros que tienen 3 valores, para las localidades con más de 600 entradas se utiliza el municipio como 'place_name', excepto en Capital Federal, donde se utiliza el barrio.*
- place_name asignado a municipio: 46179
- place_name asignado a Capital Federal: 29821
- place_name asignado a municipio y Capital Federal: 76000
- place_name asignado a NaN: 23


In [20]:
# Look at the rows behind the null place_name values: they are selected as the rows
# whose place_name is not found inside place_with_parent_names
mask = location_data.apply(
    lambda row: row['place_name'] not in row['place_with_parent_names'],
    axis=1,
)
location_data_place_name_nan = location_data.loc[mask].copy()
location_data_place_name_nan.head()

Unnamed: 0,place_name,place_with_parent_names,country_name,state_name,geonames_id,lat-lon,lat,lon
6489,,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre]",Argentina,Bs.As. G.B.A. Zona Norte,,"-34.4008968545,-58.63809847",-34.400897,-58.638098
10201,,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre]",Argentina,Bs.As. G.B.A. Zona Norte,,"-34.400704,-58.638067",-34.400704,-58.638067
11451,,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre]",Argentina,Bs.As. G.B.A. Zona Norte,,"-34.4026880384,-58.6262613379",-34.402688,-58.626261
14839,,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre]",Argentina,Bs.As. G.B.A. Zona Norte,,"-34.4007994779,-58.6381735719",-34.400799,-58.638174
18622,,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre]",Argentina,Bs.As. G.B.A. Zona Norte,,"-34.4032781195,-58.6263503945",-34.403278,-58.62635


In [21]:
# Frequency of the third list element among the rows selected in the previous cell
(
    location_data_place_name_nan['place_with_parent_names']
    .map(lambda levels: levels[2])
    .value_counts()
)

Tigre    23
Name: place_with_parent_names, dtype: int64

*Conclusión: Los registros que no tienen valor en place_name son los que no coinciden, todos del municipio de Tigre.*  





#### Análisis de registros de place_with_parent_names con 4 valores
___

In [22]:
# Keep only the rows whose place_with_parent_names list has exactly four elements
mask = location_data['place_with_parent_names'].map(len) == 4
location_data_4_elements = location_data.loc[mask].copy()
location_data_4_elements['place_with_parent_names'].shape


(39869,)

In [23]:
# Count the four-element rows by their second element
# (the first one is skipped because it is known to always be Argentina)
(
    location_data_4_elements['place_with_parent_names']
    .map(lambda levels: levels[1])
    .value_counts()
)


Bs.As. G.B.A. Zona Norte        17198
Bs.As. G.B.A. Zona Sur          11638
Bs.As. G.B.A. Zona Oeste         7058
Capital Federal                  1198
Buenos Aires Costa Atlántica     1177
Buenos Aires Interior             962
Río Negro                         638
Name: place_with_parent_names, dtype: int64

In [24]:
# Confirm that place_with_parent_names[1] matches the state_name column
mask = (
    location_data_4_elements['place_with_parent_names']
    .map(lambda levels: levels[1])
    .ne(location_data_4_elements['state_name'])
)
print("place_with_parent_names[1] difference with state_name:", len(location_data_4_elements[mask]))


place_with_parent_names[1] difference with state_name: 0


In [25]:
# Check how often place_with_parent_names[2] matches the place_name column
mask = location_data_4_elements['place_with_parent_names'].apply(lambda x: x[2]) != location_data_4_elements['place_name']
print("place_with_parent_names[2] difference with place_name:", location_data_4_elements[mask].shape[0])
print("place_with_parent_names[2] proportional difference with place_name:", location_data_4_elements[mask].shape[0] / location_data_4_elements.shape[0])
anti_mask = ~mask
# NOTE: despite its name, this frame holds the rows selected by ~mask, i.e. the rows
# where element [2] EQUALS place_name. The name is kept because other cells reference it.
location_data_4_elements_non_matching = location_data_4_elements[anti_mask].copy()
# Fixed seed so the displayed sample is the same on every Restart & Run All
location_data_4_elements_non_matching[places].sample(10, random_state=42)

place_with_parent_names[2] difference with place_name: 29842
place_with_parent_names[2] proportional difference with place_name: 0.7485013418947052


Unnamed: 0,place_name,place_with_parent_names,country_name,state_name
107745,La Plata,"[Argentina, Bs.As. G.B.A. Zona Sur, La Plata, La Plata]",Argentina,Bs.As. G.B.A. Zona Sur
40800,La Plata,"[Argentina, Bs.As. G.B.A. Zona Sur, La Plata, La Plata]",Argentina,Bs.As. G.B.A. Zona Sur
103496,Lomas de Zamora,"[Argentina, Bs.As. G.B.A. Zona Sur, Lomas de Zamora, Lomas de Zamora]",Argentina,Bs.As. G.B.A. Zona Sur
108441,La Plata,"[Argentina, Bs.As. G.B.A. Zona Sur, La Plata, La Plata]",Argentina,Bs.As. G.B.A. Zona Sur
41794,San Carlos de Bariloche,"[Argentina, Río Negro, San Carlos de Bariloche, San Carlos de Bariloche]",Argentina,Río Negro
84172,Lanús,"[Argentina, Bs.As. G.B.A. Zona Sur, Lanús, Lanús]",Argentina,Bs.As. G.B.A. Zona Sur
375,La Plata,"[Argentina, Bs.As. G.B.A. Zona Sur, La Plata, La Plata]",Argentina,Bs.As. G.B.A. Zona Sur
81485,Luján,"[Argentina, Buenos Aires Interior, Luján, Luján]",Argentina,Buenos Aires Interior
17042,La Plata,"[Argentina, Bs.As. G.B.A. Zona Sur, La Plata, La Plata]",Argentina,Bs.As. G.B.A. Zona Sur
101826,Lanús,"[Argentina, Bs.As. G.B.A. Zona Sur, Lanús, Lanús]",Argentina,Bs.As. G.B.A. Zona Sur


In [26]:
# They differ in 29842 records but match in the rest.
# Confirm that, for the frame built in the previous cell, element [2] equals element [3]
mask = location_data_4_elements_non_matching['place_with_parent_names'].map(
    lambda levels: levels[2] == levels[3]
)
len(location_data_4_elements_non_matching[mask]) / len(location_data_4_elements_non_matching)

1.0

In [27]:
# Confirm that place_with_parent_names[3] matches the place_name column
mask = (location_data_4_elements['place_with_parent_names'].apply(lambda x: x[3]) != location_data_4_elements['place_name'])
print("place_with_parent_names[3] difference with place_name:", location_data_4_elements[mask].shape[0])
# Show a few of the matching rows; fixed seed so the sample is the same on every run
location_data_4_elements[~mask][places].sample(10, random_state=42)

place_with_parent_names[3] difference with place_name: 0


Unnamed: 0,place_name,place_with_parent_names,country_name,state_name
19542,Chascomús,"[Argentina, Buenos Aires Interior, Chascomús, Chascomús]",Argentina,Buenos Aires Interior
99216,San Miguel,"[Argentina, Bs.As. G.B.A. Zona Norte, San Miguel, San Miguel]",Argentina,Bs.As. G.B.A. Zona Norte
77300,Adrogué,"[Argentina, Bs.As. G.B.A. Zona Sur, Almirante Brown, Adrogué]",Argentina,Bs.As. G.B.A. Zona Sur
1877,Nordelta,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre, Nordelta]",Argentina,Bs.As. G.B.A. Zona Norte
75882,Lanús,"[Argentina, Bs.As. G.B.A. Zona Sur, Lanús, Lanús]",Argentina,Bs.As. G.B.A. Zona Sur
80489,Luján,"[Argentina, Buenos Aires Interior, Luján, Luján]",Argentina,Buenos Aires Interior
36243,Haedo,"[Argentina, Bs.As. G.B.A. Zona Oeste, Morón, Haedo]",Argentina,Bs.As. G.B.A. Zona Oeste
81777,Nordelta,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre, Nordelta]",Argentina,Bs.As. G.B.A. Zona Norte
33866,Marcos Paz,"[Argentina, Bs.As. G.B.A. Zona Oeste, Marcos Paz, Marcos Paz]",Argentina,Bs.As. G.B.A. Zona Oeste
7867,Ramos Mejía,"[Argentina, Bs.As. G.B.A. Zona Oeste, La Matanza, Ramos Mejía]",Argentina,Bs.As. G.B.A. Zona Oeste


*Conclusión: De los 39869 registros con 4 valores, 29842 toman el valor de place_name del 4to valor de la lista (City). Los 10027 restantes toman el 3er valor (Municipality) que está duplicado con el 4to valor (City).*

- place_name asignado a municipio: 10027
- place_name asignado a ciudad: 29842

#### Análisis de registros de place_with_parent_names con 2 valores
___

In [28]:
# Keep only the rows whose place_with_parent_names list has exactly two elements
mask = location_data['place_with_parent_names'].map(len) == 2
location_data_2_elements = location_data.loc[mask].copy()
# NOTE(review): value_counts over list values relies on pandas tolerating
# unhashable objects — confirm on the pandas version in use.
location_data_2_elements['place_with_parent_names'].value_counts()

[Argentina, Córdoba]                         2648
[Argentina, Capital Federal]                 1297
[Argentina, Bs.As. G.B.A. Zona Norte]         222
[Argentina, Mendoza]                          130
[Argentina, Buenos Aires Interior]            106
[Argentina, Tucumán]                           77
[Argentina, Bs.As. G.B.A. Zona Oeste]          65
[Argentina, Misiones]                          44
[Argentina, Santa Fe]                          33
[Argentina, Buenos Aires Costa Atlántica]      27
[Argentina, San Luis]                          24
[Argentina, Bs.As. G.B.A. Zona Sur]            24
[Argentina, Salta]                             21
[Argentina, Chubut]                            17
[Argentina, Neuquén]                           10
[Argentina, Río Negro]                          8
[Argentina, San Juan]                           7
[Argentina, Tierra Del Fuego]                   6
[Argentina, Catamarca]                          3
[Argentina, Santiago Del Estero]                3


In [29]:
# Inspect the two-element place_with_parent_names rows whose second element is 'Córdoba'
mask = location_data_2_elements['place_with_parent_names'].apply(lambda x: x[1] == 'Córdoba')
location_data_2_elements[mask].head()

Unnamed: 0,place_name,place_with_parent_names,country_name,state_name,geonames_id,lat-lon,lat,lon
1011,Córdoba,"[Argentina, Córdoba]",Argentina,Córdoba,3860255.0,"-31.3995242,-64.1797594",-31.399524,-64.179759
1023,Córdoba,"[Argentina, Córdoba]",Argentina,Córdoba,3860255.0,"-31.4017668,-64.1645966",-31.401767,-64.164597
1521,Córdoba,"[Argentina, Córdoba]",Argentina,Córdoba,3860255.0,"-31.411331,-64.1641959",-31.411331,-64.164196
3762,Córdoba,"[Argentina, Córdoba]",Argentina,Córdoba,3860255.0,"-31.39231682,-64.4621048",-31.392317,-64.462105
3763,Córdoba,"[Argentina, Córdoba]",Argentina,Córdoba,3860255.0,"-31.42144203,-64.49765015",-31.421442,-64.49765


In [30]:
# It looks like every row with two elements in place_with_parent_names repeats
# the state name in place_name. Verify it: this should select no rows.
mask = location_data_2_elements['place_name'].ne(location_data_2_elements['state_name'])
location_data_2_elements.loc[mask].shape


(0, 8)

*Conclusión: En los 4780 registros de place_with_parent_names que tienen 2 valores se utilizó state_name como place_name.*



#### Análisis de registros de place_with_parent_names con 5 valores
___

In [31]:
# Keep only the rows whose place_with_parent_names list has exactly five elements
mask = location_data['place_with_parent_names'].map(len) == 5
location_data_5_elements = location_data.loc[mask].copy()
location_data_5_elements['place_with_parent_names'].shape

(548,)

In [32]:
# Count the five-element rows whose fourth element is not 'Nordelta'
(
    location_data_5_elements['place_with_parent_names']
    .map(lambda levels: levels[3])
    .ne('Nordelta')
    .sum()
)


0

In [33]:
# Check whether place_with_parent_names[2] matches the place_name column
mask = (
    location_data_5_elements['place_with_parent_names']
    .map(lambda levels: levels[2])
    .ne(location_data_5_elements['place_name'])
)
print("place_with_parent_names[2] difference with place_name:", len(location_data_5_elements[mask]))
print("place_with_parent_names[2] proportional difference with place_name:", len(location_data_5_elements[mask]) / len(location_data_5_elements))


place_with_parent_names[2] difference with place_name: 548
place_with_parent_names[2] proportional difference with place_name: 1.0


In [34]:
# Check whether place_with_parent_names[3] matches the place_name column
mask = (
    location_data_5_elements['place_with_parent_names']
    .map(lambda levels: levels[3])
    .ne(location_data_5_elements['place_name'])
)
print("place_with_parent_names[3] difference with place_name:", len(location_data_5_elements[mask]))
print("place_with_parent_names[3] proportional difference with place_name:", len(location_data_5_elements[mask]) / len(location_data_5_elements))


place_with_parent_names[3] difference with place_name: 548
place_with_parent_names[3] proportional difference with place_name: 1.0


In [35]:
# Check whether place_with_parent_names[4] matches the place_name column
mask = (
    location_data_5_elements['place_with_parent_names']
    .map(lambda levels: levels[4])
    .ne(location_data_5_elements['place_name'])
)
print("place_with_parent_names[4] difference with place_name:", len(location_data_5_elements[mask]))
print("place_with_parent_names[4] proportional difference with place_name:", len(location_data_5_elements[mask]) / len(location_data_5_elements))

place_with_parent_names[4] difference with place_name: 0
place_with_parent_names[4] proportional difference with place_name: 0.0


*Conclusión: Los 548 registros de place_with_parent_names que tienen 5 valores son de Nordelta y el último valor se refiere al Barrio. place_name toma los valores del barrio*


> ##### Conclusiones finales: 
> - Existen como máximo 5 tipos de registros en place_with_parent_names: Country, State, municipio, Ciudad, Barrio
> - Country y State son iguales a country_name y state_name
> - Place_name se le asigna el valor de municipio: 76000 + 10027 = 86027
> - Place_name se le asigna el valor de ciudad: 29842 (en  estos casos existe el valor municipio también)
> - Place_name se le asigna el valor de barrio: 548 (en estos casos existe el valor municipio y ciudad también)
> - Place_name se le asigna el valor de state_name: 4780 (en estos casos no existe el valor municipio y ciudad)
> - Place_name se le asigna el valor de NaN: 23 (en estos casos existe el valor municipio, Tigre, pero no el de ciudad)
> ___
> Por lo tanto, se puede concluir que, para dar mayor consistencia, es posible imputar place_name con el valor del municipio en la mayoría de los casos. Confrontar con geonames_id.
> ___








## Imputación de valores de place_name
___