In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load properatti.csv, using the file's first column as the DataFrame index
data = pd.read_csv("./properatti.csv", index_col=0)
# Cell output: (number of rows, number of columns)
data.shape

(121220, 25)

In [3]:
# Configure pandas so column values are not truncated when displayed
pd.set_option('display.max_colwidth', None)

In [4]:
# Two groups of column names, kept separate so each can be selected on its own later
places = ['place_name', 'place_with_parent_names', 'country_name', 'state_name']
geolocation = ['geonames_id', 'lat-lon', 'lat', 'lon']

# Work on an independent copy holding only the location-related columns
location_columns = [*places, *geolocation]
location_data = data.loc[:, location_columns].copy()
location_data.head()

Unnamed: 0,place_name,place_with_parent_names,country_name,state_name,geonames_id,lat-lon,lat,lon
0,Mataderos,|Argentina|Capital Federal|Mataderos|,Argentina,Capital Federal,3430787.0,"-34.6618237,-58.5088387",-34.661824,-58.508839
1,La Plata,|Argentina|Bs.As. G.B.A. Zona Sur|La Plata|,Argentina,Bs.As. G.B.A. Zona Sur,3432039.0,"-34.9038831,-57.9643295",-34.903883,-57.96433
2,Mataderos,|Argentina|Capital Federal|Mataderos|,Argentina,Capital Federal,3430787.0,"-34.6522615,-58.5229825",-34.652262,-58.522982
3,Liniers,|Argentina|Capital Federal|Liniers|,Argentina,Capital Federal,3431333.0,"-34.6477969,-58.5164244",-34.647797,-58.516424
4,Centro,|Argentina|Buenos Aires Costa Atlántica|Mar del Plata|Centro|,Argentina,Buenos Aires Costa Atlántica,3435548.0,"-38.0026256,-57.5494468",-38.002626,-57.549447


#### Corroborar si place_with_parent_names coincide con country_name, state_name, place_name 

In [5]:
# Turn place_with_parent_names into a list: drop the leading/trailing "|" and split on "|".
# The source is the untouched `data` frame (not location_data itself), where the column is
# still a string, so this cell can be re-run without failing on already-split lists.
location_data['place_with_parent_names'] = (
    data['place_with_parent_names']
    .str.strip("|")
    .str.split("|")
)
location_data['place_with_parent_names'].head()

0                             [Argentina, Capital Federal, Mataderos]
1                       [Argentina, Bs.As. G.B.A. Zona Sur, La Plata]
2                             [Argentina, Capital Federal, Mataderos]
3                               [Argentina, Capital Federal, Liniers]
4    [Argentina, Buenos Aires Costa Atlántica, Mar del Plata, Centro]
Name: place_with_parent_names, dtype: object

In [6]:
def is_location_different(row):
    """Tell whether place_with_parent_names disagrees with the individual columns.

    Parameters
    ----------
    row : mapping indexed by column name
        Must provide 'place_with_parent_names' (a sequence) and, when that
        sequence has three elements, 'country_name', 'state_name' and
        'place_name'.

    Returns
    -------
    bool
        False only when the sequence has exactly three elements that equal,
        in order, country_name, state_name and place_name; True otherwise.
    """
    parents = row['place_with_parent_names']

    # Anything other than exactly three levels cannot match the three columns
    if len(parents) != 3:
        return True

    # Compare level by level, stopping at the first disagreement
    columns = ('country_name', 'state_name', 'place_name')
    return not all(row[column] == level for column, level in zip(columns, parents))

In [7]:
# Build the boolean mask by running is_location_different on every row
mask = location_data.apply(is_location_different, axis=1)
# Summing a boolean mask counts the flagged rows
print("place_with_parent_names difference with country_name, state_name and place_name:", mask.sum())
location_data.loc[mask, places].head()

place_with_parent_names difference with country_name, state_name and place_name: 45220


Unnamed: 0,place_name,place_with_parent_names,country_name,state_name
4,Centro,"[Argentina, Buenos Aires Costa Atlántica, Mar del Plata, Centro]",Argentina,Buenos Aires Costa Atlántica
6,Munro,"[Argentina, Bs.As. G.B.A. Zona Norte, Vicente López, Munro]",Argentina,Bs.As. G.B.A. Zona Norte
12,Martínez,"[Argentina, Bs.As. G.B.A. Zona Norte, San Isidro, Martínez]",Argentina,Bs.As. G.B.A. Zona Norte
13,Palermo Soho,"[Argentina, Capital Federal, Palermo, Palermo Soho]",Argentina,Capital Federal
14,Palermo Soho,"[Argentina, Capital Federal, Palermo, Palermo Soho]",Argentina,Capital Federal


Existen 45220 registros que no coinciden exactamente. 

In [8]:
# Count how many rows have each list length in place_with_parent_names
location_data['place_with_parent_names'].map(len).value_counts()

3    76023
4    39869
2     4780
5      548
Name: place_with_parent_names, dtype: int64

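Los que no coinciden exactamente son, casi en su totalidad, aquellas listas que no tienen 3 valores (39869 + 4780 + 548 = 45197 de los 45220); los 23 restantes coinciden en cantidad con los registros sin place_name que aparecen más abajo. Primero revisamos que el valor de cada una de las otras columnas esté presente en la lista correspondiente.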

In [9]:
# country_name
print("null values:", location_data['country_name'].isna().sum())
# Flag the rows whose country_name does not appear anywhere in the parent-names list
mask = pd.Series(
    [
        country not in parents
        for country, parents in zip(location_data['country_name'],
                                    location_data['place_with_parent_names'])
    ],
    index=location_data.index,
)
print("place_with_parent_names difference with country_name:", mask.sum())

null values: 0
place_with_parent_names difference with country_name: 0


In [10]:
# state_name
print("null values:", location_data['state_name'].isna().sum())
# Flag the rows whose state_name does not appear anywhere in the parent-names list
mask = pd.Series(
    [
        state not in parents
        for state, parents in zip(location_data['state_name'],
                                  location_data['place_with_parent_names'])
    ],
    index=location_data.index,
)
print("place_with_parent_names difference with state_name:", mask.sum())

null values: 0
place_with_parent_names difference with state_name: 0


In [11]:
# place_name
print("null values:", location_data['place_name'].isna().sum())

# Flag the rows whose place_name does not appear anywhere in the parent-names list.
# A missing place_name (NaN) does not equal any string, so those rows are flagged as well.
mask = ~location_data.apply(lambda x: x['place_name'] in x['place_with_parent_names'], axis=1)
# Report the mismatch count, as the country_name and state_name checks do, so it can be
# compared against the number of null values printed above
print("place_with_parent_names difference with place_name:", mask.sum())
location_data[mask].head()



null values: 23


Unnamed: 0,place_name,place_with_parent_names,country_name,state_name,geonames_id,lat-lon,lat,lon
6489,,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre]",Argentina,Bs.As. G.B.A. Zona Norte,,"-34.4008968545,-58.63809847",-34.400897,-58.638098
10201,,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre]",Argentina,Bs.As. G.B.A. Zona Norte,,"-34.400704,-58.638067",-34.400704,-58.638067
11451,,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre]",Argentina,Bs.As. G.B.A. Zona Norte,,"-34.4026880384,-58.6262613379",-34.402688,-58.626261
14839,,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre]",Argentina,Bs.As. G.B.A. Zona Norte,,"-34.4007994779,-58.6381735719",-34.400799,-58.638174
18622,,"[Argentina, Bs.As. G.B.A. Zona Norte, Tigre]",Argentina,Bs.As. G.B.A. Zona Norte,,"-34.4032781195,-58.6263503945",-34.403278,-58.62635


Los registros que no tienen valor en place_name son los que no coinciden, todos del municipio de Tigre.  

# TODO: Resta ver y analizar los que tienen más de 3 valores en la lista.