### DATA FROM CATASTRO INSPIRE
___________________________________________________________________________________________________

Obteined through qGIS after mingling from a while with:
- WMS service: http://ovc.catastro.meh.es/cartografia/INSPIRE/spadgcwms.aspx
- WFS buildings : http://ovc.catastro.meh.es/INSPIRE/wfsBU.aspx?
- todos los serivicios INSPIRE: http://www.catastro.minhap.es/webinspire/index.html

*Notas: el uso de los canales WMS/WFS devuelven las capas antes de unificar con el programa Europeo Inspire, por lo que quizá sea la razón por la que no funcionan bien.

[bibliography](#http://www.catastro.minhap.es/webinspire/documentos/Conjuntos%20de%20datos.pdf)

In [1]:
import re
from itertools import combinations
from itertools import chain

import pandas as pd
import numpy as np

import geopandas as gpd
import geojson

pd.set_option('display.max_rows', 1000)

In [2]:
RENAMING_OF_COLS = {'gml_id': 'ID',
                    'localId_part': 'ID_part',
                    'localId_PI': 'ID_pool',
                    'numberOfFloorsAboveGround': 'nFloors_AG',
                    'numberOfFloorsBelowGround': 'nFloors_BG',
                    'heightAboveGround': 'height_AG',
                    'heightBelowGround': 'height_BG',
                    'areaValue': 'area_m2p',
                    'value': 'area_m2c'}

GEOMETRY_COLS = ['geometry', 'pos']

### FUNCTIONAL PIPELINE PROTOTYPING

In [3]:
# ------------------------------------------------------------ checking which columns should be purged

def str_forUniques(num):
    """
    Return different string depending of unique_len in checking_forUniques
    """
    if num == 0: return 'ALL NULLS'
    else: return 'Unique items'

def checking_forUniques(gdf):
    """
    input:
    output:
    """
    cols_with_one_element = []
    
    print(f"\n-------------------- Current Layers in {gdf.name} ------------------------")
    print(f"------------------------------------------------------------------------")
    
    for i,col in enumerate(gdf.columns.tolist()):
        if (col not in GEOMETRY_COLS):
            unique_len = len(gdf[str(col)].value_counts().tolist())
            
            if unique_len == 0: 
                print(f"{i+1}. {col}:\t\t\t{unique_len}\t{str_forUniques(unique_len)}")                
            elif len(col) <= 12 and unique_len != 0: 
                print(f"{i+1}. {col}:\t\t\t\t\t{unique_len}\t{str_forUniques(unique_len)}")
            elif 12 < len(col) <= 19 and unique_len != 0: 
                print(f"{i+1}. {col}:\t\t\t\t{unique_len}\t{str_forUniques(unique_len)}")
            elif 19 < len(col) <= 28 and unique_len != 0: 
                print(f"{i+1}. {col}:\t\t\t{unique_len}\t{str_forUniques(unique_len)}")
            elif 28 < len(col) <= 36 and unique_len != 0: 
                print(f"{i+1}. {col}:\t\t{unique_len}\t{str_forUniques(unique_len)}")
            elif 36 < len(col) and unique_len != 0: 
                print(f"{i+1}. {col}:\t{unique_len}\t{str_forUniques(unique_len)}")
            else: pass
            
            if (unique_len == 1) or (unique_len == 0): cols_with_one_element.append(col)
            else: pass            
        else: pass
    
    print(f"------------------------------------------------------------------------\n")
    return cols_with_one_element

def droping_DupCols(gdf, drop_cols = True):
    """
    
    """   
    if drop_cols:
        cols_to_drop = checking_forUniques(gdf)
        
        print(f"-------------- Droping DUPLICATED COLUMNS in {gdf.name} ------------------")
        [print(f'{i+1}. {col}\v') for i, col in enumerate(cols_to_drop)] # repr without new line
        
        gdf.drop(cols_to_drop, 
                 axis=1, inplace = True)
        
        print(f"-- Finished task -----------------------------------------------------\n")
    else: pass

In [4]:
# ------------------------------------------------------------ separate ID_parts if needed

def get_part(x):
    """
    input: col withs IDs_partXX
    output: XX as int
    Get numeric item in partXX from ID_partXX
    """
    part_str = x.split('_')[1]
    
    if len(re.findall(r"[\.]", part_str)) != 0: return int(part_str.split('.')[1])
    elif len(re.findall(r"t", part_str)) != 0: return int(part_str.split('t')[1])
    else: print(f"Error. Couldnt find anything to split part to")

def get_ID(x):
    """
    input: localID_partXX
    output: localID
    """
    return x.split('_')[0]

def separate_parts(gdf, cols = ['']):
    """
    If it is a geodf with ID_partXX then both parts are separated in different cols
    This is necessary to be able to join gdfs
    """
    print(f"-------------- Checking for COLS to separate in {gdf.name} --------------")
    assert type(cols) == list
    
    c = 0
    for col in cols:
        if (len(re.findall(r"_", gdf[col].tolist()[0])) != 0) and col in gdf.columns.tolist():
    
            print(f"{c+1}. {col}\t\t Dropped")            
            splited_col_name = re.split(r"_", gdf[col].tolist()[0])
            part_title = re.findall(r"\D+", splited_col_name[1])

            gdf[col + f'_{part_title[0]}'] = gdf[col].apply(get_part).astype(dtype = 'int64')
            gdf[col] = gdf[col].apply(get_ID)
            c += 1
        else: print(f"No columns to separate")
            
    print(f"-- Finished task -----------------------------------------------------\n")

In [5]:
# ------------------------------------------------------------ datetime operations

def get_year(strng):
    """
    Input:  string
    Output: year as string
    
    Note_____________________________________________________________
    Pandas requires years to be inside the bound of 1677 - 2262
    To use pandas Timestamp it is need to defined custom Stamp Period
    String operations seems easier in this case
    """
    first_w = strng.split('T')[0]
    return first_w.split('-')[0]
    
def getYearOfConstruction(gdf, LifeSpanCol = 'beginLifespanVersion', drop_col = True):
    """
    Cleaning Datetime and datetime columns if needed
    """
    indicator_of_datetime = ['end', 'begin', 'Lifespan']
    
    print(f"-- Getting YEAR OF CONSTRUCTION in {gdf.name} --------------------------")
    
    if LifeSpanCol in gdf.columns.tolist(): 
        gdf['yearOfConstruction'] = gdf[LifeSpanCol].apply(get_year)
    
        if drop_col:
            print(f"Droping col {drop_col}: \t{LifeSpanCol}")
            gdf.drop(LifeSpanCol, axis = 1, inplace = True)
            
            for col in gdf.columns:
                if len(re.findall("end", col)) != 0  or \
                   len(re.findall("Lifespan", col)) != 0 or \
                   len(re.findall("begin", col)) != 0:

                    print(f"\t\t Droping too col {drop_col}: \t{col}")
                    gdf.drop(col, axis = 1, inplace = True)

        else: print(f"Droping col {drop_col}: \t{LifeSpanCol}")
    
    print(f"-- Finished task -----------------------------------------------------\n")

In [6]:
# ------------------------------------------------------------ Droping duplicated columns

def check_allTrue(gdf, col1, col2):
    """
    Esta función se usa en ....
    """
    print(f"-- Checking if PAIRS are ALL TRUE {gdf.name} ---------------")
    
    # hay columnas que son alturas y otras num de plantas. Con multiplicar x3 se arregla
    if False not in gdf.apply(lambda x: (x[col1] == x[col2]) or (x[col1] == 3*x[col2]) or (3*x[col1] == x[col2]),
                              axis = 1).value_counts().index.tolist():
        
        print(f"All True --\n-- Droping {col2}")
        gdf.drop([col2], axis = 1, inplace = True)
    else:
        print(f"Pass \tThere are inequalities between columns")


def checking_forIdenCols(gdf, drop_cols = True):
    """
    Note_____________________________________________________________
    Same unique elements are an indication that they give the same 
    (or nearly) the same information, therefore to simply ddbb
    all columns that give the same info are purged
    """
    print(f"------------- Checking for SAME LEN COLS in {gdf.name} -----------------")
    
    # 1 // creating vars for search
    cols = [col for col in gdf.columns.tolist() if (col not in GEOMETRY_COLS)]
    len_unique_cols = [len(gdf[col].value_counts().tolist()) for col in cols]
    equal_cols, del_cols = [], []
    
    # 2 // creating pairs of columns that are suspect of giving the same information
    for tup_len, tup_col in zip(list(combinations(len_unique_cols, 2)), list(combinations(cols, 2))):
        if tup_len[0] == tup_len[1]:
            equal_cols.append([tup_col[0], tup_col[1]])
        else: pass

    # 3 // if True, drop columns that are equal, evaluating if all rows are the same       
    if drop_cols and len(equal_cols) != 0:        
        for pair in equal_cols:
            
            if pair[1] not in del_cols:
                gdf.drop(pair[1], axis = 1, inplace = True)
                del_cols.append(pair.pop(1))
            else: pass
                
        print(f"1. Deleted   columns: ") # repr without new line
        [print(f'\t\t\t{i+1}. {col} \v') for i, col in enumerate(del_cols)]; print('\n')
        
        print(f"2. Remaining columns: ") # repr without new line
        [print(f'\t\t\t{i+1}. {col} \v') for i, col in enumerate(list(chain.from_iterable(equal_cols)))]
        
    # 4 // printing columns that remain after purging
    elif len(equal_cols) == 0: print('List to return is empty')
    else: 
        print(f"Remaining columns: \n") # repr without new line
        [print(f'\t\t\t{i+1}. {col} \v') for i, col in enumerate(list(chain.from_iterable(equal_cols)))]

    print(f"-- Finished task -----------------------------------------------------\n")
    return list(chain.from_iterable(equal_cols))

In [7]:
# ------------------------------------------------------------ Unify ID columns if gml_id is dropped
    
def get_strPoint(x):
    """
    Returns last part of Cadastral ID in gml_id inside shorten_localID
    """
    return x.split('.')[-1]

def shorten_localID(gdf, cols_to_shorten = ['gml_id']):
    """
    If localId is dropped in favor of gml_id
    Then, namespace part is purged of name
    """
    print(f"------- Checking for ID col to shorten in {gdf.name} -------------------")
    
    shorted_localID = np.vectorize(get_strPoint)   
    for col_ID in cols_to_shorten:
        if col_ID in gdf.columns.tolist():
            print(f'Shortening columns: {col_ID}')
            gdf[col_ID] = shorted_localID(gdf[col_ID])
            
        else: print(f'Nothing to shorten')
    
    print(f"-- Finished task -----------------------------------------------------\n")
    
# ------------------------------------------------------------ LAST STEP, unify columns names

def rename_cols(gdf):
    """
    If col not in dict, then pass.
    This is used to unify all geojson
    """
    print(f"--------------- Renaming cols in {gdf.name} ----------------------------")
    
    dict_cols_to_rename = RENAMING_OF_COLS # dict
    cols_to_rename = [col for col in gdf.columns.tolist() if col in dict_cols_to_rename.keys()]
    
    gdf.rename(columns = dict_cols_to_rename, inplace = True) # before: after
            
    print(f"1. Initial name: ") # repr without new line
    [print(f'\t\t\t{i+1}. {col} \v') for i, col in enumerate(cols_to_rename)]

    print(f"2. Final name: ") # repr without new line
    [print(f'\t\t\t{i+1}. {dict_cols_to_rename[col]} \v') for i, col in enumerate(cols_to_rename)]
    
    print(f"-- Finished task -----------------------------------------------------\n")

In [8]:
def rawData_infoCleaning(gdf, 
                         drop_cols = True, 
                         cols_to_separate = ['localId', 'gml_id'],
                         datetime_col = 'beginLifespanVersion'):
    """
    Pipeline towards clearer data
    """
    print(f"Initiating cleaning pipeline -----------------------------------------\n")
    
    # -- 1 -- SEARCHING FOR COLS WITHOUT DATA IN {gdf.name} -----------------------
    droping_DupCols(gdf, drop_cols = drop_cols)
    # -- 2 -- SEARCHING FOR UNIQUE COLS {gdf.name} --------------------------------
    checking_forUniques(gdf)
    # -- 3 -- SEARCHING FOR COLS TO SEPARATE {gdf.name} ---------------------------
    separate_parts(gdf = gdf, cols = cols_to_separate)
    # -- 4 -- SEARCHING FOR DUPLICATED INFO {gdf.name} ----------------------------
    checking_forIdenCols(gdf, drop_cols = drop_cols)
    # -- 5 -- REFORMATTING DATA IN {gdf.name} -------------------------------------
    getYearOfConstruction(gdf, LifeSpanCol = datetime_col, drop_col = drop_cols)
    shorten_localID(gdf)
    # -- 6 -- RENAMING INFORMATION IN {gdf.name} ----------------------------------
    rename_cols(gdf)
    

    print(f"Closing cleaning pipeline --------------------------------------------\n")

### CAPAS DISPONIBLES

Dada el volumen de datos, la inspección de éstos se realiza sobre una parte de éstos

In [74]:
from os import listdir
from os.path import isfile, join

SPLIT_PATH = '../data/split_raw'

onlyfiles = [f for f in listdir(SPLIT_PATH) if isfile(join(SPLIT_PATH, f))]
onlyfiles

['A.ES.SDGC.BU.28900.buildingpart_xaj.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xab.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xai.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xao.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xag.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xak.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xam.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xal.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xad.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xac.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xan.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xaq.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xae.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xaa.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xah.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xaf.geojson',
 'A.ES.SDGC.BU.28900.buildingpart_xap.geojson']

In [76]:
onlyfiles[0].split('.')[-2]

'buildingpart_xaj'

In [69]:
oConstrs.shape

NameError: name 'oConstrs' is not defined

In [39]:
CATASTRO_PATH = '../data/raw/catastro'

# Dentro de los datos displibles hay 4 capas en formato geojson

building_df = gpd.read_file(f"{CATASTRO_PATH}/A.ES.SDGC.BU.28900.building.geojson")
#buildingParts_df = gpd.read_file(f"{CATASTRO_PATH}/A.ES.SDGC.BU.28900.buildingpart.geojson")
otherConstruction_df = gpd.read_file(f"{CATASTRO_PATH}/A.ES.SDGC.BU.28900.otherconstruction.geojson")
#cadastralParcel = gpd.read_file(f"{CATASTRO_PATH}/A.ES.SDGC.CP.28900.cadastralparcel.geojson")
#cadastralZoning = gpd.read_file(f"{CATASTRO_PATH}/A.ES.SDGC.CP.28900.cadastralzoning.geojson")

Checkeo de uso de memoria

In [40]:
print(f"Building Layer total memory usage: \t\t\t{building_df.memory_usage(index=True).sum()/1000} \tKbytes")
#print(f"Building Parts Layer total memory usage: \t\t{buildingParts_df.memory_usage(index=True).sum()/1000} \tKbytes")
print(f"Other Construction Layer total memory usage: \t\t{otherConstruction_df.memory_usage(index=True).sum()/1000} \t\tKbytes")
#print(f"Cadastral Parcel Layer total memory usage: \t\t{cadastralParcel.memory_usage(index=True).sum()/1000} \tKbytes")
#print(f"Cadastral Zoning Layer total memory usage: \t\t{cadastralZoning.memory_usage(index=True).sum()/1000} \tKbytes")

Other Construction Layer total memory usage: 		774.16 		Kbytes


In [11]:
## NAMES OF GEO
# IN OBJECT MAKE IT SO THAT filename == NAME

building_df.name = 'BU_ALL'
buildingParts_df.name = 'BU_PARTS'
otherConstruction_df.name = 'BU_OTHER'
cadastralParcel.name = 'CAD_PARCEL'
cadastralZoning.name = 'CAD_ZONING'

onlyfiles[0].split('.')[-2]

In [48]:
otherConstruction_df.iloc[4999:5001, :]

Unnamed: 0,gml_id,beginLifespanVersion,conditionOfConstruction,localId,namespace,constructionNature,geometry
4999,ES.SDGC.BU.3895604VK4839F_PI.5000,2017-02-21T00:00:00,,3895604VK4839F_PI.5000,ES.SDGC.BU,openAirPool,"POLYGON ((443814.920 4479357.670, 443811.610 4..."
5000,ES.SDGC.BU.3895605VK4839F_PI.5001,2004-10-21T00:00:00,,3895605VK4839F_PI.5001,ES.SDGC.BU,openAirPool,"POLYGON ((443786.131 4479373.672, 443785.241 4..."


In [49]:
otherConstruction_1_df.iloc[-1,:].T

gml_id                                     ES.SDGC.BU.3895604VK4839F_PI.5000
beginLifespanVersion                                     2017-02-21T00:00:00
conditionOfConstruction                                                 None
localId                                               3895604VK4839F_PI.5000
namespace                                                         ES.SDGC.BU
constructionNature                                               openAirPool
geometry                   POLYGON ((443814.92 4479357.67, 443811.61 4479...
Name: 4999, dtype: object

In [50]:
otherConstruction_2_df.iloc[0,:].T

gml_id                                     ES.SDGC.BU.3895605VK4839F_PI.5001
beginLifespanVersion                                     2004-10-21T00:00:00
conditionOfConstruction                                                 None
localId                                               3895605VK4839F_PI.5001
namespace                                                         ES.SDGC.BU
constructionNature                                               openAirPool
geometry                   POLYGON ((443786.131 4479373.6715, 443785.241 ...
Name: 0, dtype: object

## 1.1 Building Layer

En qGIS esta capa representa la parte edificada de los solares.

In [12]:
display(building_df.info())
display(building_df.shape)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 25 columns):
 #   Column                                   Non-Null Count  Dtype   
---  ------                                   --------------  -----   
 0   gml_id                                   25000 non-null  object  
 1   beginLifespanVersion                     25000 non-null  object  
 2   conditionOfConstruction                  25000 non-null  object  
 3   beginning                                25000 non-null  object  
 4   end                                      25000 non-null  object  
 5   endLifespanVersion                       13 non-null     object  
 6   informationSystem                        25000 non-null  object  
 7   reference                                25000 non-null  object  
 8   localId                                  25000 non-null  object  
 9   namespace                                25000 non-null  object  
 10  horizontalGeometryEstimate

None

(25000, 25)

In [13]:
rawData_infoCleaning(building_df, 
                     drop_cols = True, 
                     cols_to_separate = ['localId', 'gml_id'], 
                     datetime_col = 'beginning')

Initiating cleaning pipeline -----------------------------------------


-------------------- Current Layers in BU_ALL ------------------------
------------------------------------------------------------------------
1. gml_id:					25000	Unique items
2. beginLifespanVersion:			3064	Unique items
3. conditionOfConstruction:			4	Unique items
4. beginning:					181	Unique items
5. end:					178	Unique items
6. endLifespanVersion:				12	Unique items
7. informationSystem:				25000	Unique items
8. reference:					25000	Unique items
9. localId:					25000	Unique items
10. namespace:					1	Unique items
11. horizontalGeometryEstimatedAccuracy:		1	Unique items
12. horizontalGeometryEstimatedAccuracy_uom:	1	Unique items
13. horizontalGeometryReference:			1	Unique items
14. referenceGeometry:				1	Unique items
15. currentUse:					6	Unique items
16. numberOfBuildingUnits:			385	Unique items
17. numberOfDwellings:				222	Unique items
18. numberOfFloorsAboveGround:			0	ALL NULLS
19. documentLink:					2500

In [14]:
building_df.head(2)

Unnamed: 0,ID,conditionOfConstruction,currentUse,numberOfBuildingUnits,numberOfDwellings,area_m2c,geometry,yearOfConstruction
0,VK4700H,-,,0,0,0,"POLYGON ((440433.629 4470953.961, 440432.170 4...",2000
1,00006Z8VK4800A,-,,0,0,0,"POLYGON ((440020.830 4479859.707, 440020.064 4...",2000


In [15]:
print(building_df['conditionOfConstruction'].value_counts())
print()
print(building_df['currentUse'].value_counts())

functional    24592
-               248
declined        109
ruin             51
Name: conditionOfConstruction, dtype: int64

1_residential         21807
4_2_retail              777
3_industrial            765
4_3_publicServices      698
4_1_office              657
2_agriculture            17
Name: currentUse, dtype: int64


**____________________**

## 1.2 Building Parts Layer

In [16]:
print(f"{buildingParts_df.info()}\n")
print(f"Shape of {buildingParts_df.name}: \t\t\t     {buildingParts_df.shape}")

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 14 columns):
 #   Column                                   Non-Null Count  Dtype   
---  ------                                   --------------  -----   
 0   gml_id                                   25000 non-null  object  
 1   beginLifespanVersion                     25000 non-null  object  
 2   conditionOfConstruction                  0 non-null      object  
 3   localId                                  25000 non-null  object  
 4   namespace                                25000 non-null  object  
 5   horizontalGeometryEstimatedAccuracy      25000 non-null  float64 
 6   horizontalGeometryEstimatedAccuracy_uom  25000 non-null  object  
 7   horizontalGeometryReference              25000 non-null  object  
 8   referenceGeometry                        25000 non-null  bool    
 9   numberOfFloorsAboveGround                25000 non-null  int64   
 10  heightBelowGround         

In [17]:
rawData_infoCleaning(buildingParts_df, 
                     drop_cols = True, 
                     cols_to_separate = ['localId', 'gml_id'], 
                     datetime_col = 'beginLifespanVersion')

Initiating cleaning pipeline -----------------------------------------


-------------------- Current Layers in BU_PARTS ------------------------
------------------------------------------------------------------------
1. gml_id:					25000	Unique items
2. beginLifespanVersion:			974	Unique items
3. conditionOfConstruction:			0	ALL NULLS
4. localId:					25000	Unique items
5. namespace:					1	Unique items
6. horizontalGeometryEstimatedAccuracy:		1	Unique items
7. horizontalGeometryEstimatedAccuracy_uom:	1	Unique items
8. horizontalGeometryReference:			1	Unique items
9. referenceGeometry:				1	Unique items
10. numberOfFloorsAboveGround:			21	Unique items
11. heightBelowGround:				11	Unique items
12. heightBelowGround_uom:			1	Unique items
13. numberOfFloorsBelowGround:			11	Unique items
------------------------------------------------------------------------

-------------- Droping DUPLICATED COLUMNS in BU_PARTS ------------------
1. conditionOfConstruction
2. namespace
3. horizontalGeo

In [18]:
buildingParts_df.head(3)

Unnamed: 0,ID,nFloors_AG,height_BG,geometry,ID_part,yearOfConstruction
0,000200100VK48E,1,0,"POLYGON ((441657.574 4487050.292, 441658.164 4...",1,2004
1,000200500VK56E,1,0,"POLYGON ((451584.920 4467181.410, 451586.250 4...",1,2013
2,000200500VK56E,1,0,"POLYGON ((451612.580 4467215.370, 451618.650 4...",2,2013


**________________________**

**________________________**

## 1.3 Other Construction Layer

In [19]:
# All data corresponds to open AIR POOLS
# I don't need this dataset to begin with
# BUT lets cleaned it

In [20]:
display(otherConstruction_df.info())
display(otherConstruction_df.shape)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 13822 entries, 0 to 13821
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   gml_id                   13822 non-null  object  
 1   beginLifespanVersion     13822 non-null  object  
 2   conditionOfConstruction  0 non-null      object  
 3   localId                  13822 non-null  object  
 4   namespace                13822 non-null  object  
 5   constructionNature       13822 non-null  object  
 6   geometry                 13822 non-null  geometry
dtypes: geometry(1), object(6)
memory usage: 756.0+ KB


None

(13822, 7)

In [21]:
rawData_infoCleaning(otherConstruction_df, 
                     drop_cols = True, 
                     cols_to_separate = ['localId', 'gml_id'], 
                     datetime_col = 'beginLifespanVersion')

Initiating cleaning pipeline -----------------------------------------


-------------------- Current Layers in BU_OTHER ------------------------
------------------------------------------------------------------------
1. gml_id:					13822	Unique items
2. beginLifespanVersion:			2191	Unique items
3. conditionOfConstruction:			0	ALL NULLS
4. localId:					13822	Unique items
5. namespace:					1	Unique items
6. constructionNature:				1	Unique items
------------------------------------------------------------------------

-------------- Droping DUPLICATED COLUMNS in BU_OTHER ------------------
1. conditionOfConstruction
2. namespace
3. constructionNature
-- Finished task -----------------------------------------------------


-------------------- Current Layers in BU_OTHER ------------------------
------------------------------------------------------------------------
1. gml_id:					13822	Unique items
2. beginLifespanVersion:			2191	Unique items
3. localId:					13822	Unique items
------

In [22]:
otherConstruction_df.head(3)

Unnamed: 0,ID,geometry,localId_PI.,yearOfConstruction
0,0006601VK3800E,"POLYGON ((429964.360 4480456.150, 429961.960 4...",1,2017
1,0007201VK3800E,"POLYGON ((429883.850 4480596.541, 429888.120 4...",2,2003
2,0007202VK3800E,"POLYGON ((429967.989 4480563.520, 429968.469 4...",3,2006


**________________________**

## 1.4 Cadastral Parcel Layer

In [23]:
display(cadastralParcel.info())
display(cadastralParcel.shape)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   gml_id                      25000 non-null  object  
 1   areaValue                   25000 non-null  int64   
 2   areaValue_uom               25000 non-null  object  
 3   beginLifespanVersion        25000 non-null  object  
 4   endLifespanVersion          0 non-null      object  
 5   localId                     25000 non-null  object  
 6   namespace                   25000 non-null  object  
 7   label                       25000 non-null  object  
 8   nationalCadastralReference  25000 non-null  object  
 9   pos                         25000 non-null  object  
 10  geometry                    25000 non-null  geometry
dtypes: geometry(1), int64(1), object(9)
memory usage: 2.1+ MB


None

(25000, 11)

In [24]:
rawData_infoCleaning(cadastralParcel, 
                     drop_cols = True, 
                     cols_to_separate = ['localId', 'gml_id'], 
                     datetime_col = 'beginLifespanVersion')

Initiating cleaning pipeline -----------------------------------------


-------------------- Current Layers in CAD_PARCEL ------------------------
------------------------------------------------------------------------
1. gml_id:					25000	Unique items
2. areaValue:					5910	Unique items
3. areaValue_uom:				1	Unique items
4. beginLifespanVersion:			1446	Unique items
5. endLifespanVersion:			0	ALL NULLS
6. localId:					25000	Unique items
7. namespace:					1	Unique items
8. label:					768	Unique items
9. nationalCadastralReference:			25000	Unique items
------------------------------------------------------------------------

-------------- Droping DUPLICATED COLUMNS in CAD_PARCEL ------------------
1. areaValue_uom
2. endLifespanVersion
3. namespace
-- Finished task -----------------------------------------------------


-------------------- Current Layers in CAD_PARCEL ------------------------
------------------------------------------------------------------------
1. gml_id:					

In [25]:
cadastralParcel.head(4)

Unnamed: 0,ID,area_m2p,label,pos,geometry,yearOfConstruction
0,000200500VK56E,1268,5,451607.03 4467199.27,"MULTIPOLYGON (((451599.360 4467174.940, 451584...",2013
1,000205600VK56E,297,56,451592.44 4467170.83,"MULTIPOLYGON (((451593.000 4467163.250, 451579...",2013
2,000205700VK56E,155,57,451587.93 4467161.77,"MULTIPOLYGON (((451590.040 4467156.910, 451576...",2013
3,000205800VK56E,174,58,451584.62 4467155.04,"MULTIPOLYGON (((451586.650 4467149.650, 451573...",2013


**________________________**

## 1.5 Cadastral Zoning Layer

In [26]:
display(cadastralZoning.info())
display(cadastralZoning.shape)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 12202 entries, 0 to 12201
Data columns (total 13 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   gml_id                           12202 non-null  object  
 1   beginLifespanVersion             12202 non-null  object  
 2   endLifespanVersion               0 non-null      object  
 3   estimatedAccuracy                12202 non-null  float64 
 4   estimatedAccuracy_uom            12202 non-null  object  
 5   localId                          12202 non-null  object  
 6   namespace                        12202 non-null  object  
 7   label                            12202 non-null  object  
 8   LocalisedCharacterString         12202 non-null  object  
 9   nationalCadastalZoningReference  12202 non-null  object  
 10  originalMapScaleDenominator      12202 non-null  int64   
 11  pos                              12202 non-null  object  
 

None

(12202, 13)

In [27]:
## son iguales ---> SI, son las mismas
display(cadastralZoning['estimatedAccuracy'].value_counts())
display(cadastralZoning['originalMapScaleDenominator'].value_counts())
display(cadastralZoning['LocalisedCharacterString'].value_counts()) # son iguales ??

cadastralZoning.apply(lambda x : x['estimatedAccuracy'] == 0.5 and \
                                 x['LocalisedCharacterString'] == 'MANZANA ' and \
                                 x['originalMapScaleDenominator'] == 500,
                      axis = 1).value_counts() # All true
print("""
Nota_______________________________________________________________________ \n
estimatedAccuracy == originalMapScaleDenominator == LocalisedCharacterString\n
              0.5 == 500                     500 == MANZANA                 \n
              1.0 == 5000                   5000 == POLIGONO                \n
""")

0.5    12111
1.0       91
Name: estimatedAccuracy, dtype: int64

500     12111
5000       91
Name: originalMapScaleDenominator, dtype: int64

MANZANA      12111
POLIGONO        91
Name: LocalisedCharacterString, dtype: int64


Nota_______________________________________________________________________ 

estimatedAccuracy == originalMapScaleDenominator == LocalisedCharacterString

              0.5 == 500                     500 == MANZANA                 

              1.0 == 5000                   5000 == POLIGONO                




In [28]:
rawData_infoCleaning(cadastralZoning, 
                     drop_cols = True, 
                     cols_to_separate = ['localId', 'gml_id'], 
                     datetime_col = 'beginLifespanVersion')

Initiating cleaning pipeline -----------------------------------------


-------------------- Current Layers in CAD_ZONING ------------------------
------------------------------------------------------------------------
1. gml_id:					12202	Unique items
2. beginLifespanVersion:			2381	Unique items
3. endLifespanVersion:			0	ALL NULLS
4. estimatedAccuracy:				2	Unique items
5. estimatedAccuracy_uom:			1	Unique items
6. localId:					12202	Unique items
7. namespace:					1	Unique items
8. label:					11507	Unique items
9. LocalisedCharacterString:			2	Unique items
10. nationalCadastalZoningReference:		12202	Unique items
11. originalMapScaleDenominator:			2	Unique items
------------------------------------------------------------------------

-------------- Droping DUPLICATED COLUMNS in CAD_ZONING ------------------
1. endLifespanVersion
2. estimatedAccuracy_uom
3. namespace
-- Finished task -----------------------------------------------------


-------------------- Current Layers in CA

In [29]:
cadastralZoning.head()

Unnamed: 0,ID,estimatedAccuracy,label,pos,geometry,yearOfConstruction
0,28900A000,1.0,0,440302.42 4492704.6,"MULTIPOLYGON (((440301.968 4492704.637, 440302...",2019
1,28900A001,1.0,1,444909.27 4497953.5,"MULTIPOLYGON (((444555.092 4499363.444, 444568...",2019
2,28900A002,1.0,2,446474.92 4496527.94,"MULTIPOLYGON (((445948.540 4496495.370, 445945...",2013
3,28900A003,1.0,3,447749.11 4494280.98,"MULTIPOLYGON (((449091.096 4493635.359, 449089...",2014
4,28900A004,1.0,4,447065.76 4493887.71,"MULTIPOLYGON (((445797.560 4495048.620, 445790...",2013


**_______________**

**_______________**

### CHECKING MEMORY USAGE

In [30]:
print(f"Building Layer total memory usage: \t\t\t{building_df.memory_usage(index=True).sum()/1000} \tKbytes")
print(f"Building Parts Layer total memory usage: \t\t{buildingParts_df.memory_usage(index=True).sum()/1000} \tKbytes")
print(f"Other Construction Layer total memory usage: \t\t{otherConstruction_df.memory_usage(index=True).sum()/1000} \tKbytes")

print(f"Cadastral Parcel Layer total memory usage: \t\t{cadastralParcel.memory_usage(index=True).sum()/1000} \tKbytes")
print(f"Cadastral Zoning Layer total memory usage: \t\t{cadastralZoning.memory_usage(index=True).sum()/1000} \t\tKbytes")

Building Layer total memory usage: 			1600.128 	Kbytes
Building Parts Layer total memory usage: 		1200.128 	Kbytes
Other Construction Layer total memory usage: 		442.432 	Kbytes
Cadastral Parcel Layer total memory usage: 		1200.128 	Kbytes
Cadastral Zoning Layer total memory usage: 		585.824 		Kbytes


--- INITIALLY---

    Building Layer total memory usage: 			     4825.128 	Kbytes
    Building Parts Layer total memory usage: 		 2625.128 	Kbytes
    Other Construction Layer total memory usage: 	 774.16 	Kbytes
    Cadastral Parcel Layer total memory usage: 		 2200.128 	Kbytes
    Cadastral Zoning Layer total memory usage: 		 1269.136 	Kbytes

In [31]:
## CHANGE

print(f"Building Layer memory optimization: \t\t\t{np.round(building_df.memory_usage(index=True).sum()/(10*4825.128), 2)} \t%")
print(f"Building Parts Layer memory optimization: \t\t{np.round(buildingParts_df.memory_usage(index=True).sum()/(10*2625.128), 2)} \t%")
print(f"Other Construction Layer memory optimization: \t\t{np.round(otherConstruction_df.memory_usage(index=True).sum()/(10*774.16), 2)} \t%")

print(f"Cadastral Parcel Layer memory optimization: \t\t{np.round(cadastralParcel.memory_usage(index=True).sum()/(10*2200.128), 2)} \t%")
print(f"Cadastral Zoning Layer memory optimization: \t\t{np.round(cadastralZoning.memory_usage(index=True).sum()/(10*1269.136), 2)} \t%")

Building Layer memory optimization: 			33.16 	%
Building Parts Layer memory optimization: 		45.72 	%
Other Construction Layer memory optimization: 		57.15 	%
Cadastral Parcel Layer memory optimization: 		54.55 	%
Cadastral Zoning Layer memory optimization: 		46.16 	%


**_______________**

** EXAMPLE OF ALL TOGETHER **

In [89]:
from os import listdir
from os.path import isfile, join

CATASTRO_PATH = '../data/raw/catastro'
CLEAN_PATH = '../data/clean'

all_raw_geofiles = [f for f in listdir(CATASTRO_PATH) if isfile(join(CATASTRO_PATH, f))]

for geofile in all_raw_geofiles:
    
    print(geofile)
    print()
    
    file_gdf = gpd.read_file(f"{CATASTRO_PATH}/{geofile}")
    file_gdf.name = geofile.split('.')[-2]
    
    rawData_infoCleaning(file_gdf, 
                     drop_cols = True, 
                     cols_to_separate = ['localId', 'gml_id'], 
                     datetime_col = 'beginLifespanVersion')

    display(cadastralZoning.info())
    display(cadastralZoning.shape)
    
    file_gdf.to_file(f"{CLEAN_PATH}/{file_gdf.name}.geojson", driver='GeoJSON')

A.ES.SDGC.CP.28900.cadastralparcel.geojson

Initiating cleaning pipeline -----------------------------------------


-------------------- Current Layers in cadastralparcel ------------------------
------------------------------------------------------------------------
1. gml_id:					139931	Unique items
2. areaValue:					11586	Unique items
3. areaValue_uom:				1	Unique items
4. beginLifespanVersion:			3647	Unique items
5. endLifespanVersion:			0	ALL NULLS
6. localId:					139931	Unique items
7. namespace:					1	Unique items
8. label:					981	Unique items
9. nationalCadastralReference:			139931	Unique items
------------------------------------------------------------------------

-------------- Droping DUPLICATED COLUMNS in cadastralparcel ------------------
1. areaValue_uom
2. endLifespanVersion
3. namespace
-- Finished task -----------------------------------------------------


-------------------- Current Layers in cadastralparcel ------------------------
-------------------------

None

(12202, 6)

A.ES.SDGC.BU.28900.building.geojson

Initiating cleaning pipeline -----------------------------------------


-------------------- Current Layers in building ------------------------
------------------------------------------------------------------------
1. gml_id:					123634	Unique items
2. beginLifespanVersion:			4165	Unique items
3. conditionOfConstruction:			4	Unique items
4. beginning:					204	Unique items
5. end:					191	Unique items
6. endLifespanVersion:				44	Unique items
7. informationSystem:				123634	Unique items
8. reference:					123634	Unique items
9. localId:					123634	Unique items
10. namespace:					1	Unique items
11. horizontalGeometryEstimatedAccuracy:		1	Unique items
12. horizontalGeometryEstimatedAccuracy_uom:	1	Unique items
13. horizontalGeometryReference:			1	Unique items
14. referenceGeometry:				1	Unique items
15. currentUse:					6	Unique items
16. numberOfBuildingUnits:			593	Unique items
17. numberOfDwellings:				329	Unique items
18. numberOfFloorsAboveGroun

None

(12202, 6)

A.ES.SDGC.AD.28900.geojson

Initiating cleaning pipeline -----------------------------------------


-------------------- Current Layers in 28900 ------------------------
------------------------------------------------------------------------
1. gml_id:					160893	Unique items
2. localId:					160893	Unique items
3. namespace:					1	Unique items
4. specification:				2	Unique items
5. method:					1	Unique items
6. default:					1	Unique items
7. designator:					3655	Unique items
8. type:					1	Unique items
9. level:					1	Unique items
10. validFrom:			0	ALL NULLS
11. beginLifespanVersion:			3628	Unique items
------------------------------------------------------------------------

-------------- Droping DUPLICATED COLUMNS in 28900 ------------------
1. namespace
2. method
3. default
4. type
5. level
6. validFrom
-- Finished task -----------------------------------------------------


-------------------- Current Layers in 28900 ------------------------
-----------------------------

None

(12202, 6)

A.ES.SDGC.BU.28900.otherconstruction.geojson

Initiating cleaning pipeline -----------------------------------------


-------------------- Current Layers in otherconstruction ------------------------
------------------------------------------------------------------------
1. gml_id:					13822	Unique items
2. beginLifespanVersion:			2191	Unique items
3. conditionOfConstruction:			0	ALL NULLS
4. localId:					13822	Unique items
5. namespace:					1	Unique items
6. constructionNature:				1	Unique items
------------------------------------------------------------------------

-------------- Droping DUPLICATED COLUMNS in otherconstruction ------------------
1. conditionOfConstruction
2. namespace
3. constructionNature
-- Finished task -----------------------------------------------------


-------------------- Current Layers in otherconstruction ------------------------
------------------------------------------------------------------------
1. gml_id:					13822	Unique items
2. beginLifes

None

(12202, 6)

A.ES.SDGC.BU.28900.buildingpart.geojson

Initiating cleaning pipeline -----------------------------------------


-------------------- Current Layers in buildingpart ------------------------
------------------------------------------------------------------------
1. gml_id:					825014	Unique items
2. beginLifespanVersion:			4190	Unique items
3. conditionOfConstruction:			0	ALL NULLS
4. localId:					825014	Unique items
5. namespace:					1	Unique items
6. horizontalGeometryEstimatedAccuracy:		1	Unique items
7. horizontalGeometryEstimatedAccuracy_uom:	1	Unique items
8. horizontalGeometryReference:			1	Unique items
9. referenceGeometry:				1	Unique items
10. numberOfFloorsAboveGround:			56	Unique items
11. heightBelowGround:				11	Unique items
12. heightBelowGround_uom:			1	Unique items
13. numberOfFloorsBelowGround:			11	Unique items
------------------------------------------------------------------------

-------------- Droping DUPLICATED COLUMNS in buildingpart ------------------
1. con

None

(12202, 6)

A.ES.SDGC.CP.28900.cadastralzoning.geojson

Initiating cleaning pipeline -----------------------------------------


-------------------- Current Layers in cadastralzoning ------------------------
------------------------------------------------------------------------
1. gml_id:					12202	Unique items
2. beginLifespanVersion:			2381	Unique items
3. endLifespanVersion:			0	ALL NULLS
4. estimatedAccuracy:				2	Unique items
5. estimatedAccuracy_uom:			1	Unique items
6. localId:					12202	Unique items
7. namespace:					1	Unique items
8. label:					11507	Unique items
9. LocalisedCharacterString:			2	Unique items
10. nationalCadastalZoningReference:		12202	Unique items
11. originalMapScaleDenominator:			2	Unique items
------------------------------------------------------------------------

-------------- Droping DUPLICATED COLUMNS in cadastralzoning ------------------
1. endLifespanVersion
2. estimatedAccuracy_uom
3. namespace
-- Finished task -------------------------------------------

None

(12202, 6)