# Cleaning the datasets for Registro Público de Concesiones and zipcodes

This code includes all the cleaning done for the datasets that were scrapped and aggregates these into a big dataset.

- [Zipcodes](https://xn--cdigospostales-lob.es/listado-de-codigos-postales-de-espana/)
- [Main dataset](https://sedeaplicaciones.minetur.gob.es/RPC_Consulta)
    - Main page dataset
    - Pop-up datasets

## Packages used

- datetime
- numpy
- pandas
- fuzzywuzzy
- math
- datetime

# Import packages

In [243]:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from datetime import datetime as dt
import math

# Import data

In [312]:

# Main dataset
df = pd.read_csv('/Users/niko/Documents/Personal/GitHub/RadioLinkConcessionsSpain/new_data/fre.csv')
# Pop-up dataset
zipcodes = pd.read_excel('/Users/niko/Documents/Personal/GitHub/RadioLinkConcessionsSpain/new_data/listado-codigos-postales-con-LatyLon.xls')

# Import all the datasets for all regions
# region_1 = pd.read_csv('')
# region_2 = pd.read_csv('')
# region_3 = pd.read_csv('')
# region_4 = pd.read_csv('')
# region_5 = pd.read_csv('')
# region_6 = pd.read_csv('')
# region_7 = pd.read_csv('')
# region_8 = pd.read_csv('')
# region_9 = pd.read_csv('')
# region_10 = pd.read_csv('')
# region_11 = pd.read_csv('')
# region_12 = pd.read_csv('')
# region_13 = pd.read_csv('')
# region_14 = pd.read_csv('')
# region_15 = pd.read_csv('')
# region_16 = pd.read_csv('')
# region_17 = pd.read_csv('')
# region_18 = pd.read_csv('')
# region_19 = pd.read_csv('')

# Concat all the datasets
# df_concat = pd.concat([
#                         region_1,
#                         region_2,
#                         region_3,
#                         region_4,
#                         region_5,
#                         region_6,
#                         region_7,
#                         region_8,
#                         region_9,
#                         region_10,
#                         region_11,
#                         region_12,
#                         region_13,
#                         region_14,
#                         region_15,
#                         region_16,
#                         region_17,
#                         region_18,
#                         region_19,
#                         ])





# Clean the pop-up data

In [313]:
""" 
    Doing a fuzzy join on the dataset of zipcodes and the data of reference
    data inside the 'Consulta del Registro Público de Concesiones'.
    This is needed, because the data from the source does not have zipcodes, longitute and latitude.
    The fuzzy join does a cross join, then calculates the fuzzy ratio and include the data based on a fuzzy_ratio.
    Then, the duplicated columns are dropped.
    The output is a new dataset that includes the zipcode, longitude and latitude per tower.
 """

# Define the fuzzy ratio used to include the data
fuzzy_ratio = 20

# Clean all the pop-up datasets
df = df.drop(df.columns[[0]],axis = 1)
df = df.apply(lambda x: x.str.strip()).replace('', np.nan)
df = df.fillna(method='ffill')
df[['Frequencias', 'Tipo']] = df['Frecuencia'].str.split(' ', 1, expand=True)
del df['Frecuencia']
df['Frequencias'] = df['Frequencias'].apply(lambda x: x.replace('.', '')).apply(lambda x: x.replace(',', '.')).astype('float')

# --- 

# Cleaning of zipcodes

# Define the columns that we want to include in the final dataset
zipcodes_columns = ['codigopostalid','lat','lon']
df_columns = [ 'Referencia','Comunidad','Provincia','Municipio','Frequencias','Tipo']
all_new_columns = df_columns+zipcodes_columns

# Create new column to use to join both datasets
zipcodes['merge']='all'
df2['merge']='all'

# Join both datasets per row
all_datasets = pd.merge(df2,zipcodes,on='merge')
del all_datasets['merge']

# Create list of tuples based on the columns that we want to use for the join
datasets_tuple = all_datasets[['Municipio', 'poblacion']].apply(tuple, axis=1).tolist()

# Create the fuzz ratio on the list of tuples ceated
all_datasets['ratio'] = [fuzz.token_sort_ratio(*i) for i in datasets_tuple]

# Exclude those that have a low match ratio, the threshhold is set low because some matches have a low score
all_datasets = all_datasets[all_datasets.ratio>fuzzy_ratio]

# Drop all duplicates based on the defined columns and keep all the wanted ones
final_df = all_datasets[all_new_columns].drop_duplicates(subset=['Referencia','Municipio','Frequencias'])

AttributeError: Can only use .str accessor with string values!

# Clean the main dataset

In [247]:
main_df = pd.read_csv('/Users/niko/Documents/Personal/GitHub/RadioLinkConcessionsSpain/RegistroPublicoConcesiones_General.csv')

In [296]:
# --- Remove the rows that are unecessary

# Function to return a list with unique numeric values
def unique(list1):
    x = np.array(list1)
    return list(np.unique(x))

# Creates list of the returned values 
list_cities = unique(main_df.Localidad)

# Includes only text values, all cities of Spain
new_list_cities = []
for i in list_cities:
    if i.isnumeric() is False:
        if i != ' ':
            new_list_cities.append(i)

# Filter out all the values that are not inside the new_list_cities
main_df = main_df.loc[main_df['Localidad'].isin(new_list_cities)]

# --- Work on booleans

# Fill in False to all nulls for specific boolean columns
main_df[['Susceptible cesion','Susceptible mutualizacion','Obtenido por transferencia']] = \
                            main_df[['Susceptible cesion','Susceptible mutualizacion','Obtenido por transferencia']].fillna(False)
                            
main_df['Susceptible cesion'] = main_df['Susceptible cesion'].replace("true", True)
main_df['Obtenido por transferencia'] = main_df['Obtenido por transferencia'].replace("Detalle", True)

# --- Work on the dates

# Select columns that contain dates
date_columns = ['F. Caducidad','F. Concesion']

# Transform date objects to datetime
main_df[date_columns] = main_df[date_columns].apply(pd.to_datetime, errors='coerce')

# New features day, month and year
main_df['dia_concesion'] = round((main_df['F. Caducidad'] - main_df['F. Concesion']).dt.days,0).fillna(0).apply(np.int64)
main_df['mes_concesion'] = round((main_df['F. Caducidad'] - main_df['F. Concesion']).dt.days/12).fillna(0).apply(np.int64)
main_df['año_concesion'] = round((main_df['F. Caducidad'] - main_df['F. Concesion']).dt.days/360).fillna(0).apply(np.int64)

# Joined datasets

In [309]:
df_joined = pd.merge(main_df,final_df, how='left', left_on = 'Referencia', right_on = 'Referencia')

df_joined