In [1]:
import os
import html
import re
import math

from datetime import datetime
from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode


import dask
import dask.array as da
import dask.dataframe as dd
import pandas as pd
import matplotlib.pyplot as plt


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Partition count used when the Dask dataframes below are repartitioned.
numcores = 16
# GPU flag ("tiene_gpu" = "has GPU"); not read anywhere in the visible cells.
tiene_gpu = False
# Raise pandas' display limits so wide/long frames are not truncated when inspected.
pd.set_option('display.max_columns', 99)
pd.set_option('display.max_rows', 100)


# Select Dask's threaded scheduler; the process-based alternative is kept for reference.
#dask.config.set(scheduler='processes')  
dask.config.set(scheduler='threads')

<dask.config.set at 0x203026f0d30>

In [2]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [3]:
 
# Wall-clock start; the last cell subtracts it from the current time to report the run duration.
start_time = datetime.now()

In [4]:
# Root folder of the input/output data sets. The default is the path used on the
# Jupyter server; set the HOTELMAPPING_DATA_ROOT environment variable to run the
# notebook on another machine without editing this cell (an empty value is ignored).
data_root = os.environ.get('HOTELMAPPING_DATA_ROOT') or 'C:/code/hotelmapping/data/'


# Basic Clean function for String Column

In [5]:

def CleanStringCol(col_serie, encoding='utf-8', decode_error='strict', replace_by_none=r'[^ \-\_A-Za-z0-9ñÑ]+', remove_brackets=True, replace_by_whitespace=r'[\-\_]', IsDask: bool = True):
    """Normalize a column of free-text values.

    Steps, in order: decode ``bytes`` values, unescape HTML entities, lowercase,
    strip accents, optionally drop bracketed text, replace every
    ``replace_by_whitespace`` match with a space, delete every ``replace_by_none``
    match, collapse runs of whitespace and trim both ends.

    Values that are not strings (in particular missing values) skip the
    unescape and accent-stripping steps instead of raising, so nulls pass
    through and the caller can ``fillna`` the result.

    Parameters
    ----------
    col_serie : pandas or Dask Series
        Column to clean.
    encoding, decode_error : str
        Passed to ``bytes.decode`` for values stored as ``bytes``.
    replace_by_none : str
        Regular expression whose matches are deleted.
    remove_brackets : bool
        When true, remove ``[...]``, ``(...)`` and ``{...}`` including their content.
    replace_by_whitespace : str
        Regular expression whose matches are replaced by a single space. It runs
        BEFORE ``replace_by_none``.
    IsDask : bool
        True when ``col_serie`` is a Dask Series (an explicit ``meta`` is then
        supplied to the accent-stripping ``map``); False for a pandas Series.

    Returns
    -------
    Series of the same kind as ``col_serie`` holding the cleaned text.
    """
    def _decode(value):
        return value.decode(encoding, decode_error) if isinstance(value, bytes) else value

    def _unescape(value):
        return html.unescape(value) if isinstance(value, str) else value

    def _strip_accents(value):
        return strip_accents_unicode(value) if isinstance(value, str) else value

    # Decode bytes, resolve HTML entities, lowercase.
    col_serie = col_serie.map(_decode)
    col_serie = col_serie.map(_unescape)
    col_serie = col_serie.str.lower()

    # Remove accents from unicode strings.
    if IsDask:
        col_serie = col_serie.map(_strip_accents, meta=(None, 'string'))
    else:
        col_serie = col_serie.map(_strip_accents)

    # regex=True is passed explicitly on every replace: pandas >= 2.0 defaults to
    # literal matching, which would silently turn all of these steps into no-ops.
    if remove_brackets:
        # Remove all content between brackets (non-greedy, per bracket pair).
        col_serie = col_serie.str.replace(r'(\[.*?\]|\(.*?\)|\{.*?\})', '', regex=True)
    # Turn separator symbols into spaces.
    col_serie = col_serie.str.replace(replace_by_whitespace, ' ', regex=True)
    # Delete every character outside the allowed set.
    col_serie = col_serie.str.replace(replace_by_none, '', regex=True)
    # Collapse runs of two or more whitespace characters into one space.
    col_serie = col_serie.str.replace(r'\s\s+', ' ', regex=True)
    # Trim leading and trailing whitespace.
    return col_serie.str.strip()




# Regex (caracteres para eliminar)

In [6]:
# Each pattern matches the characters to DELETE from a field: it is passed to
# CleanStringCol as `replace_by_none`, so anything outside the bracketed
# whitelist is removed.
# NOTE(review): CleanStringCol replaces '-' and '_' with a space (its default
# `replace_by_whitespace`) BEFORE applying these patterns, and no caller overrides
# that default. Since a space is not whitelisted in email_regex / website_regex,
# the '-' and '_' they appear to allow are in fact dropped — confirm this is intended.
address_regex=r'[^ \-\_\.@#\/&ñÑA-Za-z0-9]+'
website_regex=r'[^\-\.\/%?&A-Za-z0-9]+'
phone_regex = r'[^0-9]+' 
email_regex = r'[^\-\_\.@A-Za-z0-9]+'

# Alternative phone_regex still to be reviewed: ([^\-\+\.()0-9]+)

# Geonames (pandas)



In [7]:
countryinfo_file = os.path.join(data_root, 'geo/countryinfo.csv')
# Tab-separated country table. keep_default_na=False keeps every field as literal
# text — presumably so that a code such as "NA" is not parsed as a missing value;
# confirm against the file.
# The deprecated `infer_datetime_format` flag was dropped: no `parse_dates` is
# requested, so it never had any effect on this read.
df_countryinfo = pd.read_csv(countryinfo_file, sep='\t', low_memory=False, keep_default_na=False)
# Lookup table indexed by ISO code, with a cleaned country name alongside the raw one.
df_country = df_countryinfo[['Country','ISO','ISO3','fips']].drop_duplicates().set_index('ISO')
df_country['countrycorregido'] = CleanStringCol(df_country['Country'], IsDask=False)
# df_country.head()


In [8]:
#df_country.head()

```

```

# Inventory

In [9]:
# Load the property inventory, index it by PropertyId and split it into
# `numcores` partitions.
inventory_file = os.path.join(data_root, 'travcoding/Properties.parquet')
inventory_ddf = dd.read_parquet(inventory_file, engine='pyarrow')
inventory_ddf = inventory_ddf.set_index('PropertyId')
inventory_ddf = inventory_ddf.repartition(npartitions=numcores)

# Fill missing ratings with '0' before the int32 cast below.
inventory_ddf['StarRating'] = inventory_ddf['StarRating'].fillna('0')

# Fix the column dtypes, then switch the renamed columns to lowercase names.
inventory_ddf = (
    inventory_ddf
    .astype({
        'nameshort': 'string',
        'namefull': 'string',
        'propertytype': 'string',
        'latitud': 'float32',
        'longitud': 'float32',
        'Address': 'string',
        'ZipCode': 'string',
        'CityCode': 'string',
        'City': 'string',
        'StateCode': 'string',
        'State': 'string',
        'CountryCode': 'string',
        'Country': 'string',
        'Email': 'string',
        'Phone': 'string',
        'Fax': 'string',
        'Website': 'string',
        'StarRating': 'int32',
    })
    .rename(columns={
        'latitud': 'lat',
        'longitud': 'lng',
        'Address': 'address',
        'ZipCode': 'zipcode',
        'City': 'city',
        'State': 'state',
        'Country': 'country',
        'Email': 'email',
        'Phone': 'phone',
        'Fax': 'fax',
        'Website': 'website',
        'StarRating': 'starrating',
        'CountryCode': 'countrycode',
    })
)

#inventory_ddf.head()

In [10]:
#inventory_ddf.index.size.compute()

In [11]:
#inventory_ddf.address.to_csv(os.path.join(data_root, 'travcoding/address.csv'), index=False)

### Clean Inventory

In [12]:
# Country
# Treat a missing country code as an empty string so it simply finds no match below.
inventory_ddf['countrycode'] = inventory_ddf['countrycode'].fillna('')
# Left-join against df_country (indexed by ISO code) to fetch the cleaned country
# name; codes without a match end up as ''.
# NOTE(review): .compute() materialises the looked-up column as an in-memory pandas
# Series before it is assigned back to the Dask frame — confirm this is intended.
inventory_ddf['countrycorregido'] = inventory_ddf.merge(df_country, left_on='countrycode', right_index=True , how="left")['countrycorregido'].fillna('').compute()
# Keep the looked-up name where it is non-empty; otherwise fall back to the
# inventory's own `country` text.
inventory_ddf['countrycorregido'] = inventory_ddf['countrycorregido'].where(inventory_ddf['countrycorregido'] > '',  inventory_ddf['country'].fillna('') )
# Run the result through the same cleaning as every other text column.
inventory_ddf['countrycorregido'] = CleanStringCol(inventory_ddf['countrycorregido'])


In [13]:
#inventory_ddf.index.size.compute()

In [14]:
# Missing coordinates get the sentinel value 9999.
inventory_ddf['lat'] = inventory_ddf['lat'].fillna(9999)
inventory_ddf['lng'] = inventory_ddf['lng'].fillna(9999)

# `starrating` needs no fill here: it was filled with '0' and cast to int32 when
# the inventory was loaded, so filling the integer column again with the string
# '0' was redundant and type-inconsistent.

# propertyname / propertytype: cleaned directly, nulls replaced afterwards.
inventory_ddf['propertyname'] = CleanStringCol(inventory_ddf['namefull']).fillna('')
inventory_ddf['propertytype'] = CleanStringCol(inventory_ddf['propertytype']).fillna('')

# Remaining text columns: fill nulls, clean, fill again. Each entry lists the
# CleanStringCol arguments that differ from the defaults for that column.
for _column, _clean_kwargs in [
    ('email', {'replace_by_none': email_regex}),
    ('phone', {'replace_by_none': phone_regex}),
    ('fax', {'replace_by_none': phone_regex}),
    ('website', {'replace_by_none': website_regex}),
    ('state', {}),
    ('zipcode', {}),
    ('address', {'replace_by_none': address_regex, 'remove_brackets': True}),
    ('city', {}),
]:
    inventory_ddf[_column] = CleanStringCol(inventory_ddf[_column].fillna(''), **_clean_kwargs).fillna('')

# Drop the columns that are no longer needed once the cleaned ones exist.
inventory_ddf = inventory_ddf.drop(columns=['nameshort', 'namefull', 'CityCode', 'StateCode', 'country'])



In [15]:
#inventory_ddf.index.size.compute()

In [16]:
#inventory_ddf.head()

# Save Inventory

In [17]:

# Write the cleaned inventory next to the source data as a parquet dataset (pyarrow engine).
inventory_clean_file = os.path.join(data_root, 'travcoding/properties_clean.parquet')
inventory_ddf.to_parquet(inventory_clean_file, engine='pyarrow')


# Provider

In [18]:

# Load the per-provider property records, index them by PropertyByProviderId and
# split them into `numcores` partitions.
providers_file = os.path.join(data_root, 'travcoding/PropertiesByProvider.parquet')
providers_ddf = dd.read_parquet(providers_file, engine='pyarrow')
providers_ddf = providers_ddf.set_index('PropertyByProviderId')
providers_ddf = providers_ddf.repartition(npartitions=numcores)

# Star rating arrives as text: missing/'null' become '0' and the listed decimal
# spellings are mapped to whole-star strings (half stars map to the lower star)
# ahead of the int32 cast below.
providers_ddf['starrating'] = (
    providers_ddf['starrating']
    .fillna('0')
    .replace({
        'null': '0',
        '0.0': '0',
        '1.0': '1',
        '1.5': '1',
        '2.0': '2',
        '2.5': '2',
        '3.0': '3',
        '3.5': '3',
        '4.0': '4',
        '4.5': '4',
        '5.0': '5',
        '5.5': '5',
        '6.0': '6',
    })
)

# The literal text 'null' in a coordinate becomes the sentinel '9999'.
for _coordinate in ('lat', 'lng'):
    providers_ddf[_coordinate] = providers_ddf[_coordinate].replace({'null': '9999'})

# Fix the column dtypes, then normalise the column names.
providers_ddf = (
    providers_ddf
    .astype({
        'ProviderId': 'string',
        #'PropertyByProviderId': 'int32',
        'propertytype': 'string',
        'name': 'string',
        'lat': 'float32',
        'lng': 'float32',
        'address': 'string',
        'zipcode': 'string',
        'citycode': 'string',
        'cityname': 'string',
        'statecode': 'string',
        'statename': 'string',
        'countrycode': 'string',
        'countryname': 'string',
        'email': 'string',
        'phone': 'string',
        'fax': 'string',
        'website': 'string',
        'starrating': 'int32',
    })
    .rename(columns={
        'cityname': 'city',
        'statename': 'state',
        'countryname': 'country',
        'Language': 'language',
        'ProviderId': 'providerid',
        'PropertyId': 'propertyid',
    })
)



In [19]:
# Display name: fill missing values, then derive the cleaned `propertyname` column.
providers_ddf['name'] = providers_ddf['name'].fillna('')
providers_ddf['propertyname'] = CleanStringCol(providers_ddf['name'])

# Country: look up the cleaned country name by code (left join against df_country);
# where the lookup yields nothing, fall back to the provider's own `country` text,
# then clean the result.
# NOTE(review): .compute() materialises the looked-up column as an in-memory pandas
# Series before it is assigned back to the Dask frame — confirm this is intended.
providers_ddf['countrycode'] = providers_ddf['countrycode'].fillna('')
providers_ddf['countrycorregido'] = providers_ddf.merge(df_country, left_on='countrycode', right_index=True , how="left")['countrycorregido'].fillna('').compute()
providers_ddf['countrycorregido']= providers_ddf['countrycorregido'].where(providers_ddf['countrycorregido'] > '',  providers_ddf['country'].fillna('') )
providers_ddf['countrycorregido'] = CleanStringCol(providers_ddf['countrycorregido'])

# Property type: use `propertytype2` where `propertytype` is missing.
# The fallback has to be resolved BEFORE nulls are filled and the text is cleaned:
# previously the column was filled with '' first, so the later `notnull()` test was
# always true and `propertytype2` was never used.
_propertytype_raw = providers_ddf['propertytype'].where(
    providers_ddf['propertytype'].notnull(),
    providers_ddf['propertytype2'].astype('string'),
)
providers_ddf['propertytype'] = CleanStringCol(_propertytype_raw.fillna(''))

# Remaining text columns: fill nulls, clean, fill again. Each entry lists the
# CleanStringCol arguments that differ from the defaults for that column.
for _column, _clean_kwargs in [
    ('state', {}),
    ('city', {}),
    ('email', {'replace_by_none': email_regex}),
    ('phone', {'replace_by_none': phone_regex}),
    ('fax', {'replace_by_none': phone_regex}),
    ('website', {'replace_by_none': website_regex}),
    ('address', {'replace_by_none': address_regex, 'remove_brackets': True}),
    ('zipcode', {}),
]:
    providers_ddf[_column] = CleanStringCol(providers_ddf[_column].fillna(''), **_clean_kwargs).fillna('')

# Drop the columns that are no longer needed once the cleaned ones exist.
providers_ddf = providers_ddf.drop(columns=[ 'propertytype2', 'citycode', 'statecode', 'country','name','propertyid2' ])  




In [20]:
#providers_ddf.providerid.value_counts().compute()

# Save providers

In [21]:
# Write the cleaned provider records next to the source data as a parquet dataset (pyarrow engine).
providers_clen_file = os.path.join(data_root, 'travcoding/providers_clean.parquet')
providers_ddf.to_parquet(providers_clen_file, engine ='pyarrow')

# Otras pruebas

In [22]:
# Para hacer pruebas lo convierto a pandas

# inventory_pdf = inventory_ddf.compute()
# providers_pdf = providers_ddf.compute()
# p1= providers_pdf
# p1[p1.country_corregido == ''].countrycode.unique()

# #['NA', 'UK', 'NY', 'C0', 'SF', 'BK', 'KV', '']



In [23]:
#['UK', 'NY', 'C0', 'SF', 'BK', 'KV', '']
#UK --> GB
#NY --> 
#C0 --> 
#SF --> ZA
#BK --> BA
#KV --> XK


In [24]:
# # Character frequency (frecuencia de caracteres)
# char_frec = p1.address.fillna(' ').str.split().explode().apply(lambda x: list(str(x))).explode().value_counts()
# char_frec.to_csv('data/travcoding/char_frec.csv')

In [25]:
# Total run time: difference between now and the `start_time` captured near the top of the notebook.
time_elapsed = datetime.now() - start_time
print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))

Time elapsed (hh:mm:ss.ms) 0:02:39.980190
