In [1]:
import dask.dataframe as dd
import dask.array as da
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode
import matplotlib.pyplot as plt
import math
import os 
import jellyfish as jf


%matplotlib inline
# Degree of parallelism; used further down as the number of dask partitions.
numcores = 16
# "tiene_gpu" = "has GPU". Not referenced in the visible cells.
tiene_gpu = False

# Raise the pandas display limits so wide / long frames are shown in full.
pd.options.display.max_columns = 99
pd.options.display.max_rows = 100



In [2]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Root folder of the project data. The default is the location on the Jupyter
# server; set the HOTELMAPPING_DATA_ROOT environment variable to point the
# notebook at another copy of the data without editing this cell.
data_root = os.environ.get('HOTELMAPPING_DATA_ROOT', 'C:/code/hotelmapping/data/')


In [15]:
# Parquet inputs, all located under <data_root>/travcoding.
pair_candidates_file = os.path.join(data_root, 'travcoding/pair_candidates.parquet')
inventory_clean_file = os.path.join(data_root, 'travcoding/properties_clean.parquet')
providers_clean_file = os.path.join(data_root, 'travcoding/Providers_clean.parquet')

# Load the three tables lazily and spread each one over `numcores` partitions.
# `npartitions` is passed by keyword: the first positional parameter of
# dask's `repartition` is `divisions`, so a bare positional integer only works
# where dask special-cases it.
inventory_ddf = dd.read_parquet(inventory_clean_file, engine='pyarrow').repartition(npartitions=numcores)
provider_ddf = dd.read_parquet(providers_clean_file, engine='pyarrow').repartition(npartitions=numcores)
candidates_ddf = dd.read_parquet(pair_candidates_file, engine='pyarrow').repartition(npartitions=numcores)




In [5]:
# Number of rows in the inventory table (loaded from inventory_clean_file).
# `.compute()` executes the dask graph to obtain the value.
inventory_ddf.index.size.compute()

523935

In [16]:
# Number of rows in the provider table; `.compute()` executes the dask graph.
provider_ddf.index.size.compute()

703021

In [17]:
# Number of candidate pairs before the joins; `.compute()` executes the dask graph.
candidates_ddf.index.size.compute()

291923

In [6]:
# Attach the inventory attributes (matched on PropertyId) and then the provider
# attributes (matched on PropertyByProviderId) to every candidate pair. Both
# joins are inner joins against the right-hand index, so a candidate without a
# match in either table is dropped.
# NOTE(review): the `_x` / `_y` column suffixes used below presumably come from
# the second merge (`_x` = inventory side, `_y` = provider side) - confirm.
candidates = (
    candidates_ddf
    .merge(inventory_ddf, how='inner', left_on=['PropertyId'], right_index=True)
    .merge(provider_ddf, how='inner', left_on=['PropertyByProviderId'], right_index=True)
)


In [7]:
# Column selection: identifiers, idf statistics, then the `_x` / `_y` pair of
# each compared attribute. The order of the entries is kept exactly as it was.
# Not selected (previously commented out): countrycode_x, countrycode_y,
# propertyid, language.
cols = [
    # identifiers
    'PropertyId', 'providerid', 'PropertyByProviderId',
    # idf statistics of the candidate pair
    'idf_count', 'idf_sum', 'idf_max', 'idf_mean',
    # attribute pairs
    'countrycorregido_x', 'countrycorregido_y',
    'propertytype_x', 'propertytype_y',
    'propertyname_x', 'propertyname_y',
    'lat_x', 'lat_y',
    'lng_x', 'lng_y',
    'address_x', 'address_y',
    'zipcode_y', 'zipcode_x',
    'city_x', 'city_y',
    'state_y', 'state_x',
    'starrating_y', 'starrating_x',
    'email_x', 'email_y',
    'phone_x', 'phone_y',
    'fax_y', 'fax_x',
    'website_x', 'website_y',
]

In [8]:
# distance in kilometers
def haversine_np(lon1, lat1, lon2, lat2):
    """Great-circle distance between two points using the haversine formula.

    The four arguments are angles in degrees. They are converted with
    ``np.radians`` and combined only through NumPy functions and arithmetic,
    so scalars and array-like inputs (element-wise) are both accepted.

    Returns ``12734 * arcsin(sqrt(a))`` where ``a`` is the haversine term.
    12734 is 2 * 6367 (presumably the Earth radius in km), which gives a
    distance in kilometers.
    """
    lam1 = np.radians(lon1)
    phi1 = np.radians(lat1)
    lam2 = np.radians(lon2)
    phi2 = np.radians(lat2)

    half_dlat = (phi2 - phi1) / 2.0
    half_dlon = (lam2 - lam1) / 2.0

    hav = np.sin(half_dlat)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlon)**2
    return 12734 * np.arcsin(np.sqrt(hav))

#Similarity location
def simi_location(distance):
    """Turn a distance into a similarity score: ``1 / (1 + 100 * distance**2)``.

    A distance of 0 scores 1 and the score decreases towards 0 as the absolute
    distance grows. Only arithmetic is used, so array-like inputs are handled
    element-wise.
    """
    scaled = 100 * distance
    return 1 / (1 + scaled * distance)


# Similarity between two strings, based on the Levenshtein edit distance.
def simi_levenshtein(s1, s2):
    """Return a similarity score derived from the Levenshtein distance.

    The score is ``1 - distance / max(len(s1), len(s2))``: 1 for identical
    strings, lower the more edits are needed to turn one into the other.

    Returns 0 when either value is empty or is not a string at all. Missing
    values (None, NaN, ``pd.NA``) fall in the second case; without this guard
    ``len`` raises ``TypeError`` on them and aborts the whole row-wise apply.
    """
    if not isinstance(s1, str) or not isinstance(s2, str):
        return 0
    if not s1 or not s2:
        return 0
    if s1 == s2:
        # Shortcut: no need to run the edit-distance computation.
        return 1
    longest = max(len(s1), len(s2))
    return 1 - jf.levenshtein_distance(s1, s2) / longest

# # Calcula la similitud entre dos strings usando leventein
# def simi_levenshtein(s1, s2):
#     if len(s2) == 0 or len(s1) == 0:
#         return 0    
#     if s1 == s2:
#         return 1
#     return 1-(jf.levenshtein_distance(s1,s2)/max(len(s1),len(s2)))


# Similarity between two star ratings.
def simi_stars(star1, star2):
    """Return ``1 / (1 + d**2)`` where ``d`` is the difference of the ratings.

    Equal ratings score 1; the score decreases as the ratings move apart.
    When either rating is missing, ``d`` is set to 9999 so the score is
    practically 0. Missing is detected with ``pd.isna``, which covers None,
    NaN and ``pd.NA``; ``math.isnan`` raises ``TypeError`` on None and
    ``pd.NA``.
    """
    if pd.isna(star1) or pd.isna(star2):
        d = 9999
    else:
        d = star1 - star2
    # The exponent is even, so the sign of d does not matter (no abs() needed).
    return 1 / (1 + d**2)


In [9]:
# Manual comparison of several string metrics on one pair of addresses.
# Earlier example pairs (superseded by the pair below):
#   'no. 109 wenyuan east street' / 'no. 188 grand west street'
#   'no. 109 wenyuan east '       / 'no. 188 grand west '
s1 = 'rua bambuzal 21 praia de geriba'
s2 = 'rua bambuzal 21'

longest = max(len(s1), len(s2))
print('longitud maxima =', longest)
print('Leventein=', jf.levenshtein_distance(s1, s2))
print('demerau=', jf.damerau_levenshtein_distance(s1, s2))
print('hamming=', jf.hamming_distance(s1, s2))
print('jaro =', jf.jaro_similarity(s1, s2))
print('jaro winkler=', jf.jaro_winkler_similarity(s1, s2))
# Score produced by the notebook's own similarity function.
print(simi_levenshtein(s1, s2))



longitud maxima = 31
Leventein= 16
demerau= 16
hamming= 16
jaro = 0.8279569892473119
jaro winkler= 0.8967741935483872
0.4838709677419355


In [10]:
# Distance between the `_x` and `_y` coordinates of every candidate pair.
candidates['loc_distance'] = haversine_np(
    lon1=candidates['lng_x'],
    lat1=candidates['lat_x'],
    lon2=candidates['lng_y'],
    lat2=candidates['lat_y'],
)

In [11]:
def _pair_similarity(frame, field, similarity):
    """Row-wise ``similarity(<field>_x, <field>_y)`` as a lazy float32 series.

    The column names are bound inside this function, so each lambda keeps its
    own field even though dask evaluates it later.
    """
    left, right = field + '_x', field + '_y'
    return frame.apply(
        lambda row: similarity(row[left], row[right]),
        axis=1,
        meta=(None, 'float32'),
    ).astype(np.float32)


# One `<field>_sim` column per compared attribute. The columns are created in
# the same order as before so the layout of `candidates` does not change.
candidates['propertyname_sim'] = _pair_similarity(candidates, 'propertyname', simi_levenshtein)
candidates['address_sim'] = _pair_similarity(candidates, 'address', simi_levenshtein)
# simi_location is plain arithmetic, so it is applied to the whole column at
# once instead of row by row.
candidates['loc_sim'] = simi_location(candidates['loc_distance']).astype(np.float32)
candidates['city_sim'] = _pair_similarity(candidates, 'city', simi_levenshtein)
candidates['starrating_sim'] = _pair_similarity(candidates, 'starrating', simi_stars)
for field in ('state', 'zipcode', 'email', 'phone', 'fax', 'website'):
    candidates[field + '_sim'] = _pair_similarity(candidates, field, simi_levenshtein)



In [None]:
# Show the column names of `candidates` (no data is computed for this).
candidates.columns

In [12]:
# Columns to inspect: identifiers, then for each attribute its two source
# values followed by the derived distance / similarity column(s).
cols2 = [
    'PropertyId', 'providerid', 'PropertyByProviderId',
    'countrycorregido_x', 'countrycorregido_y',
    'propertytype_x', 'propertytype_y',
    'propertyname_x', 'propertyname_y', 'propertyname_sim',
    'lat_x', 'lat_y', 'lng_x', 'lng_y', 'loc_distance', 'loc_sim',
    'address_x', 'address_y', 'address_sim',
    'zipcode_y', 'zipcode_x', 'zipcode_sim',
    'city_x', 'city_y', 'city_sim',
    'state_y', 'state_x', 'state_sim',
    'starrating_y', 'starrating_x', 'starrating_sim',
]

# Preview up to 20 candidates of one provider.
# - The columns are selected before `head`, so only those are needed.
# - `npartitions=-1` makes `head` look at every partition. By default dask
#   inspects only the first partition, so a filtered preview can come back
#   with fewer rows than exist (or empty) even when other partitions match.
#   This is slower, because the filter is evaluated on all partitions.
candidates.loc[candidates.providerid == 'NTP'][cols2].head(20, npartitions=-1)

#& (candidates.loc_sim > .1)






Unnamed: 0,PropertyId,providerid,PropertyByProviderId,countrycorregido_x,countrycorregido_y,propertytype_x,propertytype_y,propertyname_x,propertyname_y,propertyname_sim,lat_x,lat_y,lng_x,lng_y,loc_distance,loc_sim,address_x,address_y,address_sim,zipcode_y,zipcode_x,zipcode_sim,city_x,city_y,city_sim,state_y,state_x,state_sim,starrating_y,starrating_x,starrating_sim


In [14]:
# Number of candidate pairs per provider.
candidates['providerid'].value_counts().compute()

EXP     262792
HBD      29130
EXPS         1
Name: providerid, dtype: Int64

In [None]:
#candidates[candidates.loc_distance > 0.5].head()

In [None]:
#cand['propertyname_sim'] = cand.apply(lambda x: 1-jellyfish.levenshtein_distance(x['propertyname_x'],x['propertyname_y'])/max([len(x['propertyname_x']), len(x['propertyname_y'])]),axis=1).astype(np.float32)
#cand.head()[['propertyname_x', 'propertyname_y', 'propertyname_sim']]
#candidates.head()[['propertyname_x', 'propertyname_y', 'propertyname_sim']]

#cand['address_sim'] = cand.apply(lambda x: 1-jellyfish.levenshtein_distance(x['address_x'],x['address_y'])/max([len(x['address_x']), len(x['propertyname_y'])]),axis=1).astype(np.float32)
#cand.head(50)[['address_x', 'address_y', 'address_sim']]
#candidates.head()[['address_x', 'address_y', 'address_sim']]