# Data load and python code optimization

## Time optimization 

In [37]:
#Load data
spd_opt_data = pd.read_csv('data/dataset_speed_optimization.csv')
#Extract only the needed features
spd_opt_data=spd_opt_data[['latitude','longitude']]

#### First round: 10,5 s

In [32]:
%%timeit
#Function to apply to all rows in the DataFrame
def functiontoapply_first(lat,lon):
    a=np.sin(lat/2)**2+np.cos(lat)*np.cos(lon)*np.sin(lon/2)**2
    return a
 
#Add new column to the DataFrame
listresults=[]
for i in range ( 0 ,len(spd_opt_data)):
    r = functiontoapply_first(used_data.iloc[i]['latitude'],used_data.iloc[i]['longitude'])
    listresults.append(r)
spd_opt_data['distance']=listresults

10.5 s ± 462 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Second round: 4.72 s

In [33]:
%%timeit
def functiontoapply_second(lat,lon):
    return np.sin(lat/2)**2+np.cos(lat)*np.cos(lon)*np.sin(lon/2)**2
    
spd_opt_data['distance']=0

for index, row in spd_opt_data.iterrows():
    spd_opt_data.at[index, 'distance']= functiontoapply(row['latitude'],row['longitude'])

4.72 s ± 519 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### third round: 5.64 ms

In [38]:
%%timeit
def functiontoapply_third(lat,lon):
    return np.sin(lat/2)**2+np.cos(lat)*np.cos(lon)*np.sin(lon/2)**2
spd_opt_data['distances']= functiontoapply_third(spd_opt_data['latitude'], spd_opt_data['longitude'])

5.64 ms ± 181 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Memory optimization

#### Initial load

In [190]:
#Load data
mem_opt_data = pd.read_csv('data/dataset_memory_optimization.csv')

In [191]:
mem_opt_data .head()

Unnamed: 0,ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate
0,269955,Hilton Garden Inn Albany/SUNY Area,1389 Washington Ave,Albany,NY,12206,42.68751,-73.81643,3.0,154.0272,124.0216
1,113431,Courtyard by Marriott Albany Thruway,1455 Washington Avenue,Albany,NY,12206,42.68971,-73.82021,3.0,179.01,134.0
2,108151,Radisson Hotel Albany,205 Wolf Rd,Albany,NY,12205,42.7241,-73.79822,3.0,134.17,84.16
3,254756,Hilton Garden Inn Albany Medical Center,62 New Scotland Ave,Albany,NY,12208,42.65157,-73.77638,3.0,308.2807,228.4597
4,198232,CrestHill Suites SUNY University Albany,1415 Washington Avenue,Albany,NY,12206,42.68873,-73.81854,3.0,169.39,89.39


In [192]:
mem_opt_data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 417536 entries, 0 to 417535
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ean_hotel_id    417536 non-null  int64  
 1   name            417536 non-null  object 
 2   address1        417536 non-null  object 
 3   city            417536 non-null  object 
 4   state_province  417536 non-null  object 
 5   postal_code     417536 non-null  object 
 6   latitude        417536 non-null  float64
 7   longitude       417536 non-null  float64
 8   star_rating     417280 non-null  float64
 9   high_rate       417536 non-null  float64
 10  low_rate        417536 non-null  float64
dtypes: float64(5), int64(1), object(5)
memory usage: 157.2 MB


In [193]:
mem_opt_data.memory_usage(deep=True).sum()

164885632

In [100]:
# Min and max value to now the optimized type.
print('ean_hotel_id  max:'+str(mem_opt_data['ean_hotel_id'].max())+'--- min:'+str(mem_opt_data['ean_hotel_id'].min()))
print('high_rate  max:'+str(mem_opt_data['high_rate'].max())+'--- min:'+str(mem_opt_data['high_rate'].min()))
print('low_rate  max:'+str(mem_opt_data['low_rate'].max())+'--- min:'+str(mem_opt_data['low_rate'].min()))
print('star_rating  max:'+str(mem_opt_data['star_rating'].max())+'--- min:'+str(mem_opt_data['star_rating'].min()))
#print('star_rating:  '+ str(mem_opt_data["address1"].str.len().max()))
#print('name:   '+ str(mem_opt_data["name"].str.len().max()))

ean_hotel_id  max:685047--- min:6295
high_rate  max:10888.5--- min:0.0
low_rate  max:5990.25--- min:0.0
star_rating  max:5.0--- min:1.0


#### Optimized load

In [194]:
#Optimized 
mem_opt_data_opt = pd.read_csv('data/dataset_memory_optimization.csv', 
                                   dtype={"ean_hotel_id": "uint32",
                                            "name": "category",
                                            "address1": "category",
                                            "city": "category",  
                                            "state_province": "category",
                                            "postal_code": "category",
                                            "latitude": "float16",
                                            "longitude": "float16", 
                                            "star_rating": "float16",
                                            "high_rate": "float32", 
                                            "low_rate": "float32"})

In [195]:
mem_opt_data_opt.head()

Unnamed: 0,ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate
0,269955,Hilton Garden Inn Albany/SUNY Area,1389 Washington Ave,Albany,NY,12206,42.6875,-73.8125,3.0,154.027206,124.021599
1,113431,Courtyard by Marriott Albany Thruway,1455 Washington Avenue,Albany,NY,12206,42.6875,-73.8125,3.0,179.009995,134.0
2,108151,Radisson Hotel Albany,205 Wolf Rd,Albany,NY,12205,42.71875,-73.8125,3.0,134.169998,84.160004
3,254756,Hilton Garden Inn Albany Medical Center,62 New Scotland Ave,Albany,NY,12208,42.65625,-73.75,3.0,308.280701,228.459702
4,198232,CrestHill Suites SUNY University Albany,1415 Washington Avenue,Albany,NY,12206,42.6875,-73.8125,3.0,169.389999,89.389999


In [196]:
mem_opt_data_opt.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 417536 entries, 0 to 417535
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   ean_hotel_id    417536 non-null  uint32  
 1   name            417536 non-null  category
 2   address1        417536 non-null  category
 3   city            417536 non-null  category
 4   state_province  417536 non-null  category
 5   postal_code     417536 non-null  category
 6   latitude        417536 non-null  float16 
 7   longitude       417536 non-null  float16 
 8   star_rating     417280 non-null  float16 
 9   high_rate       417536 non-null  float32 
 10  low_rate        417536 non-null  float32 
dtypes: category(5), float16(3), float32(2), uint32(1)
memory usage: 11.2 MB


In [90]:
mem_opt_data_opt.memory_usage(deep=True).sum()

14684052

In [99]:
# CHeck wether the min and amx on the new type
print('ean_hotel_id  max:'+str(mem_opt_data_opt['ean_hotel_id'].max())+'--- min:'+str(mem_opt_data_opt['ean_hotel_id'].min()))
print('high_rate  max:'+str(mem_opt_data_opt['high_rate'].max())+'--- min:'+str(mem_opt_data_opt['high_rate'].min()))
print('low_rate  max:'+str(mem_opt_data_opt['low_rate'].max())+'--- min:'+str(mem_opt_data_opt['low_rate'].min()))
print('star_rating  max:'+str(mem_opt_data_opt['star_rating'].max())+'--- min:'+str(mem_opt_data_opt['star_rating'].min()))
#print('star_rating:  '+ str(mem_opt_data["address1"].str.len().max()))
#print('name:   '+ str(mem_opt_data["name"].str.len().max()))

ean_hotel_id  max:685047--- min:6295
high_rate  max:10888.5--- min:0.0
low_rate  max:5990.25--- min:0.0
star_rating  max:5.0--- min:1.0
