In [None]:
import numpy as np
import pandas as pd
import datetime
import boto3
import os

In [None]:
#Gets coordinates for the addresses in the dataframe from location.csv.
#If the coordinates are not in that file, AWS Location Service is used to retrieve them
#and they are then stored in the location.csv file
def get_coordinates_from_file(df:pd.DataFrame) -> pd.DataFrame:
    """Fill missing coordinates in *df* from the local ``location.csv`` cache.

    Rows whose ``latitude`` is NaN are looked up by ``address`` in
    ``location.csv`` (read from the current working directory). When the
    address is cached, its ``latitude`` and ``longitude`` are copied into
    *df*; rows whose address is not cached are left untouched.

    Args:
        df: Frame with ``address``, ``latitude`` and ``longitude`` columns.
            It is modified in place.

    Returns:
        The same *df* object that was passed in.

    Raises:
        KeyError: If *df* has no ``address`` or ``latitude`` column.
    """
    #Only a missing or unreadable cache is tolerated here. A bare except
    #would also hide unrelated errors behind the "does not exist" message
    try:
        location_df = pd.read_csv('location.csv')
    except FileNotFoundError:
        print('location.csv does not exist')
        return df
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as error:
        print(f'location.csv could not be read: {error}')
        return df

    missing_columns = {'address', 'latitude', 'longitude'} - set(location_df.columns)
    if missing_columns:
        print(f'location.csv is missing columns: {sorted(missing_columns)}')
        return df

    #Builds the address -> (latitude, longitude) lookup once instead of
    #filtering location_df twice for every row. The first occurrence of a
    #repeated address wins, and cache rows without an address never match
    known = location_df.dropna(subset=['address'])
    known = known.drop_duplicates(subset='address', keep='first')
    coordinates_by_address = dict(
        zip(known['address'], zip(known['latitude'], known['longitude']))
    )

    for row_index in df.index[df['latitude'].isna()]:
        coordinates = coordinates_by_address.get(df.at[row_index, 'address'])
        if coordinates is None:
            continue
        df.at[row_index, 'latitude'] = coordinates[0]
        df.at[row_index, 'longitude'] = coordinates[1]

    return df
    
def get_coordinates_from_aws(df:pd.DataFrame, max_requests:int=100) -> pd.DataFrame:
    """Geocode rows without coordinates through AWS Location Service.

    Each row whose ``latitude`` is NaN and whose ``address`` is present is
    sent to the ``rtm-index`` place index (restricted to GBR addresses), and
    the first result's coordinates are written back into *df*.

    Args:
        df: Frame with ``address``, ``latitude`` and ``longitude`` columns.
            It is modified in place.
        max_requests: Maximum number of rows geocoded per call. Defaults to
            100, the limit that was previously hard-coded.

    Returns:
        The same *df* object that was passed in.

    Raises:
        KeyError: If *df* has no ``address`` or ``latitude`` column.
        Exception: Errors raised by the boto3 request itself are not caught.
    """
    #Calls AWS LocationService
    location = boto3.client('location')

    #Geocodes addresses for rows with latitude value = None. Rows without an
    #address are skipped because the request requires a text query
    needs_geocoding = df['latitude'].isna() & df['address'].notna()
    for row in df.index[needs_geocoding][:max_requests]:
        #`row` is an index label, so label-based access (.at) is required;
        #positional .iloc is only equivalent for a default RangeIndex
        address = df.at[row, 'address']
        response = location.search_place_index_for_text(
            FilterCategories=[
                'AddressType',
                'StreetType',
                'PostalCodeType',
            ],
            FilterCountries=[
                'GBR',
            ],
            IndexName='rtm-index',
            Key=os.getenv('AWS_GEOCODING_KEY'),
            Language='en',
            Text=address,
        )
        try:
            #AWS returns Point as [longitude, latitude] (x, y order)
            #NOTE(review): earlier versions stored these two values swapped;
            #previously saved coordinates may need to be checked
            longitude, latitude = response['Results'][0]['Place']['Geometry']['Point']
        except (IndexError, KeyError, TypeError, ValueError):
            print(f"Coordinates not found for address {address} at index {row}")
            continue

        df.at[row, 'latitude'] = latitude
        df.at[row, 'longitude'] = longitude
        print(f"{address} -- {latitude} -- {longitude}")
    return df

#Saves coordinates in a .csv file with 'address','latitude' and 'longitude' fields
#Entries already stored in location.csv are kept, so a run in which geocoding
#failed does not wipe previously cached coordinates
def save_locations_to_file(df):
    """Merge the geocoded rows of *df* into the ``location.csv`` cache.

    Rows of *df* with a non-NaN ``latitude`` are combined with the rows
    already stored in ``location.csv`` (current working directory) and the
    result is written back. When an address appears more than once, the
    first occurrence is kept, with rows from *df* ahead of rows from the
    existing file.

    Args:
        df: Frame with ``address``, ``latitude`` and ``longitude`` columns.

    Raises:
        KeyError: If *df* lacks one of the three columns.
    """
    columns = ['address', 'latitude', 'longitude']
    geocoded = df.loc[df['latitude'].notna(), columns]

    #A missing or unreadable cache simply contributes no rows
    try:
        cached = pd.read_csv('location.csv')
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError):
        cached = None

    frames = [geocoded]
    if cached is not None and set(columns).issubset(cached.columns):
        frames.append(cached.loc[cached['latitude'].notna(), columns])

    #Empty frames are left out of the concatenation so they cannot affect
    #the resulting column dtypes
    non_empty = [frame for frame in frames if not frame.empty]
    merged = pd.concat(non_empty, ignore_index=True) if non_empty else geocoded
    merged = merged.drop_duplicates(subset='address', keep='first')
    merged.to_csv('location.csv', index=False)




In [None]:
#Gets the path of current directory
#NOTE(review): `dir` shadows the builtin of the same name; the name is kept
#because other cells may rely on it
dir=os.path.realpath('.')
#Lists files in current directory, excluding child directories
#next() stops os.walk after the top-level entry instead of walking the whole tree
list_files=next(os.walk(dir),(dir,[],[]))[2]
#Selects final_data csv file name
final_data_candidates=[file for file in list_files if 'final_data' in file]
#The emptiness check has to run before indexing: indexing an empty list raises
#IndexError, so the intended error below would never be reached
if not final_data_candidates:
    raise Exception('There is not final data file')
final_data_file_name=final_data_candidates[0]

#Generates dataframe from file
df=pd.read_csv(final_data_file_name)

#Checks for latitude columns. If it already exists does not set it to None
if 'latitude' not in df.columns:
    df['latitude']=np.nan
    df['longitude']=np.nan
    print('Columns latitude and longitude added to dataframe')
else:
    print('Columns latitude and longitude already exist in the dataframe')

In [None]:
#Fills in coordinates for the addresses in the dataframe: cached values from
#location.csv first, then AWS Location Service for the rows still missing them
df=get_coordinates_from_aws(get_coordinates_from_file(df))
#Stores the address/coordinate pairs, then writes the dataframe back to its file
save_locations_to_file(df)
df.to_csv(final_data_file_name,index=False)
