In [472]:
from __future__ import annotations
import pandas as pd
import csv
from haversine import haversine

# 1. Cleanup

### Data before cleanup:

In [473]:
pd.read_csv('./data/DataSample.csv')

Unnamed: 0,_ID,TimeSt,Country,Province,City,Latitude,Longitude
0,4516516,2017-06-21 00:00:00.143,CA,ON,Waterloo,43.49347,-80.49123
1,4516547,2017-06-21 18:00:00.193,CA,ON,London,42.93990,-81.27090
2,4516550,2017-06-21 15:00:00.287,CA,ON,Guelph,43.57760,-80.22010
3,4516600,2017-06-21 15:00:00.307,CA,ON,Stratford,43.37160,-80.97730
4,4516613,2017-06-21 15:00:00.497,CA,ON,Stratford,43.37160,-80.97730
...,...,...,...,...,...,...,...
22020,5614801,2017-06-21 12:23:07.880,CA,ON,Saint Catharines,43.16440,-79.24560
22021,5614888,2017-06-21 08:23:01.793,CA,AB,Calgary,51.02093,-114.10621
22022,5614909,2017-06-21 00:23:07.903,CA,ON,Whitby,43.88730,-78.94220
22023,5614912,2017-06-21 11:23:07.953,CA,ON,Oakville,43.49340,-79.71260


#### Place the data into a dictionary to filter it for duplicates

In [474]:
def filter_data(file: str) -> dict[str, list]:
    """Returns a dictionary of the data with the suspicious entries filtered out"""
    raw_data = pd.read_csv(file)
    seen = {}
    
    for index, row in raw_data.iterrows():
        formatted = f'{row[" TimeSt"]}, {row["Latitude"]} {row["Longitude"]}'
        
        if formatted in seen:
            seen[formatted] = []
        else:
            seen.setdefault(formatted, []).append(row)
    return seen

#### After filtering the data, write it to a new file

In [475]:
def cleanup(file: str) -> None:
    """Filter the data from <file> and write it to a new file"""
    filtered_data = filter_data(file)
    
    with open('./data/FilteredDataSample.csv', 'w') as f:
        csv_writer = csv.writer(f)
        header = ['_ID', ' TimeSt', 'Country', 'Province', 'City', 'Latitude', 'Longitude']
        csv_writer.writerow(header)
        
        for key in filtered_data:
            if filtered_data[key]:
                csv_writer.writerows(filtered_data[key])

In [476]:
cleanup('./data/DataSample.csv')

### Data after cleanup:

In [477]:
pd.read_csv('./data/FilteredDataSample.csv')

Unnamed: 0,_ID,TimeSt,Country,Province,City,Latitude,Longitude
0,4516516,2017-06-21 00:00:00.143,CA,ON,Waterloo,43.49347,-80.49123
1,4516547,2017-06-21 18:00:00.193,CA,ON,London,42.93990,-81.27090
2,4516550,2017-06-21 15:00:00.287,CA,ON,Guelph,43.57760,-80.22010
3,4516600,2017-06-21 15:00:00.307,CA,ON,Stratford,43.37160,-80.97730
4,4516613,2017-06-21 15:00:00.497,CA,ON,Stratford,43.37160,-80.97730
...,...,...,...,...,...,...,...
17968,5614689,2017-06-21 13:23:01.370,CA,ON,London,42.96110,-81.24310
17969,5614801,2017-06-21 12:23:07.880,CA,ON,Saint Catharines,43.16440,-79.24560
17970,5614909,2017-06-21 00:23:07.903,CA,ON,Whitby,43.88730,-78.94220
17971,5614912,2017-06-21 11:23:07.953,CA,ON,Oakville,43.49340,-79.71260


# 2. Label

In [478]:
def get_poi_data(file: str) -> list[dict]:
    """Returns the list of POIs with their information"""
    raw_data = pd.read_csv(file)
    poi_list = []
    
    for index, row in raw_data.iterrows():
        poi_list.append({'id': row['POIID'], 'lat': row[' Latitude'], 'lon': row['Longitude']})
    return poi_list

def get_closest_poi(request, poi_list: list[dict]) -> dict:
    """Returns the closest POI to <request>"""
    # Temporarily set closest POI and distance to a large value
    closest_poi = poi_list[0]
    closest_distance = 1000000 # No distance will be greater than this
    request_coords = (request['Latitude'], request['Longitude'])

    for poi in poi_list:
            poi_coords = (poi['lat'], poi['lon'])
            distance = haversine(request_coords, poi_coords)
            
            if distance < closest_distance:
                closest_poi = poi
                closest_distance = distance
    return closest_poi
    
    
def label(file: str) -> None:
    """Assign each request from <file> to it's closest POI"""
    poi_list = get_poi_data('./data/POIList.csv')
    
    df = pd.read_csv(file)
    closest_poi_list = []
    
    for index, row in df.iterrows():
        closest_poi_list.append(get_closest_poi(row, poi_list)['id'])

    
    # Add column indicating the id of the closest POI
    df['Closest Poi'] = closest_poi_list
    df.to_csv('./data/FilteredDataSample.csv')
        

In [479]:
label('./data/FilteredDataSample.csv')

### After labeling:

In [480]:
pd.read_csv('./data/FilteredDataSample.csv')

Unnamed: 0.1,Unnamed: 0,_ID,TimeSt,Country,Province,City,Latitude,Longitude,Closest Poi
0,0,4516516,2017-06-21 00:00:00.143,CA,ON,Waterloo,43.49347,-80.49123,POI3
1,1,4516547,2017-06-21 18:00:00.193,CA,ON,London,42.93990,-81.27090,POI3
2,2,4516550,2017-06-21 15:00:00.287,CA,ON,Guelph,43.57760,-80.22010,POI3
3,3,4516600,2017-06-21 15:00:00.307,CA,ON,Stratford,43.37160,-80.97730,POI3
4,4,4516613,2017-06-21 15:00:00.497,CA,ON,Stratford,43.37160,-80.97730,POI3
...,...,...,...,...,...,...,...,...,...
17968,17968,5614689,2017-06-21 13:23:01.370,CA,ON,London,42.96110,-81.24310,POI3
17969,17969,5614801,2017-06-21 12:23:07.880,CA,ON,Saint Catharines,43.16440,-79.24560,POI3
17970,17970,5614909,2017-06-21 00:23:07.903,CA,ON,Whitby,43.88730,-78.94220,POI3
17971,17971,5614912,2017-06-21 11:23:07.953,CA,ON,Oakville,43.49340,-79.71260,POI3
