In [None]:
from __future__ import annotations
from haversine import haversine
import pandas as pd
import csv
import math

# 1. Cleanup

### Data before cleanup:

In [None]:
# Display the raw request data as loaded from disk, before any filtering is applied
pd.read_csv('./data/DataSample.csv')

#### Place the data into a dictionary to filter it for duplicates

In [None]:
def filter_data(file: str) -> dict[str, list]:
    """Returns the rows of <file> grouped by timestamp and coordinates, with suspicious groups emptied

    Each key is built from the ' TimeSt', 'Latitude' and 'Longitude' values of a
    row. A key seen exactly once maps to a one-element list holding that row;
    a key seen more than once maps to an empty list.
    """
    rows_by_key = {}

    for _, record in pd.read_csv(file).iterrows():
        key = f'{record[" TimeSt"]}, {record["Latitude"]} {record["Longitude"]}'

        # A repeated key discards every row that shares it, including the first one seen
        rows_by_key[key] = [] if key in rows_by_key else [record]

    return rows_by_key

#### After filtering the data, write it to a new file

In [None]:
def cleanup(file: str, output: str = './data/FilteredDataSample.csv') -> None:
    """Filter the data from <file> and write it to a new file

    The rows returned by filter_data() are written to <output> under a fixed
    header. Keys whose list of rows is empty (the entries filter_data() flagged
    as suspicious) contribute nothing to the output. <output> defaults to
    ./data/FilteredDataSample.csv.
    """
    filtered_data = filter_data(file)

    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate '\n' on write end up with a blank line after every row
    with open(output, 'w', newline='') as f:
        csv_writer = csv.writer(f)
        header = ['_ID', ' TimeSt', 'Country', 'Province', 'City', 'Latitude', 'Longitude']
        csv_writer.writerow(header)

        for rows in filtered_data.values():
            # Empty lists mark discarded entries
            if rows:
                csv_writer.writerows(rows)

In [None]:
# Filter the raw sample; cleanup() writes the surviving rows to ./data/FilteredDataSample.csv
cleanup('./data/DataSample.csv')

### Data after cleanup:

In [None]:
# Display the file produced by the cleanup step
pd.read_csv('./data/FilteredDataSample.csv')

# 2. Label

In [None]:
def get_poi_data(file: str) -> list[dict]:
    """Returns one dictionary per row of <file>, holding the POI's 'id', 'lat' and 'lon'"""
    # NOTE: a later cell defines another function named get_poi_data, so this
    # version is only the active one until that cell has been run.
    # The ' Latitude' column name is looked up with its leading space.
    return [
        {'id': poi['POIID'], 'lat': poi[' Latitude'], 'lon': poi['Longitude']}
        for _, poi in pd.read_csv(file).iterrows()
    ]

def get_closest_poi(request, poi_list: list[dict]) -> tuple[dict, float]:
    """Returns the closest POI and its distance to <request>

    <request> is indexed by 'Latitude' and 'Longitude'; every POI is indexed by
    'lat' and 'lon'. Distances are whatever haversine() returns for the two
    coordinate pairs. When several POIs are equally close, the one that comes
    first in <poi_list> is returned.

    Raises ValueError if <poi_list> is empty.
    """
    if not poi_list:
        raise ValueError('poi_list must contain at least one POI')

    request_coords = (request['Latitude'], request['Longitude'])

    # Start from the first POI with an infinite distance so that any real
    # distance replaces it on the first comparison
    closest_poi = poi_list[0]
    closest_distance = math.inf

    for poi in poi_list:
        distance = haversine(request_coords, (poi['lat'], poi['lon']))

        if distance < closest_distance:
            closest_poi = poi
            closest_distance = distance

    # Return the smallest distance found, not the distance to the last POI examined
    return (closest_poi, closest_distance)
    
    
def label(file: str) -> None:
    """Assign each request from <file> to its closest POI

    Reads the POIs from ./data/POIList.csv and the requests from <file>, adds a
    'ClosestPOI' column (the id of the nearest POI) and a 'POIDistance' column
    (the distance to it), and writes the result to ./data/FilteredDataSample.csv.
    """
    # NOTE: a later cell redefines get_poi_data with a different return value;
    # this call needs the version that returns the list of POI dictionaries.
    poi_list = get_poi_data('./data/POIList.csv')
    df = pd.read_csv(file)

    closest_poi_list = []
    poi_distances = []

    for _, row in df.iterrows():
        closest_poi, distance = get_closest_poi(row, poi_list)
        closest_poi_list.append(closest_poi['id'])
        poi_distances.append(distance)

    df['ClosestPOI'] = closest_poi_list
    df['POIDistance'] = poi_distances

    # index=False keeps the row index out of the file; otherwise every run adds
    # another unnamed index column to the data that is read back later
    df.to_csv('./data/FilteredDataSample.csv', index=False)
        

In [None]:
# Label the filtered requests; label() writes its result back to this same path
label('./data/FilteredDataSample.csv')

### Data after labeling:

In [None]:
# Display the file again now that the labeling step has rewritten it
pd.read_csv('./data/FilteredDataSample.csv')

# 3. Analysis

In [None]:
def get_poi_data(file: str) -> dict[str, list[float]]:
    """
    Returns a dictionary mapping each 'ClosestPOI' value found in <file>
    to the list of 'POIDistance' values of the rows assigned to it
    """
    # NOTE: this reuses the name of the POI-list loader defined in an earlier
    # cell and replaces it once this cell has been run.
    distances_by_poi = {}

    for _, request in pd.read_csv(file).iterrows():
        poi_id = request['ClosestPOI']

        if poi_id not in distances_by_poi:
            distances_by_poi[poi_id] = []
        distances_by_poi[poi_id].append(request['POIDistance'])

    return distances_by_poi

def get_average(data: list[float]) -> float:
    """Returns the arithmetic mean of the values in <data>"""
    total = sum(data)
    count = len(data)
    return total / count

def get_std_dev(data: list[float]) -> float:
    """Returns the standard deviation of <data>, dividing by the number of values"""
    mean = get_average(data)

    # Sum of squared differences from the mean, divided by the number of values
    squared_error_total = sum([(value - mean) ** 2 for value in data])
    return math.sqrt(squared_error_total / len(data))

### Results

In [None]:
# Group the request distances by closest POI, then print a summary for each POI
poi_data = get_poi_data('./data/FilteredDataSample.csv')

for poi, distances in poi_data.items():
    average, std_dev = get_average(distances), get_std_dev(distances)

    print(f'''
    POI: {poi}
    Average Distance: {format(average, ".2f")}
    Standard Deviation: {format(std_dev, ".2f")}
    ''')