In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, time
from sklearn.cluster import DBSCAN
from sklearn import metrics
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
%matplotlib inline

In [None]:
### Load the emissions table from CSV into a DataFrame; every later cell filters or looks up rows in `emdata`.
### NOTE(review): this is an absolute, machine-specific path -- the notebook only runs where this file exists.
emdata = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/Missoula Emisions Data RDS-2017-0039/Emissions_Year/emissions_year2003to2015.csv')

In [None]:
# Keep only the 2005 records whose day-of-year lies strictly between 181 and 213.
# (presumably `doy` is a 1-based day of year, which would make 182-212 the month of July 2005 -- TODO confirm)
emyear2005 = emdata.loc[
    emdata['year'].eq(2005) & emdata['doy'].gt(181) & emdata['doy'].lt(213)
]
print('There are {:,} rows'.format(emyear2005.shape[0]))

In [None]:
# Earth-radius constant used to convert a surface distance into radians for the haversine metric.
# NOTE(review): despite the name, the value in use (3956) is the MILES figure, so any distance
# divided by this constant is interpreted in miles, not kilometres -- confirm that is intended.
kms_per_radian = 3956
### 3956 for miles, 6371.0088 for kilometers
print(kms_per_radian)

In [None]:
# Extract the (latitude, longitude) pairs as a 2-D NumPy array, one row per record.
# DataFrame.as_matrix() was deprecated and then removed in pandas 1.0; selecting the
# columns and taking `.values` yields the same array on both old and new pandas.
coords = emyear2005[['latitude', 'longitude']].values

# DBSCAN's haversine metric works in radians, so the neighbourhood radius is expressed as
# (surface distance) / (Earth radius). The distance 1.5 is in whatever unit
# `kms_per_radian` is actually expressed in.
epsilon = 1.5 / kms_per_radian

In [None]:
start_time = time.time()

# The haversine metric expects (lat, lon) in radians; convert once and reuse for scoring.
coords_rad = np.radians(coords)

# min_samples=1 means every point belongs to some cluster (no noise label is produced).
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(coords_rad)
cluster_labels = db.labels_

# get the number of clusters
num_clusters = len(set(cluster_labels))

# all done, print the outcome
elapsed = time.time() - start_time
message = 'Clustered {:,} points down to {:,} clusters, for {:.1f}% compression in {:,.2f} seconds'
print(message.format(len(emyear2005), num_clusters, 100*(1 - float(num_clusters) / len(emyear2005)), elapsed))

# Score with the same geometry used for clustering (haversine on radians). The previous
# call used the default Euclidean metric on raw degrees, which is inconsistent with how
# the clusters were formed.
# silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1, so guard the
# degenerate cases instead of letting the cell raise.
if 1 < num_clusters < len(coords):
    silhouette = metrics.silhouette_score(coords_rad, cluster_labels, metric='haversine')
    print('Silhouette coefficient: {:0.03f}'.format(silhouette))
else:
    print('Silhouette coefficient: undefined for {:,} clusters over {:,} points'.format(num_clusters, len(coords)))


In [None]:
# Build a pandas Series with one entry per cluster label; each entry is the array of
# coordinate rows whose label equals that cluster's number.
clusters = pd.Series(
    [
        coords[cluster_labels == label]
        for label in range(num_clusters)
    ]
)

In [None]:
def get_centermost_point(cluster):
    """Return the member of ``cluster`` that lies closest to the cluster's centroid.

    Parameters
    ----------
    cluster : sequence of coordinate pairs
        The points of one cluster. Presumably (latitude, longitude) rows, matching the
        column order used to build ``coords`` -- verify against the caller.

    Returns
    -------
    tuple
        The coordinates of the actual data point with the smallest great-circle
        distance to the centroid, so the representative is a real observation
        rather than a synthetic average.
    """
    # Build the geometry once; the centroid is reused for both coordinates.
    center = MultiPoint(cluster).centroid
    centroid = (center.x, center.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)


# One representative coordinate tuple per cluster.
centermost_points = clusters.map(get_centermost_point)

In [None]:
# Transpose the centermost points from one tuple per cluster into one tuple per axis.
# zip() is lazy on Python 3, so printing it directly only shows "<zip object ...>";
# materialise it as a list so the values can be inspected and reused.
latlon = list(zip(*centermost_points))

# Preview just the first few values of each axis rather than dumping every cluster.
print([axis[:5] for axis in latlon])

In [None]:
# Split the per-cluster coordinate tuples into two parallel tuples:
# the first element of every pair goes to `latitude`, the second to `longitude`.
(latitude, longitude) = tuple(zip(*centermost_points))

In [None]:
# Spot-check a handful of the unzipped values.
# NOTE(review): the slice [1:5] shows elements 1 through 4 and skips element 0 -- confirm that is intended.
print(latitude[1:5], longitude[1:5])

In [None]:
# Assemble the representative points into a DataFrame: one row per cluster,
# with the cluster's centermost longitude and latitude as columns.
rep_points = pd.DataFrame(dict(longitude=longitude, latitude=latitude))
rep_points.head(5)

In [None]:
# For each representative point, pull the first row of the filtered data set whose
# lat/lon match it, so the result carries every original column and not just coordinates.
#
# A single keyed merge replaces the earlier row-by-row apply(), which rescanned all of
# emyear2005 once per cluster and passed each matched row through a single-dtype Series.
# drop_duplicates keeps the first row per coordinate pair, mirroring the old `.iloc[0]`.
key_columns = ['latitude', 'longitude']
first_row_per_location = emyear2005.drop_duplicates(subset=key_columns)
rs = rep_points.merge(first_row_per_location, on=key_columns, how='left')

# merge() puts rep_points' columns first; restore the source data's column order.
rs = rs[list(emyear2005.columns)]

# NOTE(review): absolute, machine-specific output path -- the export only works where this directory exists.
rs.to_csv('/Users/AlfHaugen/Python/Wildfire_Data/Predicting-Wildfires/gps-dbscan.csv', encoding='utf-8')
rs.tail()