In [1]:
# import libraries
import sys
import warnings
from copy import deepcopy
from datetime import datetime, timedelta

import geopy
import gmplot
import numpy as np
import pandas as pd
from geopy import distance

sys.path.append("../HistoricalData/")
from getData import get_data

# Suppress every warning for the rest of the session: no category, message or module
# filter is given, so this also hides deprecation notices from the imported libraries.
warnings.filterwarnings('ignore')

In [2]:
# Ask get_data for the cleaned readings inside a bounding box, restricted to a date
# range and to a fixed window of hours within each day.

# edges of the bounding box, decimal degrees
NORTH_LAT, SOUTH_LAT = 38.008050, 37.701933
WEST_LON, EAST_LON = -122.536985, -122.186437

# the four corners, each as (lat, lon)
UP_LEFT = (NORTH_LAT, WEST_LON)
UP_RIGHT = (NORTH_LAT, EAST_LON)
DOWN_RIGHT = (SOUTH_LAT, EAST_LON)
DOWN_LEFT = (SOUTH_LAT, WEST_LON)

# date range covered by the query
START_DATE = '2019/09/01'  # first date to take data from
END_DATE = '2019/09/10'    # last date to take data from

# hours of EACH DAY to include, which controls for time-of-day effects
START_HOUR = '0'
END_HOUR = '1'

color = "red"

data_df = get_data(
    UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT,
    START_DATE, END_DATE,
    START_HOUR, END_HOUR,
)

In [3]:
# List every column of the frame returned by get_data to see which fields are available.
data_df.columns

Index(['0_3um', '0_5um', '10_0um', '1_0um', '2_5um', '5_0um', 'a_h',
       'agency_name', 'aqi', 'call_sign', 'call_sign2', 'call_sign3',
       'category', 'city', 'county', 'created', 'created_at', 'day',
       'device_loc_typ', 'epa_pm25_unit', 'epa_pm25_value', 'full_aqs_code',
       'gust_speed', 'gusts', 'hidden', 'high_reading_flag', 'hour',
       'humidity', 'interval', 'intl_aqs_code', 'is_owner', 'lat', 'lon',
       'minute', 'month', 'parent_id', 'pm10_0', 'pm10_0_atm', 'pm1_0',
       'pm1_0_atm', 'pm2_5_atm', 'pm2_5_cf_1', 'raw_concentration',
       'report_modifier', 'rssi', 'sensor_id', 'sensor_name', 'site_name',
       'sys_maint_reqd', 'temperature', 'thingspeak_primary_id',
       'thingspeak_primary_id_read_key', 'thingspeak_secondary_id',
       'thingspeak_secondary_id_read_key', 'uptime', 'variable_wind_info',
       'variable_winds', 'wban_number', 'wind_data', 'wind_direction',
       'wind_speed', 'year', 'zipcode', 'zulu_time'],
      dtype='object')

In [8]:
# Per-sensor count of non-null values in each column (count() skips missing entries),
# which shows how complete each sensor's data is.
data_df.groupby(data_df.sensor_id).count()

Unnamed: 0_level_0,0_3um,0_5um,10_0um,1_0um,2_5um,5_0um,a_h,agency_name,aqi,call_sign,...,uptime,variable_wind_info,variable_winds,wban_number,wind_data,wind_direction,wind_speed,year,zipcode,zulu_time
sensor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1004,600,600,600,600,600,600,0,600,600,492,...,600,492,492,492,492,492,492,600,600,492
1005,600,600,600,600,600,600,0,600,600,492,...,600,492,492,492,492,492,492,600,600,492
10066,212,212,212,212,212,212,0,430,430,324,...,212,324,324,324,324,324,324,430,430,324
10067,201,201,201,201,201,201,0,424,424,318,...,207,318,318,318,318,318,318,424,424,318
1014,600,600,600,600,600,600,0,600,600,492,...,600,492,492,492,492,492,492,600,600,492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6925,600,600,600,600,600,600,0,540,540,491,...,600,491,491,491,491,491,477,600,600,491
7198,494,494,494,494,494,494,0,582,582,476,...,494,476,476,476,476,476,476,582,582,476
7199,494,494,494,494,494,494,0,582,582,476,...,494,476,476,476,476,476,476,582,582,476
8468,600,600,600,600,600,600,0,600,600,492,...,600,492,492,492,492,492,492,600,600,492


In [None]:
# Per-timestamp count of non-null values in each column (count() skips missing entries),
# which shows how many rows exist for each value of `created`.
data_df.groupby(data_df.created).count()

In [8]:
class Chain:
    """
    A chain of sensors in time and space, built by following one air parcel downwind.

    The parcel starts at a sensor and is moved, one time step at a time, with the wind
    recorded at the most recently visited sensor. The chain keeps these lists:

    * ``time``          - the start time plus one timestamp per ``plot_next`` call
    * ``particle_path`` - (lat, lon) positions of the parcel; entry 0 is the start sensor
    * ``sensor_path``   - for each ``particle_path`` entry, the id of the closest sensor
                          within ``CLOSE_ENOUGH_KM``, or None when there is none
    * ``readings``      - the '2_5um' value for each non-None ``sensor_path`` entry
                          (the last sensor may lack one if its data was missing)

    The class reads the notebook-level ``data_df`` frame and the bounding-box constants
    ``UP_LEFT``, ``DOWN_LEFT`` and ``DOWN_RIGHT``.
    """

    STEP_MINUTES = 10            # joined data seems to be in increments of 10 minutes
    KM_PER_NAUTICAL_MILE = 1.852 # 1 knot = 1.852 km/h
    CLOSE_ENOUGH_KM = 1          # a sensor must be within 1000 meters of the parcel

    def __init__(self, sensor_id, start_time):
        """
        Start a chain at ``sensor_id`` at ``start_time``.

        ``start_time`` must use the same encoding as ``data_df.created``.
        Raises IndexError when ``data_df`` has no row for the sensor, or no row for the
        sensor at ``start_time``.
        """
        sensor_rows = data_df[data_df.sensor_id == sensor_id]
        self.start_time = start_time
        self.time = [self.start_time]
        self.sensor_path = [sensor_id]
        self.last_sensor = sensor_id
        self.particle_path = [(sensor_rows.lat.iloc[0], sensor_rows.lon.iloc[0])]
        self.readings = [sensor_rows[sensor_rows.created == start_time]['2_5um'].iloc[0]]

    @staticmethod
    def _advance_time(timestamp, minutes):
        """
        Return ``timestamp`` moved forward by ``minutes``.

        Twelve-digit values such as 201909030000 are treated as YYYYMMDDHHMM so that
        minutes roll over into hours and days (...0050 + 10 gives ...0100, not the
        non-existent ...0060). Anything that does not parse in that layout falls back
        to plain addition.
        NOTE(review): the YYYYMMDDHHMM layout is inferred from the start_time literal
        used in this notebook - confirm against get_data.
        """
        try:
            text = str(int(timestamp))
            if len(text) != 12:
                raise ValueError(text)
            moment = datetime.strptime(text, "%Y%m%d%H%M")
        except (TypeError, ValueError):
            return timestamp + minutes
        return int((moment + timedelta(minutes=minutes)).strftime("%Y%m%d%H%M"))

    @staticmethod
    def _sensor_locations():
        """Return (sensor_id, lat, lon) for every sensor, using each sensor's first row in data_df."""
        first_rows = data_df.drop_duplicates(subset="sensor_id")
        return [
            (sensor, float(lat), float(lon))
            for sensor, lat, lon in zip(first_rows.sensor_id, first_rows.lat, first_rows.lon)
        ]

    def plot_next(self):
        """
        Advance the parcel by one time step.

        Returns True when the chain can continue and False when it must end: no wind
        data for the last sensor at the current time, missing or variable wind, the
        parcel leaving the bounding box, or a reached sensor having no reading at the
        new time. When the wind speed is zero the parcel stays put: only ``time`` is
        extended and True is returned.
        """
        # update time
        origin_time = self.time[-1]
        destination_time = self._advance_time(origin_time, self.STEP_MINUTES)
        self.time.append(destination_time)

        # wind is taken from the last sensor seen, which might not be the nearest wind data
        wind_rows = data_df[(data_df.sensor_id == self.last_sensor) & (data_df.created == origin_time)]
        if len(wind_rows) == 0:  # handle empty data
            return False

        # wind speed is taken to be in knots
        knots = wind_rows.wind_speed.iloc[0]
        if pd.isna(knots):  # a missing speed would otherwise produce a NaN position
            return False
        km_hour = self.KM_PER_NAUTICAL_MILE * knots
        d = (self.STEP_MINUTES / 60) * km_hour

        # convert wind_direction to a bearing
        # presumably wind_direction is where the wind comes FROM, hence the 180 degree flip
        try:
            b = (int(wind_rows.wind_direction.iloc[0]) + 180) % 360
        except (TypeError, ValueError):  # 'VRB' = variable winds, or a missing value
            return False

        if d == 0:
            # calm: the parcel does not move, so no new link is recorded
            return True

        # given: origin, b = bearing in degrees, d = distance in kilometers
        origin = geopy.Point(self.particle_path[-1][0], self.particle_path[-1][1])
        landing = geopy.distance.distance(kilometers=d).destination(origin, b)
        destination = (landing.latitude, landing.longitude)

        # end chain if outside bounding box
        if (destination[0] > UP_LEFT[0]) or (destination[0] < DOWN_LEFT[0]) or (destination[1] < UP_LEFT[1]) or (destination[1] > DOWN_RIGHT[1]):
            return False

        self.particle_path.append(destination)

        # check if there's a sensor nearby; keep the closest one inside the radius
        closest_sensor_id = None
        closest_sensor_dist = self.CLOSE_ENOUGH_KM
        for sensor, lat, lon in self._sensor_locations():
            proximity = geopy.distance.distance((lat, lon), destination).km
            if proximity < closest_sensor_dist:
                closest_sensor_id = sensor
                closest_sensor_dist = proximity

        # update other lists
        self.sensor_path.append(closest_sensor_id)
        if closest_sensor_id is not None:
            self.last_sensor = closest_sensor_id
            hits = data_df[(data_df.sensor_id == closest_sensor_id) & (data_df.created == destination_time)]['2_5um']
            if len(hits) == 0:  # handle empty reading
                return False
            self.readings.append(hits.iloc[0])
        return True

    def plot_pollution(self):
        """
        Return ``(lats, lons, colors)`` for drawing the chain.

        ``colors[i]`` is the colour of the link from point ``i-1`` to point ``i``;
        ``colors[0]`` is an unused 'white' placeholder, so all three lists have the same
        length. Every link of a run that ends at a sensor gets that sensor's colour:

        * 'red'   - the reading is higher than at the previous sensor (possible pollution source)
        * 'blue'  - the reading is not higher
        * 'black' - the sensor has no reading

        Links at the end of the chain that never reach another sensor are 'black'.
        """
        lats = [point[0] for point in self.particle_path]
        lons = [point[1] for point in self.particle_path]
        colors = ['white']  # first color not used

        reading = 0  # index of the reading that belongs to the most recent sensor
        pending = 0  # links walked since the last sensor, still waiting for a colour
        for link in range(1, len(self.particle_path)):
            pending += 1
            if self.sensor_path[link] is None:
                continue
            reading += 1
            if reading > (len(self.readings) - 1):  # missing reading
                shade = 'black'
            elif self.readings[reading] > self.readings[reading - 1]:  # possible pollution source
                shade = 'red'
            else:  # no pollution source
                shade = 'blue'
            colors.extend([shade] * pending)
            pending = 0

        # links at the end that never feed into another sensor
        colors.extend(['black'] * pending)
        return lats, lons, colors

In [None]:
chains = [] # a list of chain

set_size_chain = 10             # maximum number of time steps per chain
start_time = 201909030000       # must match the encoding of data_df.created

# build the chains
for sensor in data_df.sensor_id.unique(): # iterate through all sensors in bounding box
    chain = Chain(sensor, start_time) # init one chain per sensor
    time_step = 0
    # Test the step limit first and use short-circuit `and`: with `&` plot_next() ran
    # before the limit was checked, so every chain took one step more than set_size_chain.
    # plot_next() returns False if out of bounds/data.
    while time_step < set_size_chain and chain.plot_next():
        time_step += 1
    chains.append(chain) # add the chain object to the list of chains
    print("Chain", len(chains), "is", chain.sensor_path)
    print("Chain", len(chains), "readings are", chain.readings)
print("Done.")

In [12]:
# plot it
# one marker per sensor, placed at the first coordinates recorded for it
latitude_list = []
longitude_list = []
for sensor in data_df.sensor_id.unique():
    sensor_rows = data_df[data_df.sensor_id == sensor]
    try:
        lat = float(sensor_rows.lat.iloc[0])
        lon = float(sensor_rows.lon.iloc[0])
    except (IndexError, TypeError, ValueError):
        # skip sensors without usable coordinates; converting both values before
        # appending keeps the two lists the same length
        continue
    latitude_list.append(lat)
    longitude_list.append(lon)

# centre the map on the middle of the sensors' extent
gmap3 = gmplot.GoogleMapPlotter((max(latitude_list)+min(latitude_list))/2,
                                (max(longitude_list)+min(longitude_list))/2,
                                zoom=11)
# '#FF0000' is the hex colour for red; the earlier '# FF0000' had a stray space
gmap3.scatter(latitude_list, longitude_list, '#FF0000',
                              size = 40, marker = False )

debug = 0 # number of chains drawn
for chain in chains:
    debug +=1
    lats, lons, colors = chain.plot_pollution()
    # draw each link in its own colour; stop at the first link that has no colour
    for index in range(1, min(len(lats), len(colors))):
        gmap3.plot([lats[index-1], lats[index]], [lons[index-1], lons[index]], colors[index], edge_width = 2.5)

# draw the map and send to file
gmap3.draw("../data/chain_map.html")
print("Done")

Done


In [11]:
# Show how many chains the plotting loop processed (debug is incremented once per chain there).
debug

25