In [1]:
import pandas as pd
import requests
import numpy as np
import re
import os

# Step 1 -- Get Input Data
> Option 1: API Data Downloading Procedure
> Option 2: CSV Upload Procedure

## Option 1: API Downloading Data Procedure

Downloading data follows these steps:
- Request the non GEOJSON data from the GLOBE API
- Get the results from the JSON and pass it into a pandas dataframe
- Unpacks the "data" entry
- Merge the data onto the original dataframe
- Remove the "data" entry from the original dataframe

In [8]:
start_date = "2017-05-29"
end_date = "2020-05-31"
url = f"https://api.globe.gov/search/v1/measurement/protocol/measureddate/?protocols=mosquito_habitat_mapper&startdate={start_date}&enddate={end_date}&geojson=FALSE&sample=FALSE"

# Downloads data from the GLOBE API and saves into a pandas df
response = requests.get(url)
results = response.json()["results"]

df = pd.DataFrame(results)

#Expand the 'data' column by listing the contents and passing as a new dataframe
df = pd.concat([df, pd.DataFrame(list(df['data']))], axis=1)

#Drop the previously nested data column
df = df.drop('data', 1)

# Displays the dataframe
df

Unnamed: 0,protocol,measuredDate,createDate,updateDate,publishDate,organizationId,organizationName,siteId,siteName,countryName,...,mosquitohabitatmapperComments,mosquitohabitatmapperMosquitoPupae,mosquitohabitatmapperWaterSourcePhotoUrls,mosquitohabitatmapperDataSource,mosquitohabitatmapperLarvaFullBodyPhotoUrls,mosquitohabitatmapperMeasurementLatitude,mosquitohabitatmapperLastIdentifyStage,mosquitohabitatmapperWaterSourceType,mosquitohabitatmapperMosquitoHabitatMapperId,mosquitohabitatmapperMeasurementLongitude
0,mosquito_habitat_mapper,2018-11-25,2020-01-25T18:09:52,2020-01-25T18:09:52,2020-02-14T20:29:11,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2018/11/2...,GLOBE Observer App,,39.2538,,container: artificial,5188,-77.1959
1,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2535,identify,container: artificial,10365,-77.1960
2,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2536,identify,container: artificial,10360,-77.1956
3,mosquito_habitat_mapper,2019-05-29,2020-01-25T18:29:36,2020-01-25T18:29:36,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/05/2...,GLOBE Observer App,https://data.globe.gov/system/photos/2019/05/2...,39.2542,identify-siphon-pecten,container: artificial,12424,-77.1962
4,mosquito_habitat_mapper,2019-08-04,2020-01-25T18:45:20,2020-01-25T18:45:20,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/08/0...,GLOBE Observer App,,39.2536,identify,container: artificial,14822,-77.1957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22452,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,200580,28PCB553673,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.0778,identify-siphon-shape,container: artificial,22772,-16.3463
22453,mosquito_habitat_mapper,2020-05-29,2020-06-03T07:30:03,2020-06-03T16:15:20,2020-11-23T21:32:54,19841715.0,Madagascar Citizen Science,201123,38KQE645064,,...,,False,https://data.globe.gov/system/photos/2020/05/2...,GLOBE Observer App,,-18.9168,identify,container: artificial,22806,47.5121
22454,mosquito_habitat_mapper,2020-05-06,2020-07-14T08:40:05,2020-07-14T13:40:07,2020-11-23T21:32:54,18306968.0,Taiwan Partnership Citizen Science,208771,51RUH399684,,...,,True,,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/0...,25.0235,identify-basal-tuft,container: artificial,24654,121.4134
22455,mosquito_habitat_mapper,2020-05-31,2020-07-18T23:35:02,2020-07-18T23:35:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,209660,28PDB055429,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,,14.8591,identify-aedes-tuft,container: artificial,24880,-15.8784


## Option 2: CSV Downloading

This cell allows for the option of using mosquito mapper data from CSV Files.

This enables an analysis of different subsets of the total mosquito mapper data (this will overwrite data downloaded from the API).


In [20]:
filename = "Africa.csv"
df = pd.read_csv(filename)
df

Unnamed: 0.1,Unnamed: 0,protocol,measuredDate,createDate,updateDate,publishDate,organizationId,organizationName,siteId,siteName,...,mosquitohabitatmapperComments,mosquitohabitatmapperMosquitoPupae,mosquitohabitatmapperWaterSourcePhotoUrls,mosquitohabitatmapperDataSource,mosquitohabitatmapperLarvaFullBodyPhotoUrls,mosquitohabitatmapperMeasurementLatitude,mosquitohabitatmapperLastIdentifyStage,mosquitohabitatmapperWaterSourceType,mosquitohabitatmapperMosquitoHabitatMapperId,mosquitohabitatmapperMeasurementLongitude
0,594,mosquito_habitat_mapper,2017-07-05,2020-01-25T17:36:06,2020-01-25T17:36:06,2020-02-14T20:29:11,23351884.0,Twene Amanfo Senior High/Technical School,53048,30NWP747107,...,,,https://data.globe.gov/system/photos/2017/07/0...,GLOBE Observer App,,7.333747,,still: lake/pond/swamp,137,-2.323205
1,635,mosquito_habitat_mapper,2017-07-06,2020-01-25T17:36:06,2020-01-25T17:36:06,2020-02-14T20:29:11,24610401.0,Terre Rouge SSS,53208,40KEC556754,...,,True,,GLOBE Observer App,https://data.globe.gov/system/photos/2017/07/0...,-20.117747,identify-siphon-shape,still: lake/pond/swamp,100,57.531913
2,640,mosquito_habitat_mapper,2017-07-07,2020-01-25T17:36:06,2020-01-25T17:36:06,2020-02-14T20:29:11,24610401.0,Terre Rouge SSS,53238,40KEC557756,...,,True,,GLOBE Observer App,https://data.globe.gov/system/photos/2017/07/0...,-20.115937,identify-no-siphon,container: artificial,104,57.532863
3,2618,mosquito_habitat_mapper,2017-08-01,2020-01-25T17:36:55,2020-01-25T17:36:55,2020-02-14T20:29:11,236305.0,Kabojja Secondary School,99016,18TXL725776,...,,True,https://data.globe.gov/system/photos/2017/08/0...,GLOBE Observer App,,41.331432,identify,still: lake/pond/swamp,935,-72.938605
4,3410,mosquito_habitat_mapper,2018-09-08,2020-01-25T18:05:43,2020-01-25T18:05:43,2020-02-14T20:29:11,127300.0,Indwe Secondary School,106855,34HFH012220,...,,,https://data.globe.gov/system/photos/2018/09/0...,GLOBE Observer App,,-34.137100,,container: artificial,4401,22.098000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4537,22449,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,200580,28PCB553673,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.077700,identify-siphon-shape,container: artificial,22776,-16.346300
4538,22450,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,200580,28PCB553673,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.077700,identify-siphon-pecten,container: artificial,22779,-16.346300
4539,22451,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,200580,28PCB553673,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.077800,identify-siphon-shape,container: artificial,22768,-16.346300
4540,22452,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,200580,28PCB553673,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.077800,identify-siphon-shape,container: artificial,22772,-16.346300


# Step 2 -- GeoLocational Data Cleaning Procedure:
> Removes bad geo-locational data. Examples of bad location data include:
 * Site Latitude and Longitude match GPS Latitude and Longitude (These values should differ)
 * Integer GPS Measurements (GPS Measurements should be very precise decimals)

In [21]:
# Finds all bad locational data
def geolocational_filter(gps_lat, gps_lon, recorded_lat, recorded_lon):
    return ((recorded_lat == gps_lat and 
        recorded_lon == gps_lon) or
         gps_lat == int(gps_lat) or
         gps_lon == int(gps_lon)
        )

vectorized_filter = np.vectorize(geolocational_filter)
bad_data = vectorized_filter(df["mosquitohabitatmapperMeasurementLatitude"].to_numpy(),
                             df["mosquitohabitatmapperMeasurementLongitude"].to_numpy(),
                             df["latitude"].to_numpy(),
                             df["longitude"].to_numpy()
                            )

# Removes all bad locational data from the working dataframe
df = df[~bad_data]
df = df.reset_index().drop(["index"], axis = 1)

# Writes filtered data to a csv
df.to_csv("Geolocation.csv")

df

Unnamed: 0.1,Unnamed: 0,protocol,measuredDate,createDate,updateDate,publishDate,organizationId,organizationName,siteId,siteName,...,mosquitohabitatmapperComments,mosquitohabitatmapperMosquitoPupae,mosquitohabitatmapperWaterSourcePhotoUrls,mosquitohabitatmapperDataSource,mosquitohabitatmapperLarvaFullBodyPhotoUrls,mosquitohabitatmapperMeasurementLatitude,mosquitohabitatmapperLastIdentifyStage,mosquitohabitatmapperWaterSourceType,mosquitohabitatmapperMosquitoHabitatMapperId,mosquitohabitatmapperMeasurementLongitude
0,3410,mosquito_habitat_mapper,2018-09-08,2020-01-25T18:05:43,2020-01-25T18:05:43,2020-02-14T20:29:11,127300.0,Indwe Secondary School,106855,34HFH012220,...,,,https://data.globe.gov/system/photos/2018/09/0...,GLOBE Observer App,,-34.137100,,container: artificial,4401,22.098000
1,4927,mosquito_habitat_mapper,2018-06-30,2020-01-25T17:58:58,2020-01-25T17:58:58,2020-02-14T20:29:11,10532743.0,Africa GLOBE v-School,132251,36NVF470218,...,,True,https://data.globe.gov/system/photos/2018/06/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2018/06/3...,0.197667,identify-no-siphon,container: artificial,3096,32.523838
2,4928,mosquito_habitat_mapper,2018-06-30,2020-01-25T17:58:58,2020-01-25T17:58:58,2020-02-14T20:29:11,10532743.0,Africa GLOBE v-School,132256,36NVF477242,...,,True,https://data.globe.gov/system/photos/2018/06/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2018/06/3...,0.219000,identify-saddle-comb,container: artificial,3097,32.530600
3,5058,mosquito_habitat_mapper,2018-07-03,2020-01-25T18:00:55,2020-01-25T18:00:55,2020-02-14T20:29:11,10532743.0,Africa GLOBE v-School,132455,36NVF504347,...,we are going to destroy the habitat by using t...,False,https://data.globe.gov/system/photos/2018/07/0...,GLOBE Observer App,https://data.globe.gov/system/photos/2018/07/0...,0.314200,identify-no-basal-tuft,container: artificial,3186,32.555000
4,5063,mosquito_habitat_mapper,2018-07-03,2020-01-25T18:00:55,2020-01-25T18:00:55,2020-02-14T20:29:11,10532743.0,Africa GLOBE v-School,132496,36NVF511349,...,,False,https://data.globe.gov/system/photos/2018/07/0...,GLOBE Observer App,https://data.globe.gov/system/photos/2018/07/0...,0.316600,identify-saddle-complete,container: artificial,3191,32.561000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4481,22449,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,200580,28PCB553673,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.077700,identify-siphon-shape,container: artificial,22776,-16.346300
4482,22450,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,200580,28PCB553673,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.077700,identify-siphon-pecten,container: artificial,22779,-16.346300
4483,22451,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,200580,28PCB553673,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.077800,identify-siphon-shape,container: artificial,22768,-16.346300
4484,22452,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,200580,28PCB553673,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.077800,identify-siphon-shape,container: artificial,22772,-16.346300


# Step 3 -- Suspected Training Event Removal

We identify training groups (with similar observational data) and remove this training group data from the working dataframe.

* We group entries with the same measured date, site latitude, site longitude, siteName, and water source.
* We isolate groups with a size greater than our threshold (10 members/similar entries). We remove these groups and write their data into a separate CSV.

## Threshold Variable

The threshold variable is the minimum number of duplicate entries (exclusive) that are needed for the the group of entries to be classified as an event.

Set the threshold value and then run the following cells

In [9]:
threshold = 10

In [10]:
# Groups / filters suspected events
suspect_df = df.groupby(by=['measuredDate','latitude','mosquitohabitatmapperWaterSource','siteName','longitude']).filter(lambda x: len(x) > threshold)

# Writes the entries to a CSV
suspect_df.to_csv("Possible Events.csv")       

# Displays the suspected entries
suspect_df

Unnamed: 0,protocol,measuredDate,createDate,updateDate,publishDate,organizationId,organizationName,siteId,siteName,countryName,...,mosquitohabitatmapperComments,mosquitohabitatmapperMosquitoPupae,mosquitohabitatmapperWaterSourcePhotoUrls,mosquitohabitatmapperDataSource,mosquitohabitatmapperLarvaFullBodyPhotoUrls,mosquitohabitatmapperMeasurementLatitude,mosquitohabitatmapperLastIdentifyStage,mosquitohabitatmapperWaterSourceType,mosquitohabitatmapperMosquitoHabitatMapperId,mosquitohabitatmapperMeasurementLongitude
229,mosquito_habitat_mapper,2017-08-30,2020-01-25T17:36:55,2020-01-25T17:36:55,2020-02-14T20:29:11,191867.0,Walailak University,49206,47PNK987556,Thailand,...,,,https://data.globe.gov/system/photos/2017/08/3...,GLOBE Observer App,,8.643907,,still: lake/pond/swamp,485,99.89708
1355,mosquito_habitat_mapper,2017-08-30,2020-01-25T17:36:55,2020-01-25T17:36:55,2020-02-14T20:29:11,17615655.0,Thailand Citizen Science,97675,47PNK987556,,...,,,https://data.globe.gov/system/photos/2017/08/3...,GLOBE Observer App,,8.643907,,still: lake/pond/swamp,463,99.89708
1356,mosquito_habitat_mapper,2017-08-30,2020-01-25T17:36:55,2020-01-25T17:36:55,2020-02-14T20:29:11,17615655.0,Thailand Citizen Science,97675,47PNK987556,,...,,,https://data.globe.gov/system/photos/2017/08/3...,GLOBE Observer App,,8.643907,,still: lake/pond/swamp,505,99.89708
1357,mosquito_habitat_mapper,2017-08-30,2020-01-25T17:36:55,2020-01-25T17:36:55,2020-02-14T20:29:11,17615655.0,Thailand Citizen Science,97675,47PNK987556,,...,,,https://data.globe.gov/system/photos/2017/08/3...,GLOBE Observer App,,8.643907,,still: lake/pond/swamp,508,99.89708
1359,mosquito_habitat_mapper,2017-08-30,2020-01-25T17:36:55,2020-01-25T17:36:55,2020-02-14T20:29:11,17615655.0,Thailand Citizen Science,97675,47PNK987556,,...,,,https://data.globe.gov/system/photos/2017/08/3...,GLOBE Observer App,,8.643907,,still: lake/pond/swamp,451,99.89708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22329,mosquito_habitat_mapper,2020-01-20,2020-04-20T07:30:02,2020-04-20T07:31:25,2020-11-23T21:32:54,17615655.0,Thailand Citizen Science,196380,47QMB636059,,...,,False,,GLOBE Observer App,,19.046100,identify,container: artificial,22344,98.65500
22330,mosquito_habitat_mapper,2020-01-20,2020-04-20T07:30:02,2020-04-20T07:31:25,2020-11-23T21:32:54,17615655.0,Thailand Citizen Science,196380,47QMB636059,,...,,False,,GLOBE Observer App,,19.046100,identify,container: artificial,22356,98.65490
22331,mosquito_habitat_mapper,2020-01-20,2020-04-20T07:30:02,2020-04-20T07:31:25,2020-11-23T21:32:54,17615655.0,Thailand Citizen Science,196380,47QMB636059,,...,,False,,GLOBE Observer App,,19.046100,identify,container: artificial,22321,98.65490
22334,mosquito_habitat_mapper,2020-01-20,2020-04-20T07:30:02,2020-04-20T07:31:25,2020-11-23T21:32:54,17615655.0,Thailand Citizen Science,196380,47QMB636059,,...,,False,,GLOBE Observer App,,19.046100,identify,container: artificial,22335,98.65490


In [11]:
# Removes suspected group data from the working dataframe
suspect_mask = df.isin(suspect_df)

df = df[~suspect_mask].dropna(how = "all")

# Saves new cleaned data to a separate CSV
df.to_csv("Removed_Groups.csv")
df

Unnamed: 0,protocol,measuredDate,createDate,updateDate,publishDate,organizationId,organizationName,siteId,siteName,countryName,...,mosquitohabitatmapperComments,mosquitohabitatmapperMosquitoPupae,mosquitohabitatmapperWaterSourcePhotoUrls,mosquitohabitatmapperDataSource,mosquitohabitatmapperLarvaFullBodyPhotoUrls,mosquitohabitatmapperMeasurementLatitude,mosquitohabitatmapperLastIdentifyStage,mosquitohabitatmapperWaterSourceType,mosquitohabitatmapperMosquitoHabitatMapperId,mosquitohabitatmapperMeasurementLongitude
0,mosquito_habitat_mapper,2018-11-25,2020-01-25T18:09:52,2020-01-25T18:09:52,2020-02-14T20:29:11,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2018/11/2...,GLOBE Observer App,,39.2538,,container: artificial,5188.0,-77.1959
1,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2535,identify,container: artificial,10365.0,-77.1960
2,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2536,identify,container: artificial,10360.0,-77.1956
3,mosquito_habitat_mapper,2019-05-29,2020-01-25T18:29:36,2020-01-25T18:29:36,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/05/2...,GLOBE Observer App,https://data.globe.gov/system/photos/2019/05/2...,39.2542,identify-siphon-pecten,container: artificial,12424.0,-77.1962
4,mosquito_habitat_mapper,2019-08-04,2020-01-25T18:45:20,2020-01-25T18:45:20,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/08/0...,GLOBE Observer App,,39.2536,identify,container: artificial,14822.0,-77.1957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22452,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,200580.0,28PCB553673,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.0778,identify-siphon-shape,container: artificial,22772.0,-16.3463
22453,mosquito_habitat_mapper,2020-05-29,2020-06-03T07:30:03,2020-06-03T16:15:20,2020-11-23T21:32:54,19841715.0,Madagascar Citizen Science,201123.0,38KQE645064,,...,,False,https://data.globe.gov/system/photos/2020/05/2...,GLOBE Observer App,,-18.9168,identify,container: artificial,22806.0,47.5121
22454,mosquito_habitat_mapper,2020-05-06,2020-07-14T08:40:05,2020-07-14T13:40:07,2020-11-23T21:32:54,18306968.0,Taiwan Partnership Citizen Science,208771.0,51RUH399684,,...,,True,,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/0...,25.0235,identify-basal-tuft,container: artificial,24654.0,121.4134
22455,mosquito_habitat_mapper,2020-05-31,2020-07-18T23:35:02,2020-07-18T23:35:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,209660.0,28PDB055429,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,,14.8591,identify-aedes-tuft,container: artificial,24880.0,-15.8784


# Step 4 -- Anomaly Removal

The following two cells are designed to remove anomalous (overreporting) mosquito larvae from the dataset (e.g., reporting 1,000 mosquito larvae), and resolve any sets of reported ranges (e.g., 1-25) by choosing the upper bound.

## Anomaly Threshold Variable
This is the threshold for the number of reported mosquito larvae that marks an entry as an anomaly. It will remove any entries that have mosquito larvae count observed higher than this number.

In [12]:
anomaly_threshold = 1000

In [13]:
def to_int(x):
    try:
        return int(x)
    except ValueError:
        try:
            return int(re.sub(r".*-", "", x))
        except ValueError:
            return 0

vectorized_int = np.vectorize(to_int)
df["mosquitohabitatmapperLarvaeCount"] = vectorized_int(df["mosquitohabitatmapperLarvaeCount"].fillna(0).values)
df = df[df["mosquitohabitatmapperLarvaeCount"] < anomaly_threshold]
df.to_csv("Fixed_Larvae_Count.csv")

Unnamed: 0,protocol,measuredDate,createDate,updateDate,publishDate,organizationId,organizationName,siteId,siteName,countryName,...,mosquitohabitatmapperComments,mosquitohabitatmapperMosquitoPupae,mosquitohabitatmapperWaterSourcePhotoUrls,mosquitohabitatmapperDataSource,mosquitohabitatmapperLarvaFullBodyPhotoUrls,mosquitohabitatmapperMeasurementLatitude,mosquitohabitatmapperLastIdentifyStage,mosquitohabitatmapperWaterSourceType,mosquitohabitatmapperMosquitoHabitatMapperId,mosquitohabitatmapperMeasurementLongitude
0,mosquito_habitat_mapper,2018-11-25,2020-01-25T18:09:52,2020-01-25T18:09:52,2020-02-14T20:29:11,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2018/11/2...,GLOBE Observer App,,39.2538,,container: artificial,5188.0,-77.1959
1,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2535,identify,container: artificial,10365.0,-77.1960
2,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2536,identify,container: artificial,10360.0,-77.1956
3,mosquito_habitat_mapper,2019-05-29,2020-01-25T18:29:36,2020-01-25T18:29:36,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/05/2...,GLOBE Observer App,https://data.globe.gov/system/photos/2019/05/2...,39.2542,identify-siphon-pecten,container: artificial,12424.0,-77.1962
4,mosquito_habitat_mapper,2019-08-04,2020-01-25T18:45:20,2020-01-25T18:45:20,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/08/0...,GLOBE Observer App,,39.2536,identify,container: artificial,14822.0,-77.1957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22452,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,200580.0,28PCB553673,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.0778,identify-siphon-shape,container: artificial,22772.0,-16.3463
22453,mosquito_habitat_mapper,2020-05-29,2020-06-03T07:30:03,2020-06-03T16:15:20,2020-11-23T21:32:54,19841715.0,Madagascar Citizen Science,201123.0,38KQE645064,,...,,False,https://data.globe.gov/system/photos/2020/05/2...,GLOBE Observer App,,-18.9168,identify,container: artificial,22806.0,47.5121
22454,mosquito_habitat_mapper,2020-05-06,2020-07-14T08:40:05,2020-07-14T13:40:07,2020-11-23T21:32:54,18306968.0,Taiwan Partnership Citizen Science,208771.0,51RUH399684,,...,,True,,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/0...,25.0235,identify-basal-tuft,container: artificial,24654.0,121.4134
22455,mosquito_habitat_mapper,2020-05-31,2020-07-18T23:35:02,2020-07-18T23:35:02,2020-11-23T21:32:54,14054356.0,lycee Thilmakha,209660.0,28PDB055429,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,,14.8591,identify-aedes-tuft,container: artificial,24880.0,-15.8784


# Step 5 -- Event Photo Downloading Script

The following script downloads photos associated with possible mosquito mapper training events (identified in the previous script) in addition to other information to help a human identify later identify which groups were truly training events. This can also aid in future research exploring these training events.

The procedure is as follows:
- Obtains identified training events
- Generates a CSV of these events for future reference
- For each group, creates a subfolder and performs the following tasks:
    - Downloads all the photos collected in the suspected group
    - Creates a CSV containing all the photo file names, urls, and attributions
    - Creates a CSV containing all the entries for that group
    

In [14]:
# Identifies groups
suspect_groups = suspect_df.groupby(by = ['measuredDate','latitude','mosquitohabitatmapperWaterSource','siteName','longitude'])

In [15]:
# Creates a groups dataset
groups_list = []
groups_data = pd.DataFrame()
for group, df in suspect_groups:
    groups_list.append(str(group))
    new_df = df.copy(True)
    new_df["Group Name"] = str(group)
    groups_data = groups_data.append(new_df, ignore_index = True)
groups_df= pd.DataFrame(groups_list, columns=["Group"])
groups_data.to_csv("Group Data.csv")
groups_df

Unnamed: 0,Group
0,"('2017-08-30', 8.643907, 'pond', '47PNK987556'..."
1,"('2018-06-08', -5.193823, 'fountain or bird ba..."
2,"('2018-06-13', -5.193823, 'fountain or bird ba..."
3,"('2018-06-15', -4.911338, 'other', '17MNQ73457..."
4,"('2018-07-30', 7.812627, 'other', '47NPJ040637..."
...,...
116,"('2020-01-27', 15.037919, 'cement, metal or pl..."
117,"('2020-01-27', 18.01673, 'well or cistern', '4..."
118,"('2020-02-20', 15.040585, 'cement, metal or pl..."
119,"('2020-03-04', 8.643917, 'cement, metal or pla..."


In [16]:
# Creates a results directory if it doesn't already exist
if not os.path.isdir("Results"):
    os.mkdir("Results")
        
# Writes groups' data to a csv in that folder
groups_df.to_csv("Results/groups.csv")

In [17]:
# Downloads a given photo with important metadata
def download_picture(url, directory):
    def get_picture(file_name):
        downloaded_obj = requests.get(url, allow_redirects=True)
        parent_dir = os.path.join(directory, file_name)
        with open(parent_dir, "wb") as file:
            file.write(downloaded_obj.content)

    if "https://" in url:
        photo_id = re.search(r'(?<=\d\d\d\d\/\d\d\/\d\d\/).*(?=\/)', url).group(0)
        file_name = f"{photo_id}.jpg".replace(":","-")
        get_picture(file_name)
        temp_dict = {
                        "image_name" : [file_name],
                        "origin" : "GLOBE",
                        "link" : [url],
                        "attribution" : "GLOBE",
                        "license" : "Creative Commons - share adapt attribute"
                    }
        return pd.DataFrame.from_dict(temp_dict)      

In [18]:
# Photodownloading script for suspect groups
for data, group in suspect_groups:
    pics = pd.DataFrame()
    name = f"{data[0]}-{data[2]}-{data[3]}".replace("/", "-")
    dir_name = f"Results/{name}"
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)
    group_water_source = group["mosquitohabitatmapperWaterSourcePhotoUrls"].dropna().tolist()
    group_abdomen = group["mosquitohabitatmapperAbdomenCloseupPhotoUrls"].dropna().tolist()
    group_larvae = group["mosquitohabitatmapperLarvaFullBodyPhotoUrls"].dropna().tolist()
    group_urls = group_water_source + group_abdomen + group_larvae
    for urls in group_urls:
        if urls:
            urls = urls.split(";")
            for url in urls:
                pics = pics.append(download_picture(url, dir_name), ignore_index = True)
                
    pics.to_csv(f"{dir_name}/pictures.csv")
    group.to_csv(f"{dir_name}/data.csv")

KeyboardInterrupt: 

# Step 6 -- Clean Photo Download

This script downloads all of the photos from the clean dataset into a "Clean Data" folder.

In [19]:
if not os.path.isdir("Clean Data"):
    os.mkdir("Clean Data")
        
# photodownloading script for clean data download
pics = pd.DataFrame()
water_source = df["mosquitohabitatmapperWaterSourcePhotoUrls"].dropna().tolist()
abdomen = df["mosquitohabitatmapperAbdomenCloseupPhotoUrls"].dropna().tolist()
larvae = df["mosquitohabitatmapperLarvaFullBodyPhotoUrls"].dropna().tolist()
url_list = water_source + abdomen + larvae
for urls in url_list:
    if urls:
        urls = urls.split(";")
        for url in urls:
            pics = pics.append(download_picture(url, "Clean Data"), ignore_index = True)
                
pics.to_csv(f"Clean Data/pictures.csv")

KeyboardInterrupt: 