# API Downloading Data Procedure:

Downloading data follows these steps:
- Request the non GEOJSON data from the GLOBE API
- Get the results from the JSON and pass it into a pandas dataframe
- Unpack the "data" entry
- Merge the data onto the original dataframe
- Remove the "data" entry from the original dataframe

In [1]:
import pandas as pd
import requests

start_date = "2017-05-29"
end_date = "2020-05-31"
url = f"https://api.globe.gov/search/v1/measurement/protocol/measureddate/?protocols=mosquito_habitat_mapper&startdate={start_date}&enddate={end_date}&geojson=FALSE&sample=FALSE"

# downloads data from the GLOBE API
response = requests.get(url)

# Converts data into a useable dataframe
data = response.json()["results"]
        
df = pd.DataFrame(data)

# unpacking and joining the data entry
data_df = pd.DataFrame(df["data"].to_dict())
data_df = data_df.transpose()
df = df.join(data_df)
df.drop(["data"], axis=1, inplace = True)

# display the dataframe
df

Unnamed: 0,protocol,measuredDate,createDate,updateDate,publishDate,organizationId,organizationName,siteId,siteName,countryName,...,mosquitohabitatmapperComments,mosquitohabitatmapperMosquitoPupae,mosquitohabitatmapperWaterSourcePhotoUrls,mosquitohabitatmapperDataSource,mosquitohabitatmapperLarvaFullBodyPhotoUrls,mosquitohabitatmapperMeasurementLatitude,mosquitohabitatmapperLastIdentifyStage,mosquitohabitatmapperWaterSourceType,mosquitohabitatmapperMosquitoHabitatMapperId,mosquitohabitatmapperMeasurementLongitude
0,mosquito_habitat_mapper,2018-11-25,2020-01-25T18:09:52,2020-01-25T18:09:52,2020-02-14T20:29:11,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2018/11/2...,GLOBE Observer App,,39.2538,,container: artificial,5188,-77.1959
1,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2535,identify,container: artificial,10365,-77.196
2,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2536,identify,container: artificial,10360,-77.1956
3,mosquito_habitat_mapper,2019-05-29,2020-01-25T18:29:36,2020-01-25T18:29:36,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/05/2...,GLOBE Observer App,https://data.globe.gov/system/photos/2019/05/2...,39.2542,identify-siphon-pecten,container: artificial,12424,-77.1962
4,mosquito_habitat_mapper,2019-08-04,2020-01-25T18:45:20,2020-01-25T18:45:20,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/08/0...,GLOBE Observer App,,39.2536,identify,container: artificial,14822,-77.1957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22449,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-09-05T21:23:09,14054356.0,lycee Thilmakha,200580,28PCB553673,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.0777,identify-siphon-shape,container: artificial,22776,-16.3463
22450,mosquito_habitat_mapper,2020-05-29,2020-06-03T07:30:03,2020-06-03T16:15:20,2020-09-05T21:23:09,19841715.0,Madagascar Citizen Science,201123,38KQE645064,,...,,False,https://data.globe.gov/system/photos/2020/05/2...,GLOBE Observer App,,-18.9168,identify,container: artificial,22806,47.5121
22451,mosquito_habitat_mapper,2020-05-06,2020-07-14T08:40:05,2020-07-14T13:40:07,2020-09-05T21:23:09,18306968.0,Taiwan Partnership Citizen Science,208771,51RUH399684,,...,,True,,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/0...,25.0235,identify-basal-tuft,container: artificial,24654,121.413
22452,mosquito_habitat_mapper,2020-05-31,2020-07-18T23:35:02,2020-07-18T23:35:02,2020-09-05T21:23:09,14054356.0,lycee Thilmakha,209660,28PDB055429,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,,14.8591,identify-aedes-tuft,container: artificial,24880,-15.8784


# CSV Downloading

This cell allows for the option of using mosquito mapper data from CSV Files.

This enables an analysis of different subsets of the total mosquito mapper data (this will overwrite data downloaded from the API).


In [None]:
filename = "Africa.csv"
df = pd.read_csv(filename)
df

# GeoLocational Data Cleaning Procedure:
Here are the conditions in which the data is removed:
- The Site Latitude and Longitude match the ones measured by the GPS
- The GPS Measurements are integers



In [2]:
for index, row in df.iterrows():
    gps_lat = row["mosquitohabitatmapperMeasurementLatitude"]
    gps_lon = row["mosquitohabitatmapperMeasurementLongitude"]
    if ((row["latitude"] == gps_lat and 
        row["longitude"] == gps_lon) or
        isinstance(gps_lat, int) or
        isinstance(gps_lon, int)
        ):
        df = df.drop(index)

df = df.reset_index().drop(["index"], axis = 1)
# displaying the dataframe
df

Unnamed: 0,protocol,measuredDate,createDate,updateDate,publishDate,organizationId,organizationName,siteId,siteName,countryName,...,mosquitohabitatmapperComments,mosquitohabitatmapperMosquitoPupae,mosquitohabitatmapperWaterSourcePhotoUrls,mosquitohabitatmapperDataSource,mosquitohabitatmapperLarvaFullBodyPhotoUrls,mosquitohabitatmapperMeasurementLatitude,mosquitohabitatmapperLastIdentifyStage,mosquitohabitatmapperWaterSourceType,mosquitohabitatmapperMosquitoHabitatMapperId,mosquitohabitatmapperMeasurementLongitude
0,mosquito_habitat_mapper,2018-11-25,2020-01-25T18:09:52,2020-01-25T18:09:52,2020-02-14T20:29:11,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2018/11/2...,GLOBE Observer App,,39.2538,,container: artificial,5188,-77.1959
1,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2535,identify,container: artificial,10365,-77.196
2,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2536,identify,container: artificial,10360,-77.1956
3,mosquito_habitat_mapper,2019-05-29,2020-01-25T18:29:36,2020-01-25T18:29:36,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/05/2...,GLOBE Observer App,https://data.globe.gov/system/photos/2019/05/2...,39.2542,identify-siphon-pecten,container: artificial,12424,-77.1962
4,mosquito_habitat_mapper,2019-08-04,2020-01-25T18:45:20,2020-01-25T18:45:20,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/08/0...,GLOBE Observer App,,39.2536,identify,container: artificial,14822,-77.1957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19274,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-09-05T21:23:09,14054356.0,lycee Thilmakha,200580,28PCB553673,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.0777,identify-siphon-shape,container: artificial,22776,-16.3463
19275,mosquito_habitat_mapper,2020-05-29,2020-06-03T07:30:03,2020-06-03T16:15:20,2020-09-05T21:23:09,19841715.0,Madagascar Citizen Science,201123,38KQE645064,,...,,False,https://data.globe.gov/system/photos/2020/05/2...,GLOBE Observer App,,-18.9168,identify,container: artificial,22806,47.5121
19276,mosquito_habitat_mapper,2020-05-06,2020-07-14T08:40:05,2020-07-14T13:40:07,2020-09-05T21:23:09,18306968.0,Taiwan Partnership Citizen Science,208771,51RUH399684,,...,,True,,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/0...,25.0235,identify-basal-tuft,container: artificial,24654,121.413
19277,mosquito_habitat_mapper,2020-05-31,2020-07-18T23:35:02,2020-07-18T23:35:02,2020-09-05T21:23:09,14054356.0,lycee Thilmakha,209660,28PDB055429,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,,14.8591,identify-aedes-tuft,container: artificial,24880,-15.8784


In [3]:
# writes the filtered data to a csv
df.to_csv("Geolocation.csv")

# Suspected Training Event Removal

The procedure that is used is as follows:
- Mosquito Entries that share the same measured date, site latitude, site longitude, siteName, and water source are grouped together
- These groups are then filtered resulting in identifying groups of a size greater than a specified threshold.
- Then these identified groups are removed from the overall dataframe
- The dataframe containing the groups and the clean dataframe are then written into csv files. 


## Threshold Variable

The threshold variable is the minimum number of duplicate entries (exclusive) that are needed for the the group of entries to be classified as an event.

Set the threshold value and then run the following cells

In [4]:
threshold = 10

In [5]:
# groups / filters suspected events
suspect_df = df.groupby(by=['measuredDate','latitude','mosquitohabitatmapperWaterSource','siteName','longitude']).filter(lambda x: len(x) > threshold)

# writes the entries to a CSV
suspect_df.to_csv("Possible Events.csv")       

# displays the suspected entries
suspect_df

Unnamed: 0,protocol,measuredDate,createDate,updateDate,publishDate,organizationId,organizationName,siteId,siteName,countryName,...,mosquitohabitatmapperComments,mosquitohabitatmapperMosquitoPupae,mosquitohabitatmapperWaterSourcePhotoUrls,mosquitohabitatmapperDataSource,mosquitohabitatmapperLarvaFullBodyPhotoUrls,mosquitohabitatmapperMeasurementLatitude,mosquitohabitatmapperLastIdentifyStage,mosquitohabitatmapperWaterSourceType,mosquitohabitatmapperMosquitoHabitatMapperId,mosquitohabitatmapperMeasurementLongitude
469,mosquito_habitat_mapper,2018-08-17,2020-01-25T18:02:43,2020-01-25T18:02:43,2020-02-14T20:29:11,17615655.0,Thailand Citizen Science,97680,47PNK988555,,...,,,rejected,GLOBE Observer App,,8.6437,,still: lake/pond/swamp,3856,99.8984
473,mosquito_habitat_mapper,2018-08-17,2020-01-25T18:02:43,2020-01-25T18:02:43,2020-02-14T20:29:11,17615655.0,Thailand Citizen Science,97680,47PNK988555,,...,,,https://data.globe.gov/system/photos/2018/08/1...,GLOBE Observer App,,8.6436,,still: lake/pond/swamp,3858,99.8985
475,mosquito_habitat_mapper,2018-08-17,2020-01-25T18:02:43,2020-01-25T18:02:43,2020-02-14T20:29:11,17615655.0,Thailand Citizen Science,97680,47PNK988555,,...,,,https://data.globe.gov/system/photos/2018/08/1...,GLOBE Observer App,,8.6436,identify,still: lake/pond/swamp,3877,99.8983
476,mosquito_habitat_mapper,2018-08-17,2020-01-25T18:02:43,2020-01-25T18:02:43,2020-02-14T20:29:11,17615655.0,Thailand Citizen Science,97680,47PNK988555,,...,,,rejected,GLOBE Observer App,,8.6436,,still: lake/pond/swamp,3855,99.8985
481,mosquito_habitat_mapper,2018-08-17,2020-01-25T18:02:43,2020-01-25T18:02:43,2020-02-14T20:29:11,17615655.0,Thailand Citizen Science,97680,47PNK988555,,...,,,https://data.globe.gov/system/photos/2018/08/1...,GLOBE Observer App,,8.6437,,still: lake/pond/swamp,4044,99.8981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19154,mosquito_habitat_mapper,2020-01-20,2020-04-20T07:30:02,2020-04-20T07:31:25,2020-09-05T21:23:09,17615655.0,Thailand Citizen Science,196380,47QMB636059,,...,,False,,GLOBE Observer App,,19.0461,identify,container: artificial,22321,98.6549
19156,mosquito_habitat_mapper,2020-01-20,2020-04-20T07:30:02,2020-04-20T07:31:25,2020-09-05T21:23:09,17615655.0,Thailand Citizen Science,196380,47QMB636059,,...,,False,,GLOBE Observer App,,19.0461,identify,container: artificial,22335,98.6549
19157,mosquito_habitat_mapper,2020-01-20,2020-04-20T07:30:02,2020-04-20T07:31:25,2020-09-05T21:23:09,17615655.0,Thailand Citizen Science,196380,47QMB636059,,...,,False,,GLOBE Observer App,,19.0461,identify,container: artificial,22336,98.655
19158,mosquito_habitat_mapper,2020-01-20,2020-04-20T07:30:02,2020-04-20T07:31:25,2020-09-05T21:23:09,17615655.0,Thailand Citizen Science,196380,47QMB636059,,...,,False,,GLOBE Observer App,,19.0461,identify,container: artificial,22338,98.655


In [6]:

suspect_mask = df.isin(suspect_df)

clean_df = df[~suspect_mask].dropna(how = "all")

clean_df.to_csv("Clean CSV.csv")

# display's cleaned data
clean_df

Unnamed: 0,protocol,measuredDate,createDate,updateDate,publishDate,organizationId,organizationName,siteId,siteName,countryName,...,mosquitohabitatmapperComments,mosquitohabitatmapperMosquitoPupae,mosquitohabitatmapperWaterSourcePhotoUrls,mosquitohabitatmapperDataSource,mosquitohabitatmapperLarvaFullBodyPhotoUrls,mosquitohabitatmapperMeasurementLatitude,mosquitohabitatmapperLastIdentifyStage,mosquitohabitatmapperWaterSourceType,mosquitohabitatmapperMosquitoHabitatMapperId,mosquitohabitatmapperMeasurementLongitude
0,mosquito_habitat_mapper,2018-11-25,2020-01-25T18:09:52,2020-01-25T18:09:52,2020-02-14T20:29:11,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2018/11/2...,GLOBE Observer App,,39.2538,,container: artificial,5188,-77.1959
1,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2535,identify,container: artificial,10365,-77.196
2,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2536,identify,container: artificial,10360,-77.1956
3,mosquito_habitat_mapper,2019-05-29,2020-01-25T18:29:36,2020-01-25T18:29:36,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/05/2...,GLOBE Observer App,https://data.globe.gov/system/photos/2019/05/2...,39.2542,identify-siphon-pecten,container: artificial,12424,-77.1962
4,mosquito_habitat_mapper,2019-08-04,2020-01-25T18:45:20,2020-01-25T18:45:20,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/08/0...,GLOBE Observer App,,39.2536,identify,container: artificial,14822,-77.1957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19274,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-09-05T21:23:09,14054356.0,lycee Thilmakha,200580.0,28PCB553673,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.0777,identify-siphon-shape,container: artificial,22776,-16.3463
19275,mosquito_habitat_mapper,2020-05-29,2020-06-03T07:30:03,2020-06-03T16:15:20,2020-09-05T21:23:09,19841715.0,Madagascar Citizen Science,201123.0,38KQE645064,,...,,False,https://data.globe.gov/system/photos/2020/05/2...,GLOBE Observer App,,-18.9168,identify,container: artificial,22806,47.5121
19276,mosquito_habitat_mapper,2020-05-06,2020-07-14T08:40:05,2020-07-14T13:40:07,2020-09-05T21:23:09,18306968.0,Taiwan Partnership Citizen Science,208771.0,51RUH399684,,...,,True,,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/0...,25.0235,identify-basal-tuft,container: artificial,24654,121.413
19277,mosquito_habitat_mapper,2020-05-31,2020-07-18T23:35:02,2020-07-18T23:35:02,2020-09-05T21:23:09,14054356.0,lycee Thilmakha,209660.0,28PDB055429,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,,14.8591,identify-aedes-tuft,container: artificial,24880,-15.8784


# Anomaly Removal

The following two cells are designed to remove anomalous (overreporting) mosquito larvae from the dataset (e.g., reporting 1,000 mosquito larvae), and resolve any sets of reported ranges (e.g., 1-25) by choosing the upper bound.

## Anomaly Threshold Variable
This is the threshold for the number of reported mosquito larvae that marks an entry as an anomaly. It will remove any entries that have mosquito larvae count observed higher than this number.

In [7]:
anomaly_threshold = 1000

In [8]:
import re
def to_int(x):
    try:
        return int(x)
    except ValueError:
        try:
            return int(re.sub(r".*-", "", x))
        except ValueError:
            return 0


clean_df["mosquitohabitatmapperLarvaeCount"] = clean_df["mosquitohabitatmapperLarvaeCount"].fillna(0).apply(to_int)
clean_df = clean_df[clean_df["mosquitohabitatmapperLarvaeCount"] < anomaly_threshold]
clean_df.to_csv("CleanDF Updated.csv")

clean_df

Unnamed: 0,protocol,measuredDate,createDate,updateDate,publishDate,organizationId,organizationName,siteId,siteName,countryName,...,mosquitohabitatmapperComments,mosquitohabitatmapperMosquitoPupae,mosquitohabitatmapperWaterSourcePhotoUrls,mosquitohabitatmapperDataSource,mosquitohabitatmapperLarvaFullBodyPhotoUrls,mosquitohabitatmapperMeasurementLatitude,mosquitohabitatmapperLastIdentifyStage,mosquitohabitatmapperWaterSourceType,mosquitohabitatmapperMosquitoHabitatMapperId,mosquitohabitatmapperMeasurementLongitude
0,mosquito_habitat_mapper,2018-11-25,2020-01-25T18:09:52,2020-01-25T18:09:52,2020-02-14T20:29:11,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2018/11/2...,GLOBE Observer App,,39.2538,,container: artificial,5188,-77.1959
1,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2535,identify,container: artificial,10365,-77.196
2,mosquito_habitat_mapper,2019-04-07,2020-01-25T18:24:27,2020-01-25T18:24:27,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/04/0...,GLOBE Observer App,,39.2536,identify,container: artificial,10360,-77.1956
3,mosquito_habitat_mapper,2019-05-29,2020-01-25T18:29:36,2020-01-25T18:29:36,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/05/2...,GLOBE Observer App,https://data.globe.gov/system/photos/2019/05/2...,39.2542,identify-siphon-pecten,container: artificial,12424,-77.1962
4,mosquito_habitat_mapper,2019-08-04,2020-01-25T18:45:20,2020-01-25T18:45:20,2020-03-20T22:19:48,13063641.0,GPM Satellite Mission,35785.0,18SUJ105472,United States,...,,False,https://data.globe.gov/system/photos/2019/08/0...,GLOBE Observer App,,39.2536,identify,container: artificial,14822,-77.1957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19274,mosquito_habitat_mapper,2020-05-30,2020-05-30T18:25:03,2020-05-30T19:30:02,2020-09-05T21:23:09,14054356.0,lycee Thilmakha,200580.0,28PCB553673,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/3...,15.0777,identify-siphon-shape,container: artificial,22776,-16.3463
19275,mosquito_habitat_mapper,2020-05-29,2020-06-03T07:30:03,2020-06-03T16:15:20,2020-09-05T21:23:09,19841715.0,Madagascar Citizen Science,201123.0,38KQE645064,,...,,False,https://data.globe.gov/system/photos/2020/05/2...,GLOBE Observer App,,-18.9168,identify,container: artificial,22806,47.5121
19276,mosquito_habitat_mapper,2020-05-06,2020-07-14T08:40:05,2020-07-14T13:40:07,2020-09-05T21:23:09,18306968.0,Taiwan Partnership Citizen Science,208771.0,51RUH399684,,...,,True,,GLOBE Observer App,https://data.globe.gov/system/photos/2020/05/0...,25.0235,identify-basal-tuft,container: artificial,24654,121.413
19277,mosquito_habitat_mapper,2020-05-31,2020-07-18T23:35:02,2020-07-18T23:35:02,2020-09-05T21:23:09,14054356.0,lycee Thilmakha,209660.0,28PDB055429,Senegal,...,,True,https://data.globe.gov/system/photos/2020/05/3...,GLOBE Observer App,,14.8591,identify-aedes-tuft,container: artificial,24880,-15.8784


# Event Photo Downloading Script

The following script downloads photos associated with possible mosquito mapper training events (identified in the previous script) in addition to other information to help a human identify later identify which groups were truly training events. This can also aid in future research exploring these training events.

The procedure is as follows:
- It gets the identified training events from the previous script
- It generates a CSV of these events for future reference
- For each group, it creates a subfolder and proceeds to do the following:
    - Download all the photos collected in the suspected group
    - Create a CSV containing all the photo file names, urls, and attributions
    - Create a CSV containing all the entries for that group
    

In [9]:
# identify groups
suspect_groups = suspect_df.groupby(by = ['measuredDate','latitude','mosquitohabitatmapperWaterSource','siteName','longitude'])

In [10]:
# create a groups dataset
groups_list = []
groups_data = pd.DataFrame()
for group, df in suspect_groups:
    groups_list.append(str(group))
    new_df = df.copy(True)
    new_df["Group Name"] = str(group)
    groups_data = groups_data.append(new_df, ignore_index = True)
groups_df= pd.DataFrame(groups_list, columns=["Group"])
groups_data.to_csv("Group Data.csv")
groups_df

Unnamed: 0,Group
0,"('2018-07-30', 7.812627, 'other', '47NPJ040637..."
1,"('2018-08-07', 7.812627, 'other', '47NPJ040637..."
2,"('2018-08-14', 7.812627, 'other', '47NPJ040637..."
3,"('2018-08-17', 8.643, 'pond', '47PNK988555', 9..."
4,"('2018-09-01', 0.304885, 'cement, metal or pla..."
...,...
112,"('2020-01-27', 15.037919, 'cement, metal or pl..."
113,"('2020-01-27', 18.01673, 'well or cistern', '4..."
114,"('2020-02-20', 15.040585, 'cement, metal or pl..."
115,"('2020-03-04', 8.643917, 'cement, metal or pla..."


In [11]:
import os
# makes a results directory if it doesn't already exist
if not os.path.isdir("Results"):
        os.mkdir("Results")
        
# write the groups data to a csv
groups_df.to_csv("Results/groups.csv")

In [12]:
import re
def download_picture(url, directory):
    def get_picture(file_name):
        downloaded_obj = requests.get(url, allow_redirects=True)
        parent_dir = os.path.join(directory, file_name)
        with open(parent_dir, "wb") as file:
            file.write(downloaded_obj.content)

    if "https://" in url:
        photo_id = re.search(r'(?<=\d\d\d\d\/\d\d\/\d\d\/).*(?=\/)', url).group(0)
        file_name = f"{photo_id}.jpg".replace(":","-")
        get_picture(file_name)
        temp_dict = {
                        "image_name" : [file_name],
                        "origin" : "GLOBE",
                        "link" : [url],
                        "attribution" : "GLOBE",
                        "license" : "Creative Commons - share adapt attribute"
                    }
        return pd.DataFrame.from_dict(temp_dict)
        

In [13]:

# photodownloading script for suspect groups
for data, group in suspect_groups:
    pics = pd.DataFrame()
    name = f"{data[0]}-{data[2]}-{data[3]}".replace("/", "-")
    dir_name = f"Results/{name}"
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)
    group_water_source = group["mosquitohabitatmapperWaterSourcePhotoUrls"].dropna().tolist()
    group_abdomen = group["mosquitohabitatmapperAbdomenCloseupPhotoUrls"].dropna().tolist()
    group_larvae = group["mosquitohabitatmapperLarvaFullBodyPhotoUrls"].dropna().tolist()
    group_urls = group_water_source + group_abdomen + group_larvae
    for urls in group_urls:
        if urls:
            urls = urls.split(";")
            for url in urls:
                pics = pics.append(download_picture(url, dir_name), ignore_index = True)
                
    pics.to_csv(f"{dir_name}/pictures.csv")
    group.to_csv(f"{dir_name}/data.csv")
  

In [14]:
if not os.path.isdir("Clean Data"):
    os.mkdir("Clean Data")
        
# photodownloading script for clean data download
pics = pd.DataFrame()
water_source = clean_df["mosquitohabitatmapperWaterSourcePhotoUrls"].dropna().tolist()
abdomen = clean_df["mosquitohabitatmapperAbdomenCloseupPhotoUrls"].dropna().tolist()
larvae = clean_df["mosquitohabitatmapperLarvaFullBodyPhotoUrls"].dropna().tolist()
url_list = water_source + abdomen + larvae
for urls in url_list:
    if urls:
        urls = urls.split(";")
        for url in urls:
            pics = pics.append(download_picture(url, "Clean Data"), ignore_index = True)
                
pics.to_csv(f"Clean Data/pictures.csv")