# Preliminary Data Exploration
This notebook takes a look at the [data provided](https://github.com/HACC2020/data) by the UH Occupancy challenge for HACC 2020. 

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from datetime import datetime

In [2]:
df = pd.read_csv('data-main/uh_occupancy/2020-0824_600am-859am_devices_1598295707.csv')
df2 = pd.read_csv('data-main/uh_occupancy/2020-0824_900am-1159am_devices_1598306516.csv')

In [3]:
pprint(df)
pprint(df2)

                     Device  Max Clients  Unique Clients     Building
0       Wainani H 362B Temp            2               2      Wainani
1         Frear D 507C Temp            5               9        Frear
2         Frear D 207C Temp            4              10        Frear
3         Frear D 707C Temp            4               5        Frear
4           Noelani E Store            1              10      Noelani
...                     ...          ...             ...          ...
3209    C-MORE 104 Outreach            0               4       C-MORE
3210            C-MORE 202A            0               0       C-MORE
3211             C-MORE 218            0               1       C-MORE
3212  Athletics 345 Hallway            4              12    Athletics
3213         CBA G103 Mauka            0               0  Shidler CBA

[3214 rows x 4 columns]
                     Device  Max Clients  Unique Clients     Building
0       Wainani H 362B Temp            2               2      Wai

In [4]:
# Counting how many times the amount of connections surpassed room capacity (social distancing)
(df['Unique Clients'] > df['Max Clients']).value_counts()
(df2['Unique Clients'] > df2['Max Clients']).value_counts()

True     2497
False     717
dtype: int64

In [19]:
import pandas as pd
import glob

path='data-main/uh_occupancy'

all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    
    filename = filename.split('/')
    filename = filename[-1:][0].split('_')
    filename.remove('devices')
    df['date'], df['period'], df['epoch'] = filename[:]
    
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

In [20]:
pprint(frame)

                       Device  Max Clients  Unique Clients     Building  \
0         Wainani H 362B Temp            2               2      Wainani   
1           Frear D 507C Temp            7               8        Frear   
2           Frear D 207C Temp            8              10        Frear   
3           Frear D 707C Temp            6               8        Frear   
4             Noelani E Store            4              39      Noelani   
...                       ...          ...             ...          ...   
399061    C-MORE 104 Outreach            1               3       C-MORE   
399062            C-MORE 202A            0               1       C-MORE   
399063             C-MORE 218            0               1       C-MORE   
399064  Athletics 345 Hallway            1               7    Athletics   
399065         CBA G103 Mauka            1               1  Shidler CBA   

             date        period           epoch Buildings Building 37  
0       2020-0824  1200pm-2

In [21]:
import fiona 
import geopandas as gpd
import shapely

# Enable fiona driver
gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw'

# Read file
df = gpd.read_file('data-main/wireless20200511125738.kml', driver='KML')

# Drop Z dimension of polygons that occurs often in kml 
df.geometry = df.geometry.map(lambda polygon: shapely.ops.transform(lambda x, y, z: (x, y), polygon))

In [22]:
print(df)

                              Name  \
0                           Biomed   
1                      Bilger Hall   
2                    Campus Center   
3                 Hamilton Library   
4                      George Hall   
..                             ...   
122                 St John Garden   
123   Law School Clinical Building   
124       Lyon Arboretum Cottage G   
125        Manoa Innovation Center   
126  Lower Campus Maintenance Shed   

                                           Description  \
0    <ul> <li>A-Court: A 110, A110T</li> <li>B-Cour...   
1    <ul> <li>1st Floor</li> <li>2nd Floor</li> <li...   
2    <ul> <li>105 (Stir Fresh), 117 (Game Room), 12...   
3    <ul> <li>Main Building </li> <li>Basement</li>...   
4    <ul> <li>1st Floor</li> <li>2nd Floor</li> <li...   
..                                                 ...   
122                                         <ul> </ul>   
123                                         <ul> </ul>   
124                  

In [23]:
# For simplicity, we ignore the curvature of the Earth: https://gis.stackexchange.com/questions/372564/userwarning-when-trying-to-get-centroid-from-a-polygon-geopandas
df['center_point'] = df['geometry'].centroid


  df['center_point'] = df['geometry'].centroid


In [24]:
print(df['Name'])

0                             Biomed
1                        Bilger Hall
2                      Campus Center
3                   Hamilton Library
4                        George Hall
                   ...              
122                   St John Garden
123     Law School Clinical Building
124         Lyon Arboretum Cottage G
125          Manoa Innovation Center
126    Lower Campus Maintenance Shed
Name: Name, Length: 127, dtype: object


In [25]:
# some of the building names are different in the KML file. This shows how many we still need coordinates for:
a = frame['Building'].unique()
print(len(a))
l3 = [x for x in a if x not in list(df['Name'])]
print(l3)
print(len(l3))

108
['Frear', 'Mokihana', 'Architecture-Hawaii Hall Outdoor Lawn', 'Gateway', 'Burns', 'Entrance Kiosks', 'ITC', 'Life Science Building', 'Campus Security', 'Murakami', 'Hale Kahawai', 'Anuenue', 'Lower Campus', 'Parking Structure', 'Ilima', 'Admin Services/Bookstore Warehouse', 'Johnson', 'UH Lab School Portables (N)', 'Thrift Shop', 'Sinclair Annexes', 'Maile Way Annex', 'Ching', 'Nagatani', 'Landscaping', nan]
25


In [26]:
df['Building'] = df['Name']

In [53]:
# create a column for the building's coordinates by converting the Shapely geometry object into a list
df['lon'] = [center.x for center in df['center_point'] if center != None]
df['lat'] = [center.y for center in df['center_point'] if center != None]

# https://stackoverflow.com/questions/53645882/pandas-merging-101
frame = frame.merge(df, on='Building', how='outer')

In [54]:
pprint(frame)
frame.drop(['Name','Buildings','Building 37', 'Description', 'geometry', 'center_point'], axis=1, inplace=True)

                     Device  Max Clients  Unique Clients  \
0       Wainani H 362B Temp          2.0             2.0   
1             Wainani F 502         12.0            15.0   
2             Wainani F 605          6.0            14.0   
3             Wainani F 603          6.0            10.0   
4             Wainani F 604          5.0             7.0   
...                     ...          ...             ...   
399105                  NaN          NaN             NaN   
399106                  NaN          NaN             NaN   
399107                  NaN          NaN             NaN   
399108                  NaN          NaN             NaN   
399109                  NaN          NaN             NaN   

                             Building       date        period  \
0                             Wainani  2020-0824  1200pm-259pm   
1                             Wainani  2020-0824  1200pm-259pm   
2                             Wainani  2020-0824  1200pm-259pm   
3              

KeyError: "['Buildings' 'Building 37'] not found in axis"

In [55]:
pprint(frame.dropna())

                          Device  Max Clients  Unique Clients      Building  \
0            Wainani H 362B Temp          2.0             2.0       Wainani   
1                  Wainani F 502         12.0            15.0       Wainani   
2                  Wainani F 605          6.0            14.0       Wainani   
3                  Wainani F 603          6.0            10.0       Wainani   
4                  Wainani F 604          5.0             7.0       Wainani   
...                          ...          ...             ...           ...   
376434    Dance Studio Classroom          3.0             6.0  Dance Studio   
376435  Dance Studio Dance Floor          2.0            12.0  Dance Studio   
376436            Dance Studio 4          1.0             3.0  Dance Studio   
376437    Dance Studio Classroom          3.0             6.0  Dance Studio   
376438  Dance Studio Dance Floor          3.0            19.0  Dance Studio   

             date        period           epoch  \


In [None]:
# json_format = {
#     "features": [
#         {
#             "type": "Feature",
#             "properties":
#                 {
#                     "device_id": "Wainani F 502",
#                     "Building": "Wainani",
#                     "Max Clients": 2.0,
#                     "Unique Clients": 2.0,
#                     "date": "2020-0824",
#                     "startTime": "1200pm",
#                     "endTime": "259pm"
#                 },
#             "geometry":
#                 {
#                     "type": "Point",
#                     "coordinates": [-157.81458181493875, 21.292468917945786] 
#                 }
#         }
#     ]
# }

In [46]:
import json
test_json = frame[['Building', 'Unique Clients', 'date']].to_json(orient='records')
geo_json = frame[["coordinates"]].to_json(orient='records')
test = [test_json, geo_json]
parsed = json.loads(test_json)

with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(parsed, f, ensure_ascii=False, indent=4)

In [40]:
json.dump(parsed, indent=4)

TypeError: dump() missing 1 required positional argument: 'fp'

In [56]:
# credit! https://geoffboeing.com/2015/10/exporting-python-data-geojson/
# feed this function the dataframe, and a list of properties
def df_to_geojson(df, properties, lat='lat', lon='lon'):
    geojson = {'type':'FeatureCollection', 'features':[]}
    for _, row in df.iterrows():
        feature = {'type':'Feature',
                   'properties':{},
                   'geometry':{'type':'Point',
                               'coordinates':[]}}
        feature['geometry']['coordinates'] = [row[lon],row[lat]]
        for prop in properties:
            feature['properties'][prop] = row[prop]
        geojson['features'].append(feature)
    return geojson

In [59]:
geojson = df_to_geojson(frame, ['Building', 'Unique Clients', 'date'])

output_filename = 'data.json'
with open(output_filename, 'w') as output_file:
    output_file.write('var dataset = ')
    json.dump(geojson, output_file, indent=2)

In [None]:
print(frame[])