# Preliminary Data Exploration
This notebook takes a look at the [data provided](https://github.com/HACC2020/data) by the UH Occupancy challenge for HACC 2020. 

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from datetime import datetime

In [2]:
df = pd.read_csv('data-main/uh_occupancy/2020-0824_600am-859am_devices_1598295707.csv')
df2 = pd.read_csv('data-main/uh_occupancy/2020-0824_900am-1159am_devices_1598306516.csv')

In [3]:
pprint(df)
pprint(df2)

                     Device  Max Clients  Unique Clients     Building
0       Wainani H 362B Temp            2               2      Wainani
1         Frear D 507C Temp            5               9        Frear
2         Frear D 207C Temp            4              10        Frear
3         Frear D 707C Temp            4               5        Frear
4           Noelani E Store            1              10      Noelani
...                     ...          ...             ...          ...
3209    C-MORE 104 Outreach            0               4       C-MORE
3210            C-MORE 202A            0               0       C-MORE
3211             C-MORE 218            0               1       C-MORE
3212  Athletics 345 Hallway            4              12    Athletics
3213         CBA G103 Mauka            0               0  Shidler CBA

[3214 rows x 4 columns]
                     Device  Max Clients  Unique Clients     Building
0       Wainani H 362B Temp            2               2      Wai

In [4]:
(df['Unique Clients'] > df['Max Clients']).value_counts()
(df2['Unique Clients'] > df2['Max Clients']).value_counts()

True     2497
False     717
dtype: int64

In [5]:
import pandas as pd
import glob

path='data-main/uh_occupancy'

all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    
    filename = filename.split('/')
    filename = filename[-1:][0].split('_')
    filename.remove('devices')
    df['date'], df['period'], df['epoch'] = filename[:]
    
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

In [6]:
pprint(frame)

                       Device  Max Clients  Unique Clients     Building  \
0         Wainani H 362B Temp            2               2      Wainani   
1           Frear D 507C Temp            7               8        Frear   
2           Frear D 207C Temp            8              10        Frear   
3           Frear D 707C Temp            6               8        Frear   
4             Noelani E Store            4              39      Noelani   
...                       ...          ...             ...          ...   
399061    C-MORE 104 Outreach            1               3       C-MORE   
399062            C-MORE 202A            0               1       C-MORE   
399063             C-MORE 218            0               1       C-MORE   
399064  Athletics 345 Hallway            1               7    Athletics   
399065         CBA G103 Mauka            1               1  Shidler CBA   

             date        period           epoch Buildings Building 37  
0       2020-0824  1200pm-2

In [7]:
import fiona 
import geopandas as gpd
import shapely

# Enable fiona driver
gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw'

# Read file
df = gpd.read_file('data-main/wireless20200511125738.kml', driver='KML')

# Drop Z dimension of polygons that occurs often in kml 
df.geometry = df.geometry.map(lambda polygon: shapely.ops.transform(lambda x, y, z: (x, y), polygon))

In [8]:
print(df)

                              Name  \
0                           Biomed   
1                      Bilger Hall   
2                    Campus Center   
3                 Hamilton Library   
4                      George Hall   
..                             ...   
122                 St John Garden   
123   Law School Clinical Building   
124       Lyon Arboretum Cottage G   
125        Manoa Innovation Center   
126  Lower Campus Maintenance Shed   

                                           Description  \
0    <ul> <li>A-Court: A 110, A110T</li> <li>B-Cour...   
1    <ul> <li>1st Floor</li> <li>2nd Floor</li> <li...   
2    <ul> <li>105 (Stir Fresh), 117 (Game Room), 12...   
3    <ul> <li>Main Building </li> <li>Basement</li>...   
4    <ul> <li>1st Floor</li> <li>2nd Floor</li> <li...   
..                                                 ...   
122                                         <ul> </ul>   
123                                         <ul> </ul>   
124                  

In [9]:
# For simplicity, we ignore the curvature of the Earth: https://gis.stackexchange.com/questions/372564/userwarning-when-trying-to-get-centroid-from-a-polygon-geopandas
df['center_point'] = df['geometry'].centroid

                              Name  \
0                           Biomed   
1                      Bilger Hall   
2                    Campus Center   
3                 Hamilton Library   
4                      George Hall   
..                             ...   
122                 St John Garden   
123   Law School Clinical Building   
124       Lyon Arboretum Cottage G   
125        Manoa Innovation Center   
126  Lower Campus Maintenance Shed   

                                           Description  \
0    <ul> <li>A-Court: A 110, A110T</li> <li>B-Cour...   
1    <ul> <li>1st Floor</li> <li>2nd Floor</li> <li...   
2    <ul> <li>105 (Stir Fresh), 117 (Game Room), 12...   
3    <ul> <li>Main Building </li> <li>Basement</li>...   
4    <ul> <li>1st Floor</li> <li>2nd Floor</li> <li...   
..                                                 ...   
122                                         <ul> </ul>   
123                                         <ul> </ul>   
124                  


  df['center_point'] = df['geometry'].centroid


In [None]:
print(df['Name'])

In [None]:
# some of the building names are different in the KML file. This shows how many we still need coordinates for:
a = frame['Building'].unique()
print(len(a))
l3 = [x for x in a if x not in list(df['Name'])]
print(l3)
print(len(l3))

In [10]:
df['Building'] = df['Name']

In [16]:
# create a column for the building's coordinates by converting the Shapely geometry object into a list
df['coordinates'] = [[center.x, center.y] for center in df['center_point'] if center != None]

# https://stackoverflow.com/questions/53645882/pandas-merging-101
frame = frame.merge(df, on='Building', how='outer')

In [17]:
pprint(frame)

                     Device  Max Clients  Unique Clients  \
0       Wainani H 362B Temp          2.0             2.0   
1             Wainani F 502         12.0            15.0   
2             Wainani F 605          6.0            14.0   
3             Wainani F 603          6.0            10.0   
4             Wainani F 604          5.0             7.0   
...                     ...          ...             ...   
399105                  NaN          NaN             NaN   
399106                  NaN          NaN             NaN   
399107                  NaN          NaN             NaN   
399108                  NaN          NaN             NaN   
399109                  NaN          NaN             NaN   

                             Building       date        period  \
0                             Wainani  2020-0824  1200pm-259pm   
1                             Wainani  2020-0824  1200pm-259pm   
2                             Wainani  2020-0824  1200pm-259pm   
3              