# Analysis of the data

In [1]:
import re

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the Hope Mine temperature/humidity logger readings into a DataFrame.
# NOTE(review): 'temparature' is the spelling used in the path below; keep it
# unless the file on disk is renamed as well.
df = pd.read_csv('../data/hope_mine_temparature_and_humidity.csv')

## Question 1: What are the columns?

In [3]:
# Preview the first few rows to see which columns the file provides.
df.head()

Unnamed: 0,time,day,month,year,temp_open,temp_top,temp_5m,temp_10m,temp_20m,humidity_open,humidity_top,humidity_5m,humidity_10m,humidity_20m
0,14:35:01,15,7,2016,28.655,29.116,29.132,26.084,,28.522,27.828,45.301,42.496,
1,15:05:01,15,7,2016,24.656,24.117,24.633,24.584,,27.145,25.121,27.816,26.514,
2,15:35:01,15,7,2016,24.656,24.117,24.633,24.584,,27.145,25.8,28.481,27.198,
3,16:05:01,15,7,2016,24.656,24.117,25.133,25.084,,27.145,27.828,27.148,27.88,
4,16:35:01,15,7,2016,25.656,25.117,26.133,25.584,,27.834,28.501,29.146,27.88,


In [4]:
# List the column labels of the loaded frame.
df.columns

Index(['time', 'day', 'month', 'year', 'temp_open', 'temp_top', 'temp_5m',
       'temp_10m', 'temp_20m', 'humidity_open', 'humidity_top', 'humidity_5m',
       'humidity_10m', 'humidity_20m'],
      dtype='object')

## Data: Question 1

We have columns for the time and dates.  Then we have columns for the temperature and columns for the humidity.

The data was collected at the unused Hope Mine in the Central Namib Desert, in Namibia.  It is basically a hole in the ground.  So we can see that the measurements for the temperature and humidity were taken outside the mine, at the opening, and at 5, 10 and 20 meters into the mine.

- time
- day
- month
- year
- temp_open
- temp_top
- temp_5m
- temp_10m
- temp_20m
- humidity_open
- humidity_top
- humidity_5m
- humidity_10m
- humidity_20m


## Question 2: Over what period of time?

In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(19294, 14)

In [6]:
# Date fields of the first row (row label 0) - the start of the recording period.
df.loc[0, ['day', 'month', 'year']]

day        15
month       7
year     2016
Name: 0, dtype: object

In [7]:
# Date fields of the last row - the end of the recording period.
# Assumes the default 0..n-1 row labels, so the final label is len(df) - 1.
df.loc[len(df) - 1, ['day', 'month', 'year']]

day        21
month       8
year     2017
Name: 19293, dtype: object

## Data: Question 2

We have columns for the time and dates. Then we have columns for the temperature and columns for the humidity.

The data was collected at the unused Hope Mine in the Central Namib Desert, in Namibia. It is basically a hole in the ground. So we can see that the measurements for the temperature and humidity were taken outside the mine, at the opening, and at 5, 10 and 20 meters into the mine.

- time
- day, month, year - Range from 15 July 2016 to 21 August 2017
- temp_open
- temp_top
- temp_5m
- temp_10m
- temp_20m
- humidity_open
- humidity_top
- humidity_5m
- humidity_10m
- humidity_20m


## Question 3: What are the time blocks?  How often was the data sampled?

In [8]:
# Pull out the 'time' column for closer inspection.
# NOTE(review): the name `time` would shadow the stdlib `time` module if that
# were ever imported in this notebook.
time = df['time']

In [9]:
# Number of distinct timestamps in the column; dropna=False keeps a missing
# value (if any) in the count as one distinct entry.
time.nunique(dropna=False)

726

In [10]:
# Keep the distinct time-of-day values for the normalisation steps below.
uniquetime = time.unique()

In [None]:
# Sorted view of the distinct time values.
# NOTE(review): presumably these are strings, in which case the order is
# lexicographic and hours without a leading zero will not sort chronologically.
np.sort(uniquetime)

In [34]:
def split_and_fix(val):
    """Reduce a colon-separated time string to zero-padded ``'HH:MM'``.

    Only the first two colon-separated fields (hour and minute) are kept;
    anything after them (e.g. seconds) is dropped.  Each kept field is
    left-padded with zeros to two characters, so ``'1:29:15'`` and
    ``'01:29:15'`` normalise to the same string and the results sort
    correctly as text.

    Parameters
    ----------
    val : str
        Time string such as ``'14:35:01'`` or ``'1:29:15'``.

    Returns
    -------
    str
        Hour and minute joined by ``':'``, e.g. ``'01:29'``.
    """
    # Pad every kept field, not only the hour, so that '1:5:00' becomes
    # '01:05' rather than '01:5'.
    return ':'.join(field.zfill(2) for field in val.split(':')[:2])
    
# Quick check on an input whose hour has no leading zero.
split_and_fix("1:29:15")
    

'01:29'

In [35]:
# Normalise every distinct timestamp with split_and_fix and collect the
# results in a NumPy array.
hour_minutes = np.array(list(map(split_and_fix, uniquetime)))

In [38]:
# Show the normalised hour:minute strings (still in first-seen order, with repeats).
hour_minutes

array(['14:35', '15:05', '15:35', '16:05', '16:35', '17:05', '17:35',
       '18:05', '18:35', '19:05', '19:35', '20:05', '20:35', '21:05',
       '21:35', '22:05', '22:35', '23:05', '23:35', '00:05', '00:35',
       '01:05', '01:35', '02:05', '02:35', '03:05', '03:35', '04:05',
       '04:35', '05:05', '05:35', '06:05', '06:35', '07:05', '07:35',
       '08:05', '08:35', '09:05', '09:35', '10:05', '10:35', '11:05',
       '11:35', '12:05', '12:35', '13:05', '13:35', '14:05', '12:59',
       '13:29', '13:59', '14:29', '14:59', '15:29', '15:59', '16:29',
       '16:59', '17:29', '17:59', '18:29', '18:59', '19:29', '19:59',
       '20:29', '20:59', '21:29', '21:59', '22:29', '22:59', '23:29',
       '23:59', '00:29', '00:59', '01:29', '01:59', '02:29', '02:59',
       '03:29', '03:59', '04:29', '04:59', '05:29', '05:59', '06:29',
       '06:59', '07:29', '07:59', '08:29', '08:59', '09:29', '09:59',
       '10:29', '10:59', '11:29', '11:59', '12:29', '00:29', '00:59',
       '01:29', '01:

In [39]:
# Deduplicate the hour:minute strings; np.unique also returns them sorted.
uhour_minutes = np.unique(hour_minutes)
uhour_minutes

array(['00:03', '00:05', '00:07', '00:08', '00:09', '00:16', '00:17',
       '00:19', '00:24', '00:26', '00:29', '00:33', '00:35', '00:37',
       '00:38', '00:39', '00:46', '00:47', '00:49', '00:54', '00:56',
       '00:59', '01:03', '01:05', '01:07', '01:08', '01:09', '01:16',
       '01:17', '01:19', '01:24', '01:26', '01:29', '01:33', '01:35',
       '01:37', '01:38', '01:39', '01:46', '01:47', '01:49', '01:54',
       '01:56', '01:59', '02:03', '02:05', '02:07', '02:08', '02:09',
       '02:16', '02:17', '02:19', '02:24', '02:26', '02:29', '02:33',
       '02:35', '02:37', '02:38', '02:39', '02:46', '02:47', '02:49',
       '02:54', '02:56', '02:59', '03:03', '03:05', '03:07', '03:08',
       '03:09', '03:16', '03:17', '03:19', '03:24', '03:26', '03:29',
       '03:33', '03:35', '03:37', '03:38', '03:39', '03:46', '03:47',
       '03:49', '03:54', '03:56', '03:59', '04:03', '04:05', '04:07',
       '04:08', '04:09', '04:16', '04:17', '04:19', '04:24', '04:26',
       '04:29', '04:

In [40]:
# How many distinct hour:minute slots occur (element count of the 1-D array).
uhour_minutes.size

528

In [46]:
# First ten rows of the timestamp/date columns, to eyeball the sampling interval.
df[['time', 'day', 'month', 'year']].head(10)

Unnamed: 0,time,day,month,year
0,14:35:01,15,7,2016
1,15:05:01,15,7,2016
2,15:35:01,15,7,2016
3,16:05:01,15,7,2016
4,16:35:01,15,7,2016
5,17:05:01,15,7,2016
6,17:35:01,15,7,2016
7,18:05:01,15,7,2016
8,18:35:01,15,7,2016
9,19:05:01,15,7,2016


Time appears to have been sampled twice an hour, but the minute at which the samples fall changes (perhaps this reflects something like battery changes for the loggers?).  If I bin them as follows, 'hh:00' for any minutes less than 30, and 