In [64]:
import json
import os

import pandas as pd

The raw data file needs to be parsed line by line; each line is a separate JSON object.

Raw data file structure:
Line 1: 
        JSON object containing metadata for the meeting. For example:
           "uuid":"879EO_1464916044500" # Unique identifier for meetings
           "group":"879EO" # Unique identifier for groups
           "members":["W6NMGEVHRE","PVJPFRZB3S"]
           "startTime":"2016-06-03T01:07:24.500Z"
           "moderator":"none"
           "location":"meetingroom"
           "type":"study"
           "description":""
           "showVisualization":true

Line 2 to EOF: 
        JSON object containing batched sample data and associated metadata. For example:
            "voltage":2.7378299236297607
            "timestamp":1464916052 # Reference UNIX timestamp (in seconds) for the start of the timeseries
            "timestamp_ms":223 # Milliseconds to add to the reference timestamp to get millisecond resolution
            "sampleDelay":50 # Sampling period in milliseconds
            "numSamples":114 # Number of samples in the "samples" array
            "samples":[5,5,6,...] # Time series of volume
        

First, read the data from `input_file_name` and parse each line as a JSON object.

In [65]:
input_file_name = "879EO_1464916044500.txt"

# Only the read needs the file handle; parsing happens after the file is closed.
with open(input_file_name, 'r') as input_file:
    raw_data = input_file.readlines()  # This is a list of strings, one per line

# Line 1 is the meeting header.
meeting_metadata = json.loads(raw_data[0])

# Every following line is one batch of samples. Whitespace-only lines (e.g. a
# trailing empty line at EOF) are skipped because json.loads would raise on them.
# A list comprehension is used instead of map() so the result is an indexable
# list on both Python 2 and Python 3.
batched_sample_data = [json.loads(line) for line in raw_data[1:] if line.strip()]

# Show the first batch as a sanity check (parenthesised form works on Python 2 and 3).
print(batched_sample_data[0])

{u'timestamp': 1464916046, u'numSamples': 114, u'timestamp_ms': 523, u'voltage': 2.7378299236297607, u'samples': [6, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 10, 5, 6, 5, 6, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 6, 5, 5, 6, 6, 5, 5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 5, 6, 5, 6, 6, 6, 5, 5, 5, 6, 6, 6, 6, 5, 6, 5, 5, 6, 5, 5, 5, 6, 5, 5], u'sampleDelay': 50}


We need to separate the individual samples from the batched samples. The following piece of code will create an array of individual samples while retaining the batch metadata.

In [66]:
# Flatten the batches: one dict per individual sample, each carrying the
# metadata of the batch it came from (everything except the popped keys below).
sample_data = []

for batched in batched_sample_data:
    # Shallow copy, so that popping keys leaves the parsed batch itself untouched.
    shared_fields = dict(batched)
    readings = shared_fields.pop('samples')
    # Reference timestamp of the batch, in milliseconds.
    start_ms = shared_fields.pop('timestamp') * 1000 + shared_fields.pop('timestamp_ms')
    period = shared_fields.pop('sampleDelay')
    count = shared_fields.pop('numSamples')

    for offset in range(count):
        record = dict(shared_fields)
        record['signal'] = readings[offset]
        # Sample `offset` is taken `offset` sampling periods after the reference timestamp.
        record['timestamp'] = start_ms + offset * period
        sample_data.append(record)
    

Create a pandas dataframe from the list of samples.

In [67]:
# Build one row per sample; the columns come from the keys of each sample dict.
df_sample_data = pd.DataFrame(sample_data)

# 'timestamp' holds milliseconds since the UNIX epoch, hence unit='ms'.
df_sample_data['datetime'] = pd.to_datetime(df_sample_data['timestamp'], unit='ms')
del df_sample_data['timestamp']

# sort_values returns a new DataFrame rather than sorting in place, so the
# result has to be assigned back. mergesort is stable: rows that share a
# timestamp keep their original relative order.
df_sample_data = df_sample_data.sort_values('datetime', kind='mergesort')

# Promote 'datetime' to the index; set_index drops the column and names the
# index after it.
df_sample_data = df_sample_data.set_index('datetime')

# Optional: Add the meeting metadata to the dataframe.
# This is a plain Python attribute set on the final object; operations that
# return a new DataFrame will not carry it over.
df_sample_data.metadata = meeting_metadata

In [68]:
# Parenthesised print works on both Python 2 and Python 3.
print(df_sample_data)

                         signal   voltage
datetime                                 
2016-06-03 01:07:26.523       6  2.737830
2016-06-03 01:07:26.573       6  2.737830
2016-06-03 01:07:26.623       5  2.737830
2016-06-03 01:07:26.673       6  2.737830
2016-06-03 01:07:26.723       5  2.737830
2016-06-03 01:07:26.773       5  2.737830
2016-06-03 01:07:26.823       5  2.737830
2016-06-03 01:07:26.873       5  2.737830
2016-06-03 01:07:26.923       5  2.737830
2016-06-03 01:07:26.973       5  2.737830
2016-06-03 01:07:27.023       5  2.737830
2016-06-03 01:07:27.073       5  2.737830
2016-06-03 01:07:27.123       5  2.737830
2016-06-03 01:07:27.173       5  2.737830
2016-06-03 01:07:27.223       6  2.737830
2016-06-03 01:07:27.273       5  2.737830
2016-06-03 01:07:27.323       5  2.737830
2016-06-03 01:07:27.373       5  2.737830
2016-06-03 01:07:27.423       5  2.737830
2016-06-03 01:07:27.473       5  2.737830
2016-06-03 01:07:27.523       5  2.737830
2016-06-03 01:07:27.573      10  2

Output the dataframe to a csv file if needed

In [69]:
# Swap only the final extension for ".csv". os.path.splitext leaves dots in
# directory names or earlier in the file name untouched, whereas
# split(".")[0] would cut the path at the first dot.
output_file_name = os.path.splitext(input_file_name)[0] + ".csv"
print(output_file_name)
df_sample_data.to_csv(output_file_name)

879EO_1464916044500.csv
