# Extract Raw Data from ObserveRTC

The output is a CSV file which will be appened to after each run.

A local file will keep track of when the script was run last and only query data for that given time period.

If there already exists a local data file, then it will load it and remove the latest duplicates. 

And append the newest entries. After this other data processing can happen by reading the CSV file.

In [1]:
import sys
from dotenv import load_dotenv
from pymongo import MongoClient
from datetime import datetime, timedelta
import pandas as pd
import os
import json
import logging


logging.basicConfig(
  format=f'%(asctime)s %(levelname)-8s %(message)s ',
  level=logging.INFO,
  datefmt='%Y-%m-%d %H:%M:%S',
  handlers=[
  logging.StreamHandler(sys.stdout)
  ])

In [2]:
load_dotenv()

filename = "output_folder/reportData.csv"


address = 'mongodb://{user}:{password}@{host}:{port}'.format(
    user= os.getenv('MONGO_USER'),
    password= os.getenv('MONGO_PASSWORD'),
    host= "localhost",
    port= os.getenv('MONGO_PORT')
)
client = MongoClient(address)
database=client["observertc-reports"]
reportsDatabase = database.reports

dateStart = datetime(2022, 12, 22)
dateEnd = datetime(2022, 12, 23)
finalDate = datetime(2023, 1, 25)

while (dateStart < finalDate):
    

    logging.info(f"Starting query for date range {dateStart} to {dateEnd}.")

    queryTimeStart = dateStart.timestamp() * 1000
    queryTimeEnd = dateEnd.timestamp() * 1000

    query = {   "type": "CLIENT_EXTENSION_DATA", 
                "payload.extensionType" : 
                    { "$in" : [ "OUT_BOUND_RTC", 
                                "IN_BOUND_RTC", 
                                "REMOTE_OUT_BOUND_RTC", 
                                "REMOTE_IN_BOUND_RTC"]},
                "payload.userId": ""
                "payload.timestamp" : {"$gt": queryTimeStart},
                "payload.timestamp" : {"$lt": queryTimeEnd}}

    logging.info(f"Query: {query}")

    cursor = reportsDatabase.find(query)
    ##size = len(list(cursor))
    #logging.info(f"Got data from database, size: {size}, converting to data frame.")

    #logPoint = size/10
    #i = 0
    dataSet = []
    for record in cursor:
        
        data = {}
        #append timestamp to data
        data["timestamp"] = record["payload"]["timestamp"]
        data["callId"] = record["payload"]["callId"]
        data["roomId"] = record["payload"]["roomId"]
        data["clientId"] = record["payload"]["clientId"]
        data["userId"] = record["payload"]["userId"]
        data["sampleSeq"] = record["payload"]["sampleSeq"]

        a = json.loads(record["payload"]["payload"])

        # https://stackoverflow.com/questions/38987/how-do-i-merge-two-dictionaries-in-a-single-expression
        data = {**data, **a["stats"]}



        #if(i % logPoint == 0):
        #    logging.info(f"Processed {i}/{size} records.")
        dataSet.append(data)
        

        #i = i+1
    if(len(dataSet) > 0):
        newData = pd.DataFrame(dataSet)
        newData["callId"]=newData["callId"].astype(str)
        newData["roomId"]=newData["roomId"].astype(str)
        newData["clientId"]=newData["clientId"].astype(str)
        newData["userId"]=newData["userId"].astype(str)
        newData.to_csv(filename, mode='w', header=True, index=False)

    dateStart = dateEnd
    # Add a day to the end date
    dateEnd = dateEnd + timedelta(days=1)
    

2023-01-26 11:11:01 INFO     Starting query for date range 2022-12-22 00:00:00 to 2022-12-23 00:00:00. 
2023-01-26 11:11:01 INFO     Query: {'type': 'CLIENT_EXTENSION_DATA', 'payload.extensionType': {'$in': ['OUT_BOUND_RTC', 'IN_BOUND_RTC', 'REMOTE_OUT_BOUND_RTC', 'REMOTE_IN_BOUND_RTC']}, 'payload.timestamp': {'$lt': 1671750000000.0}} 
2023-01-26 11:11:29 INFO     Starting query for date range 2022-12-23 00:00:00 to 2022-12-24 00:00:00. 
2023-01-26 11:11:29 INFO     Query: {'type': 'CLIENT_EXTENSION_DATA', 'payload.extensionType': {'$in': ['OUT_BOUND_RTC', 'IN_BOUND_RTC', 'REMOTE_OUT_BOUND_RTC', 'REMOTE_IN_BOUND_RTC']}, 'payload.timestamp': {'$lt': 1671836400000.0}} 
2023-01-26 11:11:35 INFO     Starting query for date range 2022-12-24 00:00:00 to 2022-12-25 00:00:00. 
2023-01-26 11:11:35 INFO     Query: {'type': 'CLIENT_EXTENSION_DATA', 'payload.extensionType': {'$in': ['OUT_BOUND_RTC', 'IN_BOUND_RTC', 'REMOTE_OUT_BOUND_RTC', 'REMOTE_IN_BOUND_RTC']}, 'payload.timestamp': {'$lt': 16719

KeyboardInterrupt: 