# Extract Raw report data from ObserveRTC

The output is a series of CSV files, one for each client type (user ID), written to `output_folder/rawReport/`.

In [1]:
import sys
from dotenv import load_dotenv
from pymongo import MongoClient
from datetime import datetime, timedelta
import pandas as pd
import os
import json
import logging


# Send log records to stdout, prefixed with a timestamp and a padded level name.
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s ',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Folder that receives the exported CSV files.
outputFolder = "output_folder/rawReport/"

# Create the folder (including parents) on first use. Attempting the creation
# and handling "already exists" avoids the gap between a separate existence
# check and the makedirs call.
try:
    os.makedirs(outputFolder)
except FileExistsError:
    pass
else:
    logging.info(f'The directory "{outputFolder}" is created!')

In [2]:
# Make MONGO_USER, MONGO_PASSWORD and MONGO_PORT available through
# os.getenv (python-dotenv loads them from a .env file when one is present).
load_dotenv()

mongoUser = os.getenv('MONGO_USER')
mongoPassword = os.getenv('MONGO_PASSWORD')
mongoPort = os.getenv('MONGO_PORT')

# Fail early with a readable message instead of building a URI such as
# "mongodb://None:None@localhost:None" out of unset variables.
missingSettings = [
    name
    for name, value in (
        ('MONGO_USER', mongoUser),
        ('MONGO_PASSWORD', mongoPassword),
        ('MONGO_PORT', mongoPort),
    )
    if value is None
]
if missingSettings:
    raise RuntimeError(
        "Missing MongoDB settings in the environment/.env file: "
        + ", ".join(missingSettings)
    )

# The URI carries only host and port. The credentials are handed to
# MongoClient as keyword arguments so that characters such as '@', ':' or '/'
# in the user name or password cannot corrupt the URI, and so that the
# password is not embedded in `address`.
address = 'mongodb://{host}:{port}'.format(
    host="localhost",
    port=mongoPort,
)
client = MongoClient(address, username=mongoUser, password=mongoPassword)

# Collection that holds the ObserveRTC report documents queried below.
database = client["observertc-reports"]
reportsDatabase = database.reports


# User IDs to export. Every ID has the form "<prefix><index>-<label>": the same
# six labels, numbered from 1, appear once under the "c" prefix and once under
# the "d" prefix.
_userIdLabels = (
    "Normal",
    "TorNormal",
    "TorEurope",
    "TorScandinavia",
    "I2P",
    "Lokinet",
)

userIds = [
    f"{prefix}{position}-{label}"
    for prefix in ("c", "d")
    for position, label in enumerate(_userIdLabels, start=1)
]

# Fields copied verbatim from record["payload"] into every CSV row.
_META_FIELDS = ("timestamp", "callId", "roomId", "clientId", "userId", "sampleSeq")

# Identifier columns that are written to the CSV as plain strings.
_ID_COLUMNS = ("callId", "roomId", "clientId", "userId")


def _flatten_record(record):
    """Return one flat row (dict) built from a single report document.

    The row starts with the fields listed in ``_META_FIELDS``, copied from
    ``record["payload"]``. It is then extended with every entry of the
    ``"stats"`` object found in the JSON string stored under
    ``record["payload"]["payload"]``. If a stats key has the same name as one
    of the copied fields, the stats value replaces it.

    Raises:
        KeyError: if an expected field or the ``"stats"`` object is missing.
        json.JSONDecodeError: if the nested payload is not valid JSON.
    """
    payload = record["payload"]
    row = {field: payload[field] for field in _META_FIELDS}
    row.update(json.loads(payload["payload"])["stats"])
    return row


# Export one CSV file per user ID into outputFolder.
for userId in userIds:
    logging.info(f"Starting query for user: {userId}")

    # Only the four RTC extension report types of this user are exported.
    query = {
        "type": "CLIENT_EXTENSION_DATA",
        "payload.extensionType": {
            "$in": [
                "OUT_BOUND_RTC",
                "IN_BOUND_RTC",
                "REMOTE_OUT_BOUND_RTC",
                "REMOTE_IN_BOUND_RTC",
            ]
        },
        "payload.userId": userId,
    }

    # Using the cursor as a context manager closes it even when a record
    # fails to parse halfway through the iteration.
    with reportsDatabase.find(query) as cursor:
        logging.info("Got data from database, converting to data frame.")
        dataSet = [_flatten_record(record) for record in cursor]

    logging.info(f"{userId}, Dataset complete, size: {len(dataSet)}, coverting types")

    if not dataSet:
        # Make the skipped export visible instead of silently writing nothing.
        logging.warning(f"{userId}, No records found, no file written")
        continue

    newData = pd.DataFrame(dataSet)
    for column in _ID_COLUMNS:
        newData[column] = newData[column].astype(str)

    logging.info(f"{userId}, Dataset converted, writing to file")
    newData.to_csv(
        os.path.join(outputFolder, userId + ".csv"),
        mode='w',
        header=True,
        index=False,
    )

    

2023-01-26 11:42:25 INFO     Starting query for user: c1-Normal 
2023-01-26 11:42:25 INFO     Got data from database, converting to data frame. 
2023-01-26 11:42:51 INFO     c1-Normal, Dataset complete, size: 508639, coverting types 
2023-01-26 11:43:04 INFO     c1-Normal, Dataset converted, writing to file 
2023-01-26 11:43:17 INFO     Starting query for user: c2-TorNormal 
2023-01-26 11:43:17 INFO     Got data from database, converting to data frame. 
2023-01-26 11:43:42 INFO     c2-TorNormal, Dataset complete, size: 417036, coverting types 
2023-01-26 11:43:54 INFO     c2-TorNormal, Dataset converted, writing to file 
2023-01-26 11:44:05 INFO     Starting query for user: c3-TorEurope 
2023-01-26 11:44:05 INFO     Got data from database, converting to data frame. 
2023-01-26 11:44:30 INFO     c3-TorEurope, Dataset complete, size: 414471, coverting types 
2023-01-26 11:44:37 INFO     c3-TorEurope, Dataset converted, writing to file 
2023-01-26 11:44:48 INFO     Starting query for user