In [None]:
# Install the required packages into the environment of the current Jupyter
# kernel. `{sys.executable} -m pip` runs pip with the same interpreter that
# executes this notebook.
import sys
# Remove the installed pandas first, then install the pinned version 1.0.3.
!{sys.executable} -m pip uninstall -y pandas
!{sys.executable} -m pip install pandas==1.0.3
# pandas_profiling and numpy are installed without a version pin.
!{sys.executable} -m pip install pandas_profiling
!{sys.executable} -m pip install numpy

In [4]:
from qmenta.core.platform import Auth, post, parse_response
from getpass import getpass
import pandas as pd

import numpy as np
import utils
import utils_cleaning
import datetime
import pandas_profiling
import json

import os
#sys.path.append("../..")

import querries_qmenta_temp2
from pandas_profiling import ProfileReport

In [2]:
# Reload the already imported querries_qmenta_temp2 module so that the
# current state of its source file is used without restarting the kernel.
import importlib
importlib.reload(querries_qmenta_temp2)

<module 'querries_qmenta_temp2' from '/home/jovyan/data-wrangling-MS/data_quality/scripts/querries_qmenta_temp2.py'>

## Fetch Data from Registries

In [5]:
# project id that is passed to get_subjects_data() for the registry data
project_id_reg = 3202

# base url to connect to the central platform
base_url = "https://platform.qmenta.com"
# The account name (email) is personal data and must not be stored in the
# notebook: it is read from the QMENTA_USERNAME environment variable and, if
# that variable is unset or empty, asked for interactively.
username = os.environ.get("QMENTA_USERNAME") or input("QMENTA username (email): ")
# you will be asked for your password here
password = getpass()

# creation of authentication object
auth_obj = Auth.login(username, password, base_url)

# method to fetch the subjects data
def get_subjects_data(project_id):
    """Fetch the patient list of a project and flatten it into plain dicts.

    Posts to ``/patient_manager/get_patient_list`` using the module-level
    ``auth_obj`` and turns every record of the parsed response into a dict
    holding ``id`` (from ``_id``), ``secret_name`` (from
    ``patient_secret_name``) and every ``md_*`` field with the ``md_`` prefix
    stripped from its key.

    Dict-valued fields are read as ``{"$date": <milliseconds>}`` and replaced
    by the corresponding ``datetime.datetime`` (as returned by
    ``datetime.fromtimestamp``, i.e. local time). A dict that cannot be
    converted this way is replaced by ``None``.

    Args:
        project_id: value sent as ``_pid`` in the request payload.

    Returns:
        list of dicts, one per record of the parsed response.
    """
    response = post(auth_obj, "/patient_manager/get_patient_list",
                    {"_pid": project_id},
                    timeout=600.0)

    data_trans = [{
        "id": record["_id"],
        "secret_name": record["patient_secret_name"],
        **{
            key[3:]: record[key]
            for key in record
            if key[:3] == "md_"
        }
    } for record in parse_response(response)]

    for row in data_trans:
        # Only values of existing keys are replaced, so iterating over the
        # dict while assigning is safe (its size never changes).
        for key, value in row.items():
            if isinstance(value, dict):
                try:
                    row[key] = datetime.datetime.fromtimestamp(value["$date"] / 1000.0)
                except (KeyError, TypeError, ValueError, OverflowError, OSError):
                    # Missing "$date", non-numeric or out-of-range timestamp:
                    # keep the best-effort behaviour and store None. Unlike a
                    # bare ``except`` this lets KeyboardInterrupt/SystemExit
                    # propagate.
                    row[key] = None

    return data_trans

 ····················


In [6]:
# Fetch the subject records of the registry project and load the list of
# dicts into a DataFrame (one row per record).
data_reg = get_subjects_data(project_id_reg)
df_raw_reg = pd.DataFrame(data_reg)

## Fetch Data from direct Entry (Forms)

In [7]:
# project id that is passed to get_subjects_data() for the form data
project_id_forms = 3150
data_forms = get_subjects_data(project_id_forms)

In [8]:
# Load the list of form records into a DataFrame (one row per record).
df_raw_forms = pd.DataFrame(data_forms)

In [9]:
# Stack the raw form records and the raw registry records into one frame
# (forms first). ignore_index=True gives the result a fresh 0..n-1 index;
# without it both inputs contribute their own 0-based row labels and the
# merged frame contains duplicate index values.
merged_df = pd.concat([df_raw_forms, df_raw_reg], ignore_index=True)

In [None]:
# Print the column overview (names, non-null counts, dtypes) of the merged frame.
merged_df.info()

## Enhancement of Data

In [10]:
# Enhance and clean data
df_enhanced_all = utils.enhance_registry_data(merged_df.copy())
df_cleaned_all = utils_cleaning.clean_data(df_enhanced_all,auth_obj,project_id_reg, None, send_qa_staus=False) # Set to true to update qa status

In [None]:
# Keep only the cleaned rows whose report_source is "patients" and show the
# names of the available columns.
df = df_cleaned_all.loc[df_cleaned_all["report_source"] == "patients"]
cols = list(df.columns)
cols

## Basic description/counts for PRO data

In [110]:
## counts per country
dfCountry = df.copy()
#dfCountry[["secret_name","covid19_country"]].to_csv('test_country.csv')

# Normalise the free-text country names step by step. Every call states
# `regex` explicitly: the patterns with ^ ... $ are regular expressions and
# must be evaluated as such, while the default of `regex` differs between
# pandas versions.
country = df['covid19_country'].str.upper()
country = country.str.replace(" ", "", regex=False)
# empty strings become the literal text "NaN"
country = country.str.replace("^$", "NaN", regex=True)
country = country.str.replace("^.*DEUTSCH.*$", "GERMANY", regex=True)
country = country.str.replace("_", "", regex=False)
country = country.str.replace("^.*UNITEDKING.*$", "UNITEDKINGDOM", regex=True)
country = country.str.replace("^.*UNITEDSTA.*$", "USA", regex=True)
dfCountry['covid19_country'] = country

# basic stats (count) for each selected variable in PRO data, grouped by country
res = dfCountry.groupby(['covid19_country']).count()
res.to_csv('reports/country_counts_general.csv')
#res.to_csv('reports/country_counts_general.txt', index=False, sep='\t')

# basic stats (mean) for each selected variable in PRO data, grouped by country
res2 = dfCountry.groupby(['covid19_country']).mean()

# basic stats (count,min,max) for each selected numerical variable in PRO data, grouped by country
res3 = dfCountry.groupby(['covid19_country']).describe()
res3.to_csv('reports/datastats_per_country2.csv')


In [111]:
# counts per country by covid infections (raw)

dfInfec = dfCountry.copy()
# replace secret_name by a running number 0 .. len(df)-1
dfInfec["secret_name"] = np.arange(len(df))

# raw self-reported status, taken directly from the answer columns
confirmed = dfInfec.loc[dfInfec['covid19_confirmed_case'].eq('yes')]
suspected_raw = dfInfec.loc[dfInfec['covid19_suspected_case'].eq('yes')]
noInfec = dfInfec.loc[
    dfInfec['covid19_confirmed_case'].eq('no')
    & dfInfec['covid19_suspected_case'].eq('no')
]
print(
    f"Total patient reported data count: {len(dfInfec)},\n"
    f"Confirmed cases (raw): {len(confirmed)},\n"
    f"Suspected cases (raw): {len(suspected_raw)},\n"
    f"no infection (raw): {len(noInfec)}"
)


Total patient reported data count: 3337,
Confirmed cases (raw): 41,
Suspected cases (raw): 96,
no infection (raw): 1981


## Definition of the covid19_diagnosis criteria (confirmed, suspected, not_suspected, missing)

In [113]:
# NOTE: dfCovid is an alias of dfInfec (no copy is made), so the new
# covid19_diagnosis column is added to dfInfec as well.
dfCovid = dfInfec
dfCovid["covid19_diagnosis"] = "missing"

# True where at least one of fever, dry cough or loss of smell/taste was
# answered with "yes".
key_symptom = (
    (dfCovid.covid19_sympt_fever == "yes")
    | (dfCovid.covid19_sympt_dry_cough == "yes")
    | (dfCovid.covid19_sympt_loss_smell_taste == "yes")
)

# Rule 1: self-reported suspected case with at least one key symptom.
dfCovid.loc[(dfCovid.covid19_suspected_case == "yes") & key_symptom, "covid19_diagnosis"] = "suspected"

# Rule 2: at least one key symptom, regardless of the self-reported status.
# NOTE(review): the original condition was
#     key_symptom & (key_symptom | shortness_breath | pneumonia | fatigue
#                    | pain | nasal_congestion | chills | sore_throat)
# which is logically identical to key_symptom alone, so the additional
# symptoms never influenced the result (and rule 1 is a subset of rule 2).
# The behaviour is kept as it was; confirm against the intended case
# definition whether the additional symptoms were meant to be required.
dfCovid.loc[key_symptom, "covid19_diagnosis"] = "suspected"

# A confirmed case overrides "suspected".
dfCovid.loc[dfCovid.covid19_confirmed_case == "yes", "covid19_diagnosis"] = "confirmed"

# Explicit "no" to both questions overrides everything assigned above.
dfCovid.loc[
    (dfCovid.covid19_confirmed_case == "no") & (dfCovid.covid19_suspected_case == "no"),
    "covid19_diagnosis",
] = "not_suspected"

# control of counts of infection distribution
print(*(
    (dfCovid["covid19_diagnosis"] == status).sum()
    for status in ("confirmed", "suspected", "not_suspected", "missing")
))

#dfCovid.covid19_country[dfCovid["covid19_diagnosis"]=="confirmed"].to_csv('reports/rows_confirmed.csv')
#dfCovid.covid19_country[dfCovid["covid19_diagnosis"]=="suspected"].to_csv('reports/rows_suspected.csv')

41 311 1981 1004


In [None]:
#dfCovid.groupby("covid19_diagnosis").count()["secret_name"].to_csv(f"./reports/patients_infection_counts.csv")

# One Series per diagnosis value: the secret_name count per country among the
# rows carrying that diagnosis. Order: confirmed, suspected, not_suspected,
# missing.
dfInfStatus = [
    dfCovid[dfCovid.covid19_diagnosis == status]
    .groupby("covid19_country")
    .count()["secret_name"]
    for status in ("confirmed", "suspected", "not_suspected", "missing")
]
dfConf, dfSusp, dfNotSusp, dfMiss = dfInfStatus
#print(dfConf,dfSusp)

# dfConf.to_csv('reports/cases_confirmed_per_country.csv')
# dfSusp.to_csv('reports/cases_suspected_per_country.csv')
# dfNotSusp.to_csv('reports/cases_not_suspected_per_country.csv')
# dfMiss.to_csv('reports/cases_missing_per_country.csv')


In [12]:
# Summary statistics (describe) of the enhanced patient-reported rows,
# written to a CSV report.
dfP_enhanced = df_enhanced_all.loc[df_enhanced_all["report_source"] == "patients"]
descr_enhanced = dfP_enhanced.describe()
descr_enhanced.to_csv('reports/descr_dfP_enhanced.csv')

In [13]:
# Description of numeric data of patient data (cleaned)
dfP_cleaned = df_cleaned_all[df_cleaned_all["report_source"]=="patients"]
# The index of describe() holds the names of the statistics (count, mean,
# std, ...). It has to be written to the file, otherwise the rows of the CSV
# are unlabeled; this also matches the export of the enhanced data.
dfP_cleaned.describe().to_csv('reports/descr_dfP_cleaned.csv')