In [1]:
import pandas as pd
import os
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from matplotlib.ticker import LinearLocator
import numpy as np
import matplotlib.gridspec as gridspec
from scipy import stats

%matplotlib inline 

## Importing and merging PCR data from COR

In [2]:
# Load the comma-separated COR data export into a DataFrame.
cor_data = pd.read_csv(
    '../Data/Input/synthetic_COR_data.csv',
    sep=',',
)

In [3]:
# Keep one row per 'Accession #'; when a value occurs more than once,
# the last occurrence is the one retained.
cor_data = cor_data.drop_duplicates(
    subset='Accession #',
    keep='last',
)

In [4]:
# Prefix every COR column with 'floq_' so the floq values can be distinguished
# from the 'evalyn_'-prefixed copy once both are merged into one frame. The
# prefixed accession column is then renamed to the floq merge key.
floq_data = (
    cor_data
    .copy()
    .add_prefix('floq_')
    .rename(columns={'floq_Accession #': 'floq requisition number'})
)

In [5]:
# Second copy of the COR data, this time with an 'evalyn_' prefix on every
# column; the prefixed accession column becomes the evalyn merge key.
group1_evalyn_data = (
    cor_data
    .copy()
    .add_prefix('evalyn_')
    .rename(columns={'evalyn_Accession #': 'evalyn requisition number'})
)

## Importing group 1 masterark which contains the link between evalyn and floq/emily IDs.

In [6]:
# Load the master sheet for study group 1 (evalyn vs floqswab), which holds
# the requisition numbers together with the evalyn identities.
group1_masterark = pd.read_csv(
    '../Data/Input/synthetic_mastersheet.csv',
    sep=',',
)

# Normalise the casing of the two ID columns: 'Spørgeskema ID' to lower case
# and 'Evalyn ID' to upper case. Values that are not strings are left untouched.
group1_masterark['Spørgeskema ID'] = group1_masterark['Spørgeskema ID'].apply(
    lambda value: value.lower() if isinstance(value, str) else value
)
group1_masterark['Evalyn ID'] = group1_masterark['Evalyn ID'].apply(
    lambda value: value.upper() if isinstance(value, str) else value
)

In [7]:
# Give the 'Spørgeskema ID' column the same name as the floq merge key.
group1_masterark = group1_masterark.rename(
    columns={'Spørgeskema ID': 'floq requisition number'},
)

## Merging data from the laboratory, which contains the link between the evalyn ID and the requisition number of the COR analysis.

In [8]:
# Load the laboratory file and keep only the last row for each 'Rekv.Nr.'.
molab = pd.read_csv(
    '../Data/Input/synthetic_molab_file.csv',
    sep=',',
).drop_duplicates(subset='Rekv.Nr.', keep='last')

# Upper-case the string entries of 'Klinisk Info'; non-string values pass through unchanged.
molab['Klinisk Info'] = molab['Klinisk Info'].apply(
    lambda info: info.upper() if isinstance(info, str) else info
)

# Rename the two linking columns to the names used as merge keys further down.
evalyn_rek_numbers = molab.rename(
    columns={
        'Klinisk Info': 'Evalyn ID',
        'Rekv.Nr.': 'evalyn requisition number',
    }
)

## Importing questionnaire data from group 1 which contains the date of sampling and requisition number from floq

In [9]:
# Load the group 1 questionnaire export (comma-separated).
group1_questionaire_data = pd.read_csv(
    '../Data/Input/synthetic_questionaire.csv',
    sep=',',
)

In [10]:
# Rename 'Spørgeskema ID' to the floq merge key used by the other frames.
group1_questionaire_data = group1_questionaire_data.rename(
    columns={'Spørgeskema ID': 'floq requisition number'},
)

In [11]:
# Restrict the questionnaire frame to the merge key plus the two
# questionnaire columns that are carried forward.
questionaire_columns_to_keep = [
    'floq requisition number',
    'Dato for prøvetagning',
    'Hvilken børste tog du først?',
]
group1_questionaire_data = group1_questionaire_data[questionaire_columns_to_keep]

## Group 1: Merging COR data with the masterark and laboratory data in order to link evalyn and floq data

In [12]:
# Step 1: attach the floq-prefixed COR results to the master sheet.
# The inner join keeps only requisition numbers present in both frames.
first_step = pd.merge(
    group1_masterark,
    floq_data,
    how='inner',
    on=['floq requisition number'],
)

In [13]:
# Step 2: bring in the laboratory rows via 'Evalyn ID', which adds the
# evalyn requisition number to every matched row (inner join).
second_step = pd.merge(
    first_step,
    evalyn_rek_numbers,
    how='inner',
    on=['Evalyn ID'],
)

In [14]:
# Step 3: attach the evalyn-prefixed COR results through the evalyn
# requisition number, so each row carries both floq_* and evalyn_* columns.
final_group_1 = pd.merge(
    second_step,
    group1_evalyn_data,
    how='inner',
    on=['evalyn requisition number'],
)

## Merging central dataframe with questionnaire data in order to include date of sampling. Subsequently, relevant columns are converted to datetime datatype and the time difference between sampling and analysis is registered in the delta time column

### Group 1

In [15]:
# Swap '-' for '/' in the floq result timestamps. Note that this writes back
# into final_group_1 itself, not only into the merged frame created below.
result_time_column = 'floq_Test Result Time'
final_group_1[result_time_column] = final_group_1[result_time_column].str.replace('-', '/')

# Add the questionnaire columns (inner join on the floq requisition number).
group1_fourth_step = pd.merge(
    final_group_1.copy(),
    group1_questionaire_data,
    how='inner',
    on=['floq requisition number'],
)

In [16]:
# Parse the floq result timestamps into datetimes.
# NOTE(review): format='mixed' lets pandas infer the format of each value
# individually -- confirm that the day/month order in the source is unambiguous.
group1_fourth_step['time of analysis'] = pd.to_datetime(
    group1_fourth_step['floq_Test Result Time'],
    format='mixed',
)

In [17]:
# Parse the questionnaire sampling date, given as day-month-two-digit-year.
# errors='coerce' turns values that do not match the format into NaT
# instead of raising.
group1_fourth_step['time of testing'] = pd.to_datetime(
    group1_fourth_step['Dato for prøvetagning'],
    format='%d-%m-%y',
    errors='coerce',
)

In [18]:
# Whole days elapsed between sampling and analysis (analysis minus sampling).
sampling_to_analysis = (
    group1_fourth_step['time of analysis'] - group1_fourth_step['time of testing']
)
group1_fourth_step['delta time'] = sampling_to_analysis.dt.days

## Creating contingency tables of evalyn vs floq outcomes: for all tests, for evalyn as first sample, and for floqswab as first sample.

In [19]:
def create_contingency(relevant_column_1, relevant_column_2, title):
    contingency_table = pd.crosstab(relevant_column_1, relevant_column_2, margins = True)
    contingency_table = contingency_table.map(lambda x: f'{x} ({round((x / len(evalyn_first)*100),2)} %)')
    contingency_table.to_csv(f'Output/{title}.csv')

In [20]:
# Count table of floq result (rows) against evalyn result (columns) for all
# of group 1, with 'All' margins holding the row/column totals.
contingency = pd.crosstab(
    index=final_group_1['floq_HPV HR Result'],
    columns=final_group_1['evalyn_HPV HR Result'],
    margins=True,
)

In [21]:
# Render each count as "<count> (<percent> %)", where the percentage is taken
# relative to the number of rows in final_group_1.
# NOTE(review): this overwrites the count table with strings, so running the
# cell a second time without re-running the crosstab cell will fail.
group1_row_count = len(final_group_1)
contingency = contingency.map(
    lambda count: f'{count} ({round((count / group1_row_count * 100), 2)} %)'
)

In [22]:
# Show the formatted contingency table as this cell's output.
contingency

evalyn_HPV HR Result,Neg,Pos,All
floq_HPV HR Result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Neg,45 (45.0 %),24 (24.0 %),69 (69.0 %)
Pos,22 (22.0 %),9 (9.0 %),31 (31.0 %)
All,67 (67.0 %),33 (33.0 %),100 (100.0 %)
