In [None]:
import os
import yaml
import warnings
import pandas as pd
from datetime import datetime, timedelta
from modules.excel_mod import preprocess_legacy_data_from_excel
from modules.transform_Georgia_mod import transform_column_names
from modules.transform_Georgia_mod import preprocess_Georgia_data
from modules.transform_Georgia_mod import create_patient_condition_table

# Notebook-wide setup: silence all warnings and lift pandas' row/column
# display limits so displayed frames are never truncated.
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', None, 'display.max_columns', None)

In [None]:
# Load the notebook configuration from the YAML file
with open('../config/config.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# The Excel workbook location comes from the config. It is kept both as a
# one-element list (the form passed to the extraction helper) and as a plain path.
file_paths = [config['Georgia_2250_file_path']]
file_path = file_paths[0]

In [None]:
# Parameters identifying which sheet of the legacy Excel workbook to extract into csv form
file_number = '2250'
month = 'november'
target_sheet = 'November 2100-2250'

# The extraction only has to run once to create the csv files, so it is opt-in:
# set the flag to True for that first run, then switch it back to False.
RUN_EXCEL_EXTRACTION = False
if RUN_EXCEL_EXTRACTION:
    preprocess_legacy_data_from_excel(file_paths, file_number, month, target_sheet)

In [None]:
# Raw November data to process (presumably the csv produced by the Excel extraction step — confirm)
df_november = pd.read_csv('../countries/Georgia/data/raw/Georgia_legacy_2250_1_november.csv')

# CaseBrowser production export, used here as the source of registration dates.
# The export comes from a query kept in the sql_queries folder
# (TODO: name the exact query file; the original note did not say which one).
# Clean the exported csv first so that only the relevant columns remain.
CaseBrowser_prod_data = pd.read_csv('../countries/Georgia/data/raw/Casebrowser_prod_data_1-8-24.csv')
registrationdate = CaseBrowser_prod_data.loc[:, ['patient_local_identifier', 'registrationdate']]

In [None]:
# Preview the first 20 raw rows (e.g. to decide which leading rows to drop in the next step)
df_november.head(20)

In [None]:
# Rows of the raw csv to drop, passed as the third argument; adjust after inspecting the file contents.
# NOTE(review): the original note says the dataframe can be passed alone when there are
# no rows to drop — confirm against the signature of preprocess_Georgia_data.
rows_to_drop = [0,1,2]
preprocess_Georgia_data(df_november, registrationdate, rows_to_drop, file_number, month)

In [None]:
# Iterate through the csv files in the intermediate directory and transform the column names
# Then output the results into the processed directory
interm_dir = '../countries/Georgia/data/intermediate'

# A list of all the csv files in the intermediate directory
csv_files = [file_name for file_name in os.listdir(interm_dir) if file_name.endswith('.csv')]

# Last-write time of each csv, read once per file.
# os.path.getmtime is used rather than getctime: ctime means "last metadata
# change" on Unix and "creation" on Windows, where it is not refreshed when an
# existing file is overwritten, so files regenerated today could be skipped.
modified_at = {
    file_name: os.path.getmtime(os.path.join(interm_dir, file_name))
    for file_name in csv_files
}

# Keep only the files written today
today = datetime.now().date()
recent_csv_files = [
    file_name for file_name in csv_files
    if datetime.fromtimestamp(modified_at[file_name]).date() == today
]

# Newest first
recent_csv_files.sort(key=modified_at.get, reverse=True)

# Process every file written today
if recent_csv_files:
    for csv_file in recent_csv_files:
        # Distinct name so the Excel `file_path` loaded from the config is not overwritten
        interm_csv_path = os.path.join(interm_dir, csv_file)
        transform_column_names(interm_csv_path)
else:
    print('No recently created files to process.')

#### Create the patient and condition table

* If the patient table in the database does not already contain the identifiers found in the processed data, then the patient data has to be added. 

* The same goes for the condition table

In [None]:
# Use this function to save the dataframes into the processed directory for the patient and condition tables
def gen_save(dataframes, month, data_folder='../countries/Georgia/data/processed/'):
    """Write each dataframe to ``data_folder`` as a csv file.

    Files are named ``Georgia_<position>_<month>[_<label>].csv``, where
    ``position`` is the 1-based position of the dataframe in ``dataframes``.
    Position 1 gets the label ``patient`` and position 2 the label
    ``condition``; any further dataframe is written without a label.

    Parameters
    ----------
    dataframes : iterable of pandas.DataFrame
        Dataframes to save, in the order patient, condition.
    month : str
        Month name embedded in every file name.
    data_folder : str, optional
        Output directory. Defaults to the Georgia processed-data directory
        and is created if it does not exist yet.

    Returns
    -------
    None
    """
    index_labels = {1: 'patient', 2: 'condition'}

    # Create the output directory up front so a fresh checkout does not fail
    # on the first write. An empty string means "current directory".
    if data_folder:
        os.makedirs(data_folder, exist_ok=True)

    # Loop through the dataframes list and save them
    for idx, df in enumerate(dataframes, start=1):
        label = index_labels.get(idx)
        # Only the positions listed in index_labels carry a label suffix
        suffix = f'_{label}' if label else ''
        csv_file_path = os.path.join(data_folder, f'Georgia_{idx}_{month}{suffix}.csv')
        df.to_csv(csv_file_path, index=False)

In [None]:
# Load the processed November specimen table
processed_november = pd.read_csv('../countries/Georgia/data/processed/Georgia_2250_1_november_specimen.csv')

In [None]:
# Build the patient and condition tables from the processed specimen data and the CaseBrowser export.
# The result is indexed below as [0] = patient table, [1] = condition table.
patient_condition_november = create_patient_condition_table(processed_november, CaseBrowser_prod_data)

In [None]:
# Preview the first element of the returned pair: the patient table
patient_condition_november[0].head(10) # *patient table*

In [None]:
# Preview the second element of the returned pair: the condition table
patient_condition_november[1].head(10) # *condition table*

In [None]:
# Give the two elements of the result their own names: [0] is the patient table, [1] the condition table
november_patient, november_condition = patient_condition_november[0], patient_condition_november[1]

In [None]:
# List of dataframes to save.
# Order matters: gen_save labels position 1 as 'patient' and position 2 as 'condition'.
dataframes = [november_patient, november_condition]

In [None]:
# Write the patient and condition tables to the processed directory
# (file names follow gen_save's Georgia_<position>_<month>_<label>.csv pattern)
gen_save(dataframes, month)

In [None]:
# Reload the tables written by gen_save, which names its files
# Georgia_<position>_<month>_<label>.csv (position 1 = patient, 2 = condition).
patients = pd.read_csv('../countries/Georgia/data/processed/Georgia_1_november_patient.csv')
condition = pd.read_csv('../countries/Georgia/data/processed/Georgia_2_november_condition.csv')

In [None]:
# Preview the first 20 rows of the reloaded patient table
patients.head(20)

In [None]:
# Preview the first 20 rows of the reloaded condition table
condition.head(20)

#### Check files for duplicates

In [None]:
# TODO: Create a function that checks for duplicates

In [None]:
# Patient rows that repeat an earlier row on gender, managing organization and identifier
patients.loc[patients.duplicated(subset=['gender', 'managingorganizationid', 'identifier'])]

In [None]:
# Condition rows that repeat an earlier row on identifier, case definition, age at onset and registration date
condition.loc[condition.duplicated(subset=['identifier', 'casedefinition', 'ageonset', 'registrationdate'])]

In [None]:
# Load the processed specimen (df_1), culture (df_2) and microscopy (df_3) tables.
# Paths carry the same '../countries/' prefix as every other read/write in this notebook.
df_1 = pd.read_csv('../countries/Georgia/data/processed/Georgia_2250_1_november_specimen.csv')
df_2 = pd.read_csv('../countries/Georgia/data/processed/Georgia_2250_2_november_culture.csv')
df_3 = pd.read_csv('../countries/Georgia/data/processed/Georgia_2250_3_november_microscopy.csv')

In [None]:
# Specimen rows that repeat an earlier row on the listed key columns
df_1.loc[df_1.duplicated(subset=['identifier', 'registrationdate', 'containeridentifier', 'collected', 'bodysite'])]

In [None]:
# Culture rows that repeat an earlier row on the listed key columns
df_2.loc[df_2.duplicated(subset=['identifier', 'registrationdate', 'containeridentifier', 'issued', 'value', 'culturetype'])]

In [None]:
# Microscopy rows that repeat an earlier row on the listed key columns
df_3.loc[df_3.duplicated(subset=['identifier', 'registrationdate', 'containeridentifier', 'issued', 'value', 'microscopytype'])]

In [None]:
# Drop duplicates from files, keeping the first occurrence of each key combination.
# The frames are rebound rather than mutated with inplace=True, which pandas discourages.
df_1 = df_1.drop_duplicates(subset=['identifier', 'registrationdate', 'containeridentifier', 'collected', 'bodysite'])
df_2 = df_2.drop_duplicates(subset=['identifier', 'registrationdate', 'containeridentifier', 'issued', 'value', 'culturetype'])
df_3 = df_3.drop_duplicates(subset=['identifier', 'registrationdate', 'containeridentifier', 'issued', 'value', 'microscopytype'])

In [None]:
# Save the new dataframes (df_1, df_2, df_3 after dropping duplicates) to the processed directory.
# index=False keeps the pandas row index out of the csv.
df_1.to_csv('../countries/Georgia/data/processed/Georgia_2250_november_drop_dups_1.csv', index=False)
df_2.to_csv('../countries/Georgia/data/processed/Georgia_2250_november_drop_dups_2.csv', index=False)
df_3.to_csv('../countries/Georgia/data/processed/Georgia_2250_november_drop_dups_3.csv', index=False)