In [None]:
from tqdm import tqdm 
from xml.etree import ElementTree as ET
from datetime import datetime
import re
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
def parse_date(date_str):
    """Parse a trial date string into a ``datetime``.

    Two layouts are tried in order: ``"%B %d, %Y"`` (e.g. "January 15, 2008")
    and ``"%B %Y"`` (e.g. "January 2008"). For the month-only layout the day
    is left at ``strptime``'s default, i.e. the first of the month.

    Note that ``%B`` matches month names of the current locale.

    Args:
        date_str: Date text in one of the two layouts above.

    Returns:
        The parsed ``datetime``.

    Raises:
        ValueError: If ``date_str`` matches neither layout.
        TypeError: If ``date_str`` is not a string.
    """
    # Only ValueError means "wrong layout, try the next one". Anything else
    # (TypeError for a non-string, KeyboardInterrupt, ...) must propagate
    # instead of being swallowed by a bare ``except``.
    for date_format in ("%B %d, %Y", "%B %Y"):
        try:
            return datetime.strptime(date_str, date_format)
        except ValueError:
            continue

    raise ValueError(
        f"Unrecognised date {date_str!r}: expected 'Month DD, YYYY' or 'Month YYYY'"
    )

def calculate_duration(start_date, completion_date):
    """Return the whole number of days between two trial date strings.

    Args:
        start_date: Start date text, in a layout ``parse_date`` accepts.
        completion_date: Completion date text, same layouts.

    Returns:
        ``(completion - start).days`` as an int (negative when completion
        precedes start), or the sentinel ``-1`` when either argument is
        falsy (empty string or ``None``).
    """
    # Guard clause: nothing to compute without both endpoints.
    if not (start_date and completion_date):
        return -1

    began = parse_date(start_date)
    ended = parse_date(completion_date)
    return (ended - began).days

def _first_text(root, *tags):
    """Return the stripped text of the first of ``tags`` that has any, else ''."""
    for tag in tags:
        # findtext() gives None for a missing element and '' for an empty one;
        # normalise both to '' so the caller always gets a str.
        text = (root.findtext(tag) or '').strip()
        if text:
            return text
    return ''


def xmlfile2date(xml_file):
    """Extract the start and completion date strings from a trial XML file.

    Only direct children of the document root are inspected. The completion
    date is taken from ``primary_completion_date`` and falls back to
    ``completion_date`` when the former is missing or empty.

    Args:
        xml_file: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        ``(start_date, completion_date)`` as strings with surrounding
        whitespace removed; a date that cannot be found is ``''``.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    root = ET.parse(xml_file).getroot()

    start_date = _first_text(root, 'start_date')
    completion_date = _first_text(root, 'primary_completion_date', 'completion_date')

    return start_date, completion_date


In [None]:
# Collect one (nct_id, start_date, completion_date, duration_days) tuple per
# XML file named in the listing.
date_list = []

# 480403 lines
with open("../data/trials/all_xml.txt", "r") as file:
    for line in tqdm(file):
        relative_path = line.strip()
        if not relative_path:
            # A blank line in the listing names no file; skip it rather than
            # failing on the ID regex below.
            continue
        xml_path = f"../data/{relative_path}"

        # NCT00000150 <- raw_data/NCT0000xxxx/NCT00000150.xml
        id_match = re.search(r"/([^/]+)\.xml$", xml_path)
        if id_match is None:
            # Fail with the offending entry instead of an opaque
            # "'NoneType' object has no attribute 'group'".
            raise ValueError(f"Listing entry is not an .xml path: {xml_path!r}")
        nct_id = id_match.group(1)

        start_date, completion_date = xmlfile2date(xml_path)

        # calculate_duration() itself returns -1 when either date is missing,
        # so no separate check is needed here.
        duration = calculate_duration(start_date, completion_date)

        date_list.append((nct_id, start_date, completion_date, duration))


In [None]:
# Turn the collected tuples into a table, one row per tuple.
# 478505 lines
date_df = pd.DataFrame(
    data=date_list,
    columns=['ntcid', 'start_date', 'completion_date', 'time_day'],
)
print(date_df)

# date_df.to_csv('data/ntcid_time_all.csv', index=False, sep='\t')

In [None]:
# Filter out unusable time data: keep only strictly positive durations. This
# drops the -1 "missing date" sentinel as well as zero and negative spans.
# 465251 lines
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not raise SettingWithCopyWarning.
date_df = date_df[date_df['time_day'] > 0].copy()
date_df.to_csv('../data/ntcid_time.csv', index=False, sep='\t')

In [None]:
# Analysis: summary statistics of the trial duration column (in days).
# NOTE: the variable is still called `medium_time_day` so that any cell
# referring to it keeps working; it holds the median.
medium_time_day = date_df['time_day'].median()
average_time_day = date_df['time_day'].mean()
minimum_time_day = date_df['time_day'].min()
maximum_time_day = date_df['time_day'].max()
std_time_day = date_df['time_day'].std()

# Label corrected from "Medium" to "Median" to match the statistic shown.
print("Median time_day:", medium_time_day)
print("Average time_day:", average_time_day)
print("Minimum time_day:", minimum_time_day)
print("Maximum time_day:", maximum_time_day)
print("Standard Deviation of time_day:", std_time_day)


In [None]:
# Bucket durations into 30-day "months" (floor division: 1-29 days -> 0).
# assign() builds a new frame instead of setting a column on a filtered
# slice, which avoids SettingWithCopyWarning.
date_df = date_df.assign(month=date_df['time_day'] // 30)

# Number of trials per month bucket, ordered by month. Naming the index and
# the count column explicitly yields columns ['month', 'count'] on both
# pandas 1.x and 2.x; a bare reset_index() gives ['index', 'month'] on 1.x.
frequency_month = (
    date_df['month']
    .value_counts()
    .sort_index()
    .rename_axis('month')
    .reset_index(name='count')
)
print(frequency_month)

In [None]:
# Restrict the plot to month buckets 0 through 72 inclusive.
plot_data = frequency_month.loc[frequency_month['month'] <= 72]

# Scatter of bucket index against number of trials in that bucket.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(plot_data['month'], plot_data['count'], s=10, c='blue')

ax.set_xlabel('Month')
ax.set_ylabel('Frequency')
ax.set_title('Frequency of Months (Limited to 72)')
ax.grid(True)  # Add grid

plt.show()