In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the group-level T1w table (tab-separated) produced under the MRIQC
# derivatives directory; the first column identifies the scan and the
# remaining columns are treated as metrics by the cells below.
df = pd.read_csv(
    '/fs5/p_masi/kimm58/MRIQC_experiments/WRAP_anat/derivatives/group_T1w.tsv',
    sep='\t',
)

In [3]:
# Flag IQR outliers metric by metric: a value counts as an outlier when it
# lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR of its own column.
# outlier_df starts as a copy of df; every column after the first is then
# overwritten with 1 where the scan is an outlier for that metric, else 0.
outlier_df = df.copy()
for metric in df.columns[1:]:
    values = df[metric]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    too_low = values.lt(q1 - whisker)
    too_high = values.gt(q3 + whisker)
    outlier_df[metric] = (too_low | too_high).astype(int)


In [4]:
# Count, for every scan, how many metrics flagged it as an outlier, then rank
# the scans from most to fewest flagged metrics.
# The sum is restricted to the metric columns taken from df (everything after
# the first column) so that re-running this cell does not fold an already
# existing 'num_outliers' column into the total and inflate the counts.
metric_columns = df.columns[1:]
outlier_df['num_outliers'] = outlier_df[metric_columns].sum(axis=1)
outlier_df = outlier_df.sort_values(by='num_outliers', ascending=False)

In [28]:
# Report how many scans reach each outlier-count threshold
# (at least 5, 10, 15 and 20 flagged metrics).
for threshold in [5, 10, 15, 20]:
    n_scans = len(outlier_df[outlier_df['num_outliers'] >= threshold])
    print(f'Number of scans with at least {threshold} outliers: {n_scans}')

Number of scans with at least 5 outliers: 234
Number of scans with at least 10 outliers: 38
Number of scans with at least 15 outliers: 5
Number of scans with at least 20 outliers: 2


In [34]:
# Load the manual QA table for the T1w scans.
qa_df = pd.read_csv('/fs5/p_masi/kimm58/MRIQC_experiments/QA_root/WRAP/T1w/QA.csv')

# Map each flagged QA status ('maybe', 'no') to the list of file names
# rebuilt from the row's sub / ses / acq / run fields.
qa_dict = {}
for status in ['maybe', 'no']:
    flagged_rows = qa_df.loc[qa_df['QA_status'] == status]
    names = []
    for _, row in flagged_rows.iterrows():
        sub, ses = row['sub'], row['ses']
        acq, run = row['acq'], row['run']
        # acq and run are optional: a missing value contributes nothing to the name
        acqx = '' if pd.isna(acq) else '_' + acq
        runx = '' if pd.isna(run) else '_' + run
        names.append(f"{sub}_{ses}{acqx}{runx}_T1w")
    qa_dict[status] = names

# Map each outlier-count threshold to the bids_name values of the scans
# that have at least that many flagged metrics.
outlier_dict = {
    threshold: outlier_df.loc[outlier_df['num_outliers'] >= threshold, 'bids_name'].tolist()
    for threshold in [5, 10, 15, 20]
}


In [36]:
# For each manual QA status ('maybe' and 'no'), report how many of the flagged
# files also appear in the outlier list at every threshold.
for status in ['maybe', 'no']:
    print(f'Number of {status} files: {len(qa_dict[status])}')
    for num_outliers in [5,10,15,20]:
        # Build a set once per threshold so each membership test is a hash
        # lookup instead of a linear scan of the list.
        outlier_names = set(outlier_dict[num_outliers])
        count = sum(file in outlier_names for file in qa_dict[status])
        print(f'Number of {status} files in the {num_outliers} outliers list: {count}')

Number of maybe files: 39
Number of maybe files in the 5 outliers list: 8
Number of maybe files in the 10 outliers list: 2
Number of maybe files in the 15 outliers list: 0
Number of maybe files in the 20 outliers list: 0
Number of no files: 2
Number of no files in the 5 outliers list: 2
Number of no files in the 10 outliers list: 2
Number of no files in the 15 outliers list: 1
Number of no files in the 20 outliers list: 1
