### Map each subject ID (e.g. LD1104) to the path of its fastq.gz file. Because some subjects have multiple fastq.gz files, we previously concatenated them into a single file per subject, named `<subject id>.fastq.gz`.

In [1]:
import os

In [2]:
# Root directory of the raw data; each entry directly below it is treated as one
# subject's folder (named by subject id) and is searched for fastq.gz files.
path = "/labs/mignot/DGN/DGN_fastq/rawdata/"

In [3]:
# Build {subject folder name -> list of raw fastq.gz paths found in that folder}.
# Only files whose name contains "Levinson" and ends with "fastq.gz" are collected,
# so a combined "<subject>.fastq.gz" file in the same folder is not picked up here.
folder_name_file_path_dict = dict()
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the mapping
# (and everything derived from it) reproducible between runs.
for folder_name in sorted(os.listdir(path)):
    subject_path = os.path.join(path, folder_name)
    if not os.path.isdir(subject_path):
        # A stray regular file next to the subject folders would make the inner
        # os.listdir raise NotADirectoryError.
        continue
    # Sorted so that, for subjects with several files, the order in which the files
    # would be concatenated is deterministic.
    for file_name in sorted(os.listdir(subject_path)):
        if "Levinson" in file_name and file_name.endswith("fastq.gz"):
            file_path = os.path.join(subject_path, file_name)
            # a subject might have multiple fastq.gz files, hence a list per subject
            folder_name_file_path_dict.setdefault(folder_name, []).append(file_path)

In [4]:
# For every subject that has more than one fastq.gz file, replace its list of files
# with a one-element list holding the combined file "<subject dir>/<subject>.fastq.gz".
# The code that writes and submits the `cat` job is commented out below, so this cell
# only updates the mapping and counts the affected subjects.
count = 0
for folder_name, file_lst in folder_name_file_path_dict.items():
    if len(file_lst) <= 1:
        continue  # single-file subjects need no combining
    combined_fastq_path = os.path.dirname(file_lst[0]) + "/{}.fastq.gz".format(folder_name)
    # Shell command that concatenates the subject's files into the combined file.
    command = "cat " + " ".join(file_lst) + " > " + combined_fastq_path
    #with open("{}_combine.sh".format(folder_name), "w") as f:
    #    f.write("#!/bin/bash\n#SBATCH --job-name=default\n#SBATCH --nodes=1\n#SBATCH --ntasks=1\n#SBATCH --cpus-per-task=1\n#SBATCH --partition=batch\n#SBATCH --account=mignot\n#SBATCH --time=1:00:00\n")
    #    f.write(command)
    script = "{}_combine.sh".format(folder_name)
    #!sbatch {script}
    # Assigning to an existing key does not resize the dict, so this is safe while iterating.
    folder_name_file_path_dict[folder_name] = [combined_fastq_path]
    count += 1
count

169

In [5]:
# Collapse each one-element list in the mapping to a bare path string.
#
# The check runs BEFORE any value is replaced, so a failed assertion never leaves the
# dict half-converted. Values that are already strings are left alone, which makes the
# cell safe to re-run: indexing an already-flattened path with [0] would otherwise
# silently replace every path with its first character ("/").
count = 0
for folder_name in folder_name_file_path_dict:
    file_lst = folder_name_file_path_dict[folder_name]
    if isinstance(file_lst, list) and len(file_lst) > 1:
        count += 1
assert count == 0, "{} subject(s) still map to more than one fastq.gz file".format(count)

for folder_name in folder_name_file_path_dict:
    file_lst = folder_name_file_path_dict[folder_name]
    if isinstance(file_lst, list):
        # since we got rid of the multiple files, no need to have a list structure
        folder_name_file_path_dict[folder_name] = file_lst[0]

In [6]:
# Display the finished mapping: subject id -> path of that subject's fastq.gz file.
folder_name_file_path_dict

{'LD0141': '/labs/mignot/DGN/DGN_fastq/rawdata/LD0141/Levinson_Library_Pool_240_ACTTGA_ACTTGA_L007.fastq.gz',
 'LD0366': '/labs/mignot/DGN/DGN_fastq/rawdata/LD0366/Levinson_Library_Pool_041_TGACCA_TGACCA_L008.fastq.gz',
 'LD0954': '/labs/mignot/DGN/DGN_fastq/rawdata/LD0954/batch3.Levinson_Library_Pool_183_CAGATC_CAGATC_L006.fastq.gz',
 'LD0652': '/labs/mignot/DGN/DGN_fastq/rawdata/LD0652/batch3.Levinson_Library_Pool_110_ACAGTG_L001.fastq.gz',
 'LD0847': '/labs/mignot/DGN/DGN_fastq/rawdata/LD0847/Levinson_Library_Pool_157_TAGCTT_L005.fastq.gz',
 'LD0979': '/labs/mignot/DGN/DGN_fastq/rawdata/LD0979/batch3.Levinson_Library_Pool_183_GATCAG_GATCAG_L006.fastq.gz',
 'LD0327': '/labs/mignot/DGN/DGN_fastq/rawdata/LD0327/Levinson_Library_Pool_030_F04_CTTGTA_CTTGTA_L006.fastq.gz',
 'LD0278': '/labs/mignot/DGN/DGN_fastq/rawdata/LD0278/Levinson_Library_Pool_022_F03_ACTTGA_ACTTGA_L006.fastq.gz',
 'LD0599': '/labs/mignot/DGN/DGN_fastq/rawdata/LD0599/Levinson_Library_Pool_095_CTTGTA_CTTGTA_L007.fastq.

## To find covariates, read the first header line of each subject's fastq file and extract the flowcell ID and lane

In [16]:
import gzip

In [None]:
"""
EAS139	the unique instrument name
136	the run id
FC706VJ	the flowcell id
2	flowcell lane
2104	tile number within the flowcell lane
15343	'x'-coordinate of the cluster within the tile
197393	'y'-coordinate of the cluster within the tile
1	the member of a pair, 1 or 2 (paired-end or mate-pair reads only)
Y	Y if the read is filtered, N otherwise
18	0 when none of the control bits are on, otherwise it is an even number
ATCACG	index sequence
"""

In [28]:
# Collect [subject id, flowcell id, lane] for every subject from the first header line
# of its fastq.gz file. The header is split on ":"; the flowcell id is the third field
# and the lane is the fourth.
metadata = []
for patid in folder_name_file_path_dict:
    fastq_path = folder_name_file_path_dict[patid]
    # Context manager so each gzip handle is closed as soon as the header is read,
    # instead of leaving one open file per subject. Text mode replaces the manual
    # bytes -> utf-8 decode.
    with gzip.open(fastq_path, 'rt', encoding="utf-8") as fastq_file:
        first_line = fastq_file.readline()
    fields = first_line.strip().split(":")
    if len(fields) != 10:
        # Same exception type the tuple unpacking used to raise, but naming the file.
        raise ValueError(
            "unexpected FASTQ header in {}: {!r}".format(fastq_path, first_line)
        )
    fcid, lane = fields[2], fields[3]
    metadata.append([patid, fcid, lane])

In [34]:
import numpy as np
import pandas as pd

In [31]:
# Convert the list of [patid, fcid, lane] rows into a 2-D NumPy array.
metadata_arr = np.asarray(metadata)

In [37]:
# One row per subject, with its flowcell id and lane as covariate columns.
df = pd.DataFrame(
    data=metadata_arr,
    columns=['patid', 'fcid', 'lane'],
)

In [51]:
# Write the covariate table without the row index. `index` is documented as a bool;
# the previous `index=None` only worked because None is falsy.
df.to_csv("DGN_covariates_df.csv", index=False)

In [7]:
# subjects_select = {'zero': ['LD0014',
#  'LD0041',
#  'LD0038',
#  'LD0084',
#  'LD0033',
#  'LD0157',
#  'LD0045',
#  'LD0120',
#  'LD0011',
#  'LD0013',
#  'LD0058',
#  'LD0001',
#  'LD0061',
#  'LD1225',
#  'LD0217',
#  'LD0205',
#  'LD0089',
#  'LD0083',
#  'LD0103',
#  'LD0027',
#  'LD0035',
#  'LD0063',
#  'LD0009',
#  'LD0056',
#  'LD0146',
#  'LD0104',
#  'LD0019',
#  'LD0006',
#  'LD0065',
#  'LD0068',
#  'LD0044',
#  'LD0226',
#  'LD1031',
#  'LD0079',
#  'LD0150',
#  'LD0253',
#  'LD0034',
#  'LD0040',
#  'LD0003',
#  'LD0059',
#  'LD0047',
#  'LD1258',
#  'LD0195',
#  'LD0042',
#  'LD0016',
#  'LD0020',
#  'LD0017',
#  'LD0134',
#  'LD0144',
#  'LD0300',
#  'LD0075',
#  'LD0094',
#  'LD0174',
#  'LD0213',
#  'LD0030',
#  'LD0062',
#  'LD0012',
#  'LD0024',
#  'LD0131',
#  'LD0082',
#  'LD0098',
#  'LD0193',
#  'LD0007',
#  'LD0284',
#  'LD0088',
#  'LD0054',
#  'LD0176',
#  'LD0163',
#  'LD0168',
#  'LD0275',
#  'LD0209',
#  'LD0097',
#  'LD0124',
#  'LD0277',
#  'LD0235',
#  'LD0669',
#  'LD0429',
#  'LD0325',
#  'LD0108',
#  'LD0160',
#  'LD0219',
#  'LD0133',
#  'LD0101',
#  'LD0162',
#  'LD0228',
#  'LD0177',
#  'LD0768',
#  'LD0183',
#  'LD0100',
#  'LD0224',
#  'LD0251',
#  'LD0261',
#  'LD0404',
#  'LD0116',
#  'LD1198',
#  'LD0141',
#  'LD0148',
#  'LD0154',
#  'LD0118',
#  'LD0127',
#  'LD0135',
#  'LD0070',
#  'LD0243',
#  'LD0189',
#  'LD0482',
#  'LD0221',
#  'LD0520',
#  'LD0302',
#  'LD0323',
#  'LD0233',
#  'LD0161',
#  'LD0145',
#  'LD0122',
#  'LD0109',
#  'LD0181',
#  'LD0132',
#  'LD0255',
#  'LD0218',
#  'LD0265',
#  'LD0268',
#  'LD0411',
#  'LD0090',
#  'LD0151',
#  'LD0299',
#  'LD0306',
#  'LD0496',
#  'LD0290',
#  'LD0418',
#  'LD0216',
#  'LD1226',
#  'LD0140',
#  'LD0152',
#  'LD0069',
#  'LD1193',
#  'LD0130',
#  'LD0252',
#  'LD0818',
#  'LD0220',
#  'LD0171',
#  'LD0128',
#  'LD0190',
#  'LD0105',
#  'LD0208',
#  'LD0184',
#  'LD0112',
#  'LD0552',
#  'LD0337',
#  'LD0624',
#  'LD0303',
#  'LD1220',
#  'LD0467',
#  'LD0427',
#  'LD0527',
#  'LD0559',
#  'LD0158',
#  'LD0236',
#  'LD0187',
#  'LD0244',
#  'LD0230',
#  'LD0173',
#  'LD0603',
#  'LD0413',
#  'LD0292',
#  'LD0307',
#  'LD0564',
#  'LD0610',
#  'LD0533',
#  'LD0582',
#  'LD0509',
#  'LD0604',
#  'LD0584',
#  'LD0207',
#  'LD0192',
#  'LD0175',
#  'LD0387',
#  'LD0294',
#  'LD0368',
#  'LD1328',
#  'LD0347',
#  'LD0249',
#  'LD0272',
#  'LD0599',
#  'LD0583',
#  'LD1217',
#  'LD0654',
#  'LD0441',
#  'LD0086',
#  'LD0136',
#  'LD0248',
#  'LD0119',
#  'LD0215',
#  'LD0483',
#  'LD0563',
#  'LD0641',
#  'LD0407',
#  'LD0341',
#  'LD0270',
#  'LD0324',
#  'LD0736',
#  'LD0374',
#  'LD0422',
#  'LD0623',
#  'LD0278',
#  'LD0310',
#  'LD0508',
#  'LD0631',
#  'LD0423',
#  'LD0492',
#  'LD0263',
#  'LD0519',
#  'LD0738',
#  'LD0720',
#  'LD0436',
#  'LD0434',
#  'LD0289',
#  'LD0345',
#  'LD0421',
#  'LD0686',
#  'LD0348',
#  'LD0354',
#  'LD0531',
#  'LD0540',
#  'LD0282',
#  'LD0769',
#  'LD0649',
#  'LD0523',
#  'LD0571',
#  'LD0652',
#  'LD0440',
#  'LD0403',
#  'LD0597',
#  'LD1332',
#  'LD0327',
#  'LD0468',
#  'LD0634',
#  'LD0269',
#  'LD0456',
#  'LD0420',
#  'LD0365',
#  'LD0353',
#  'LD0238',
#  'LD0308',
#  'LD0442',
#  'LD0364',
#  'LD0435',
#  'LD0680',
#  'LD0638',
#  'LD0626',
#  'LD0511',
#  'LD0349',
#  'LD0430',
#  'LD0329',
#  'LD0247',
#  'LD0445',
#  'LD0682',
#  'LD0676',
#  'LD0452',
#  'LD0635',
#  'LD0471',
#  'LD0283',
#  'LD0432',
#  'LD0677',
#  'LD0621',
#  'LD0586',
#  'LD0630',
#  'LD0579',
#  'LD0544',
#  'LD0316',
#  'LD0706',
#  'LD0326',
#  'LD0484',
#  'LD0393',
#  'LD1314',
#  'LD0575',
#  'LD0581',
#  'LD0526',
#  'LD0444',
#  'LD0472',
#  'LD0741',
#  'LD0766',
#  'LD0633',
#  'LD0711',
#  'LD0567',
#  'LD0936',
#  'LD0572',
#  'LD0458',
#  'LD0477',
#  'LD0568',
#  'LD1245',
#  'LD0512',
#  'LD1285',
#  'LD0616',
#  'LD0487',
#  'LD0388',
#  'LD0371',
#  'LD0740',
#  'LD0304',
#  'LD0381',
#  'LD0473',
#  'LD0463',
#  'LD0437',
#  'LD0775',
#  'LD0491',
#  'LD0548',
#  'LD0522',
#  'LD0760',
#  'LD0439',
#  'LD0554',
#  'LD0618',
#  'LD0744',
#  'LD0558',
#  'LD0317',
#  'LD0454',
#  'LD0459',
#  'LD0503',
#  'LD0710',
#  'LD0689',
#  'LD1179',
#  'LD1248',
#  'LD0562',
#  'LD0314',
#  'LD0333',
#  'LD0416',
#  'LD0451',
#  'LD0380',
#  'LD0457',
#  'LD0410',
#  'LD0609',
#  'LD0565',
#  'LD0715',
#  'LD0495',
#  'LD0629',
#  'LD0465',
#  'LD0502',
#  'LD0709',
#  'LD0598',
#  'LD0417',
#  'LD0481',
#  'LD0784',
#  'LD1275',
#  'LD0665',
#  'LD0749',
#  'LD0786',
#  'LD0774',
#  'LD1322',
#  'LD0369',
#  'LD0488',
#  'LD0494',
#  'LD0724',
#  'LD1343',
#  'LD0702',
#  'LD0593',
#  'LD0667',
#  'LD0480',
#  'LD0625',
#  'LD0645',
#  'LD0612',
#  'LD0685',
#  'LD0851',
#  'LD0763',
#  'LD0820',
#  'LD0947',
#  'LD1280',
#  'LD0619',
#  'LD0592',
#  'LD1037',
#  'LD0534',
#  'LD1325',
#  'LD0450',
#  'LD0426',
#  'LD0746',
#  'LD0611',
#  'LD1199',
#  'LD0821',
#  'LD1083',
#  'LD0703',
#  'LD0601',
#  'LD0620',
#  'LD0695',
#  'LD0573',
#  'LD0663',
#  'LD0570',
#  'LD0745',
#  'LD0687',
#  'LD0697',
#  'LD0693',
#  'LD0699',
#  'LD0903',
#  'LD0765',
#  'LD0946',
#  'LD0800',
#  'LD1029',
#  'LD1125',
#  'LD0515',
#  'LD0479',
#  'LD0644',
#  'LD0507',
#  'LD0594',
#  'LD0545',
#  'LD0561',
#  'LD0730',
#  'LD0754',
#  'LD0941',
#  'LD1034',
#  'LD0969',
#  'LD1077',
#  'LD0880',
#  'LD1306',
#  'LD0764',
#  'LD0977',
#  'LD0826',
#  'LD0858',
#  'LD0994',
#  'LD0525',
#  'LD0737',
#  'LD0827',
#  'LD0801',
#  'LD1131',
#  'LD0814',
#  'LD0995',
#  'LD1012',
#  'LD0797',
#  'LD0879',
#  'LD0790',
#  'LD0841',
#  'LD0816',
#  'LD0954',
#  'LD0659',
#  'LD0859',
#  'LD1113',
#  'LD0811',
#  'LD0979',
#  'LD0862',
#  'LD1026',
#  'LD1090',
#  'LD0823',
#  'LD0825',
#  'LD0857',
#  'LD1235',
#  'LD1028',
#  'LD0662',
#  'LD0727',
#  'LD0725',
#  'LD0758',
#  'LD0844',
#  'LD1039',
#  'LD1017',
#  'LD1075',
#  'LD0917',
#  'LD1064',
#  'LD0791',
#  'LD0855',
#  'LD1091',
#  'LD0915',
#  'LD0794',
#  'LD1000',
#  'LD0894',
#  'LD1345',
#  'LD1132',
#  'LD0997',
#  'LD0806',
#  'LD1003',
#  'LD0900',
#  'LD1182',
#  'LD0918',
#  'LD1249',
#  'LD0962',
#  'LD1167',
#  'LD0852',
#  'LD0789',
#  'LD0834',
#  'LD0815',
#  'LD1047',
#  'LD1006',
#  'LD0955',
#  'LD1021',
#  'LD0942',
#  'LD0940',
#  'LD0959',
#  'LD1049',
#  'LD1265',
#  'LD0996',
#  'LD0952',
#  'LD0865',
#  'LD0805',
#  'LD0890',
#  'LD1092',
#  'LD0870',
#  'LD0817',
#  'LD0843',
#  'LD1259',
#  'LD0899',
#  'LD0798',
#  'LD0876',
#  'LD1093',
#  'LD0923',
#  'LD1071',
#  'LD1095',
#  'LD0896',
#  'LD1074',
#  'LD1298',
#  'LD0884',
#  'LD0953',
#  'LD0796',
#  'LD0871',
#  'LD0886',
#  'LD0873',
#  'LD0970',
#  'LD0986',
#  'LD1020',
#  'LD0875',
#  'LD1059',
#  'LD1094',
#  'LD1045',
#  'LD0993',
#  'LD0975',
#  'LD1056',
#  'LD1228',
#  'LD1331',
#  'LD1042',
#  'LD0864',
#  'LD0891',
#  'LD0795',
#  'LD0833',
#  'LD0793',
#  'LD0782',
#  'LD1153',
#  'LD0878',
#  'LD0846',
#  'LD0913',
#  'LD0987',
#  'LD0944',
#  'LD0960',
#  'LD0934',
#  'LD1239',
#  'LD1264',
#  'LD1135',
#  'LD1141',
#  'LD1186',
#  'LD1073',
#  'LD1022',
#  'LD1158',
#  'LD1138',
#  'LD1127',
#  'LD1188',
#  'LD1081',
#  'LD1255',
#  'LD1101',
#  'LD1096',
#  'LD0929',
#  'LD0916',
#  'LD1008',
#  'LD1283',
#  'LD1155',
#  'LD0991',
#  'LD1087',
#  'LD1015',
#  'LD1175',
#  'LD1080',
#  'LD1301',
#  'LD1202',
#  'LD1272',
#  'LD1102',
#  'LD1165',
#  'LD1085',
#  'LD1171',
#  'LD1097',
#  'LD0965',
#  'LD1134',
#  'LD1241',
#  'LD1110',
#  'LD1046',
#  'LD1170',
#  'LD1044',
#  'LD1054',
#  'LD1159',
#  'LD1065',
#  'LD1157',
#  'LD1024',
#  'LD1100',
#  'LD1013',
#  'LD0910',
#  'LD0980',
#  'LD1214',
#  'LD1002',
#  'LD1103',
#  'LD1293',
#  'LD1161',
#  'LD1203',
#  'LD0967',
#  'LD1154',
#  'LD1197',
#  'LD1038',
#  'LD1076',
#  'LD1208',
#  'LD1190',
#  'LD1274',
#  'LD1251',
#  'LD1321',
#  'LD1162',
#  'LD1189',
#  'LD1289',
#  'LD1348',
#  'LD1232',
#  'LD0378',
#  'LD1142',
#  'LD1105',
#  'LD1305',
#  'LD1316',
#  'LD1341',
#  'LD1219',
#  'LD1262',
#  'LD1163',
#  'LD1168',
#  'LD0231',
#  'LD1145',
#  'LD1181',
#  'LD1242',
#  'LD1209',
#  'LD1356',
#  'LD0313',
#  'LD1104',
#  'LD1143',
#  'LD1210',
#  'LD1313',
#  'LD1176',
#  'LD1185',
#  'LD1254',
#  'LD1267',
#  'LD1061',
#  'LD1174',
#  'LD1329',
#  'LD1150',
#  'LD1243',
#  'LD1263',
#  'LD1230',
#  'LD1212',
#  'LD1334',
#  'LD1281',
#  'LD1180',
#  'LD1166',
#  'LD1362',
#  'LD1279',
#  'LD1149',
#  'LD1268',
#  'LD1270',
#  'LD1147',
#  'LD1200',
#  'LD1323',
#  'LD1277',
#  'LD1309',
#  'LD1286',
#  'LD0498',
#  'LD1349',
#  'LD1072',
#  'LD1144',
#  'LD1115',
#  'LD1291',
#  'LD1282',
#  'LD1271',
#  'LD1252',
#  'LD0102'], 'one': ['LD0022',
#  'LD0008',
#  'LD0023',
#  'LD0015',
#  'LD0043',
#  'LD0812',
#  'LD0018',
#  'LD0046',
#  'LD0096',
#  'LD0021',
#  'LD0002',
#  'LD0073',
#  'LD0055',
#  'LD0121',
#  'LD0053',
#  'LD0028',
#  'LD0048',
#  'LD0049',
#  'LD1192',
#  'LD0227',
#  'LD0370',
#  'LD0241',
#  'LD0142',
#  'LD0091',
#  'LD0117',
#  'LD0115',
#  'LD0123',
#  'LD0739',
#  'LD0732',
#  'LD0382',
#  'LD0087',
#  'LD0074',
#  'LD0186',
#  'LD0210',
#  'LD0264',
#  'LD0648',
#  'LD0395',
#  'LD0660',
#  'LD0566',
#  'LD0242',
#  'LD0072',
#  'LD0490',
#  'LD0338',
#  'LD0092',
#  'LD0077',
#  'LD0139',
#  'LD0285',
#  'LD0366',
#  'LD0342',
#  'LD0461',
#  'LD0067',
#  'LD0201',
#  'LD0334',
#  'LD0373',
#  'LD0707',
#  'LD0409',
#  'LD0206',
#  'LD0099',
#  'LD0147',
#  'LD0361',
#  'LD0143',
#  'LD0351',
#  'LD0262',
#  'LD0605',
#  'LD0286',
#  'LD0214',
#  'LD0106',
#  'LD0296',
#  'LD0254',
#  'LD0600',
#  'LD0376',
#  'LD0415',
#  'LD0683',
#  'LD0443',
#  'LD0297',
#  'LD0819',
#  'LD0377',
#  'LD0266',
#  'LD0363',
#  'LD0386',
#  'LD0530',
#  'LD0632',
#  'LD0276',
#  'LD0301',
#  'LD1236',
#  'LD0287',
#  'LD0352',
#  'LD0321',
#  'LD0396',
#  'LD0267',
#  'LD0718',
#  'LD0474',
#  'LD0589',
#  'LD0578',
#  'LD0543',
#  'LD0475',
#  'LD0576',
#  'LD0258',
#  'LD0506',
#  'LD0771',
#  'LD0513',
#  'LD0438',
#  'LD0500',
#  'LD0557',
#  'LD0698',
#  'LD0742',
#  'LD0449',
#  'LD0673',
#  'LD0476',
#  'LD0674',
#  'LD0713',
#  'LD0359',
#  'LD0355',
#  'LD1326',
#  'LD0647',
#  'LD1276',
#  'LD0938',
#  'LD0854',
#  'LD0529',
#  'LD0882',
#  'LD0988',
#  'LD1025',
#  'LD0721',
#  'LD0628',
#  'LD0681',
#  'LD0615',
#  'LD0788',
#  'LD0684',
#  'LD0802',
#  'LD0948',
#  'LD0808',
#  'LD1278',
#  'LD0478',
#  'LD0595',
#  'LD0514',
#  'LD0756',
#  'LD0726',
#  'LD0679',
#  'LD0690',
#  'LD0510',
#  'LD0772',
#  'LD0714',
#  'LD0608',
#  'LD0668',
#  'LD0974',
#  'LD0830',
#  'LD0951',
#  'LD0824',
#  'LD0992',
#  'LD0881',
#  'LD0639',
#  'LD0748',
#  'LD1007',
#  'LD0877',
#  'LD0863',
#  'LD0856',
#  'LD1005',
#  'LD0803',
#  'LD0536',
#  'LD0777',
#  'LD0831',
#  'LD0778',
#  'LD0781',
#  'LD0804',
#  'LD0799',
#  'LD0921',
#  'LD0860',
#  'LD0897',
#  'LD1084',
#  'LD0776',
#  'LD0919',
#  'LD0932',
#  'LD1058',
#  'LD0924',
#  'LD1053',
#  'LD1128',
#  'LD0989',
#  'LD0889',
#  'LD0848',
#  'LD0898',
#  'LD0832',
#  'LD1287',
#  'LD1018',
#  'LD0972',
#  'LD0901',
#  'LD1187',
#  'LD0847',
#  'LD1317',
#  'LD1001',
#  'LD0976',
#  'LD1140',
#  'LD1139',
#  'LD1050',
#  'LD1129',
#  'LD1067',
#  'LD1119',
#  'LD1126',
#  'LD1117',
#  'LD0909',
#  'LD1133',
#  'LD1124',
#  'LD0983',
#  'LD1069',
#  'LD0963',
#  'LD1213',
#  'LD1032',
#  'LD1350',
#  'LD1009',
#  'LD1070',
#  'LD1183',
#  'LD1354',
#  'LD1336',
#  'LD1099',
#  'LD1156',
#  'LD1194',
#  'LD1361',
#  'LD1227',
#  'LD1303',
#  'LD1333',
#  'LD1290',
#  'LD1288',
#  'LD1177',
#  'LD1266',
#  'LD1299',
#  'LD1178',
#  'LD1320',
#  'LD1195',
#  'LD1234',
#  'LD1246',
#  'LD1353',
#  'LD1337',
#  'LD1366',
#  'LD1196',
#  'LD1148',
#  'LD1357',
#  'LD0165'], 'two': ['LD0064',
#  'LD0391',
#  'LD0330',
#  'LD0392',
#  'LD0397',
#  'LD0653',
#  'LD0390',
#  'LD0556',
#  'LD0401',
#  'LD0408',
#  'LD0779',
#  'LD0691',
#  'LD0701',
#  'LD0591',
#  'LD0792',
#  'LD0850',
#  'LD0867',
#  'LD1364',
#  'LD0773',
#  'LD0928',
#  'LD0892',
#  'LD1346',
#  'LD1211',
#  'LD0447']}

In [9]:
# len(subjects_select['zero'])+len(subjects_select['one'])+len(subjects_select['two'])

922

## Get the 895 subject ids in Sharon's dataset

In [17]:
import pandas as pd

# Read the tab-separated count table and transpose it, so the labels that were column
# headers in the file become the row index; those index labels are the selected subjects.
sharon_counts_df = pd.read_csv("gene_counts.tsv", sep="\t").T

subjects_select = set(sharon_counts_df.index)

### Generate the .sh files that will be submitted to Slurm

In [30]:
## VERSION WHERE WE DO NOT SPLIT THE SUBJECTS INTO GROUPS

# Write one sbatch script per selected subject. Each script:
#   1. decompresses the subject's fastq.gz to <root>/<subject>.fastq
#   2. converts it to FASTA (keeps the header and sequence of every 4-line record)
#   3. deletes the intermediate .fastq
#   4. runs blastn against the TCR database, writing <root>/<subject>.out
root = "/labs/mignot/DGN/BLAST/"
script_dir = "new_BLAST_shs"
# Create the output folder up front so open() below cannot fail on a fresh checkout.
os.makedirs(script_dir, exist_ok=True)

# Identical for every subject, so built once outside the loop.
sbatch_header = "#!/bin/bash\n#SBATCH --job-name=default\n#SBATCH --nodes=1\n#SBATCH --ntasks=1\n#SBATCH --cpus-per-task=8\n#SBATCH --partition=batch\n#SBATCH --account=mignot\n#SBATCH --time=10:00:00\n"

for i, folder_name in enumerate(subjects_select):
    file_path = folder_name_file_path_dict[folder_name]
    save_path = os.path.join(root, folder_name)
    print(i, folder_name, file_path)
#         if os.path.exists("{}.out".format(save_path)):
#             print("skipping above...")
#             continue
    command1 = "gunzip < {} > {}.fastq\n".format(file_path, save_path)
    # '\\t' and '\\n' are escaped so the generated script contains the two-character
    # sequences \t and \n for tr to interpret, rather than a raw tab and a raw line
    # break embedded inside the quotes.
    command2 = "paste - - - - < {}.fastq | cut -f 1,2 | sed 's/^@/>/' | tr '\\t' '\\n' > {}.fasta\n".format(save_path, save_path)
    command3 = "rm {}.fastq\n".format(save_path)
    command4 = "blastn -num_threads 8 -db /home/ashteng/blastdb/imgt_human_TCR_VDJ_F_ORF_inframe_P_cleaned.fasta -query {}.fasta -perc_identity 93 -outfmt 6 -out {}.out\n".format(save_path, save_path)
    with open(os.path.join(script_dir, "{}.sh".format(folder_name)), "w") as f:
        f.write(sbatch_header)
        f.write(command1)
        f.write(command2)
        f.write(command3)
        f.write(command4)
        # command5 = "rm {}.fasta\n".format(save_path)
        # f.write(command5)

0 LD0244 /labs/mignot/DGN/DGN_fastq/rawdata/LD0244/Levinson_Library_Pool_014_F02_ACAGTG_ACAGTG_L006.fastq.gz
1 LD0044 /labs/mignot/DGN/DGN_fastq/rawdata/LD0044/Levinson_Library_Pool_076_CGATGT_CGATGT_L004.fastq.gz
2 LD0662 /labs/mignot/DGN/DGN_fastq/rawdata/LD0662/Levinson_Library_Pool_338_GCCAAT_GCCAAT_L002.fastq.gz
3 LD1097 /labs/mignot/DGN/DGN_fastq/rawdata/LD1097/Levinson_Library_Pool_194_ACTTGA_ACTTGA_L002.fastq.gz
4 LD0903 /labs/mignot/DGN/DGN_fastq/rawdata/LD0903/Levinson_Library_Pool_168_TTAGGC_L008.fastq.gz
5 LD0581 /labs/mignot/DGN/DGN_fastq/rawdata/LD0581/Levinson_Library_Pool_090_GGCTAC_GGCTAC_L002.fastq.gz
6 LD0266 /labs/mignot/DGN/DGN_fastq/rawdata/LD0266/LD0266.fastq.gz
7 LD0251 /labs/mignot/DGN/DGN_fastq/rawdata/LD0251/Levinson_Library_Pool_010_B02_GCCAAT_GCCAAT_L002.fastq.gz
8 LD1077 /labs/mignot/DGN/DGN_fastq/rawdata/LD1077/LD1077.fastq.gz
9 LD0097 /labs/mignot/DGN/DGN_fastq/rawdata/LD0097/LD0097.fastq.gz
10 LD0028 /labs/mignot/DGN/DGN_fastq/rawdata/LD0028/Levinson_Li

118 LD0038 /labs/mignot/DGN/DGN_fastq/rawdata/LD0038/Levinson_Library_Pool_354_GATCAG_L007.fastq.gz
119 LD1117 /labs/mignot/DGN/DGN_fastq/rawdata/LD1117/Levinson_Library_Pool_216_GATCAG_GATCAG_L008.fastq.gz
120 LD1155 /labs/mignot/DGN/DGN_fastq/rawdata/LD1155/Levinson_Library_Pool_251_ATCACG_ATCACG_L002.fastq.gz
121 LD0496 /labs/mignot/DGN/DGN_fastq/rawdata/LD0496/Levinson_Library_Pool_073_TGACCA_TGACCA_L001.fastq.gz
122 LD0303 /labs/mignot/DGN/DGN_fastq/rawdata/LD0303/Levinson_Library_Pool_299_GATCAG_GATCAG_L005.fastq.gz
123 LD0629 /labs/mignot/DGN/DGN_fastq/rawdata/LD0629/Levinson_Library_Pool_347_TTAGGC_L003.fastq.gz
124 LD1326 /labs/mignot/DGN/DGN_fastq/rawdata/LD1326/Levinson_Library_Pool_290_ACAGTG_ACAGTG_L004.fastq.gz
125 LD0877 /labs/mignot/DGN/DGN_fastq/rawdata/LD0877/Levinson_Library_Pool_163_ATCACG_L003.fastq.gz
126 LD0889 /labs/mignot/DGN/DGN_fastq/rawdata/LD0889/Levinson_Library_Pool_165_CGATGT_L005.fastq.gz
127 LD0816 /labs/mignot/DGN/DGN_fastq/rawdata/LD0816/LD0816.fastq

229 LD0772 /labs/mignot/DGN/DGN_fastq/rawdata/LD0772/Levinson_Library_Pool_132_TTAGGC_TTAGGC_L004.fastq.gz
230 LD0867 /labs/mignot/DGN/DGN_fastq/rawdata/LD0867/Levinson_Library_Pool_157_CTTGTA_L005.fastq.gz
231 LD0572 /labs/mignot/DGN/DGN_fastq/rawdata/LD0572/Levinson_Library_Pool_092_TAGCTT_TAGCTT_L004.fastq.gz
232 LD0201 /labs/mignot/DGN/DGN_fastq/rawdata/LD0201/LD0201.fastq.gz
233 LD0289 /labs/mignot/DGN/DGN_fastq/rawdata/LD0289/LD0289.fastq.gz
234 LD0491 /labs/mignot/DGN/DGN_fastq/rawdata/LD0491/Levinson_Library_Pool_069_TTAGGC_TTAGGC_L005.fastq.gz
235 LD1236 /labs/mignot/DGN/DGN_fastq/rawdata/LD1236/Levinson_Library_Pool_255_GATCAG_GATCAG_L006.fastq.gz
236 LD1220 /labs/mignot/DGN/DGN_fastq/rawdata/LD1220/batch5.Levinson_Library_Pool_309_CTTGTA_CTTGTA_L007.fastq.gz
237 LD0013 /labs/mignot/DGN/DGN_fastq/rawdata/LD0013/LD0013.fastq.gz
238 LD0075 /labs/mignot/DGN/DGN_fastq/rawdata/LD0075/Levinson_Library_Pool_220_TTAGGC_L004.fastq.gz
239 LD0492 /labs/mignot/DGN/DGN_fastq/rawdata/LD049

347 LD1015 /labs/mignot/DGN/DGN_fastq/rawdata/LD1015/batch3.Levinson_Library_Pool_192_CTTGTA_CTTGTA_L008.fastq.gz
348 LD0932 /labs/mignot/DGN/DGN_fastq/rawdata/LD0932/Levinson_Library_Pool_311_TAGCTT_TAGCTT_L005.fastq.gz
349 LD0144 /labs/mignot/DGN/DGN_fastq/rawdata/LD0144/Levinson_Library_Pool_235_GATCAG_GATCAG_L003.fastq.gz
350 LD1006 /labs/mignot/DGN/DGN_fastq/rawdata/LD1006/batch3.Levinson_Library_Pool_186_CTTGTA_CTTGTA_L002.fastq.gz
351 LD1131 /labs/mignot/DGN/DGN_fastq/rawdata/LD1131/Levinson_Library_Pool_224_TAGCTT_L008.fastq.gz
352 LD0571 /labs/mignot/DGN/DGN_fastq/rawdata/LD0571/Levinson_Library_Pool_091_TAGCTT_TAGCTT_L003.fastq.gz
353 LD0128 /labs/mignot/DGN/DGN_fastq/rawdata/LD0128/Levinson_Library_Pool_238_CAGATC_CAGATC_L005.fastq.gz
354 LD0163 /labs/mignot/DGN/DGN_fastq/rawdata/LD0163/LD0163.fastq.gz
355 LD0249 /labs/mignot/DGN/DGN_fastq/rawdata/LD0249/Levinson_Library_Pool_009_A02_GCCAAT_GCCAAT_L001.fastq.gz
356 LD0619 /labs/mignot/DGN/DGN_fastq/rawdata/LD0619/Levinson_Li

466 LD0213 /labs/mignot/DGN/DGN_fastq/rawdata/LD0213/LD0213.fastq.gz
467 LD0647 /labs/mignot/DGN/DGN_fastq/rawdata/LD0647/batch3.Levinson_Library_Pool_107_ACAGTG_L004.fastq.gz
468 LD0796 /labs/mignot/DGN/DGN_fastq/rawdata/LD0796/Levinson_Library_Pool_144_ACAGTG_ACAGTG_L008.fastq.gz
469 LD0411 /labs/mignot/DGN/DGN_fastq/rawdata/LD0411/Levinson_Library_Pool_055_CAGATC_CAGATC_L006.fastq.gz
470 LD0359 /labs/mignot/DGN/DGN_fastq/rawdata/LD0359/LD0359.fastq.gz
471 LD0387 /labs/mignot/DGN/DGN_fastq/rawdata/LD0387/Levinson_Library_Pool_047_ACAGTG_ACAGTG_L006.fastq.gz
472 LD0363 /labs/mignot/DGN/DGN_fastq/rawdata/LD0363/LD0363.fastq.gz
473 LD0084 /labs/mignot/DGN/DGN_fastq/rawdata/LD0084/LD0084.fastq.gz
474 LD1287 /labs/mignot/DGN/DGN_fastq/rawdata/LD1287/batch5.Levinson_Library_Pool_282_CGATGT_CGATGT_L004.fastq.gz
475 LD1286 /labs/mignot/DGN/DGN_fastq/rawdata/LD1286/batch5.Levinson_Library_Pool_281_CGATGT_CGATGT_L003.fastq.gz
476 LD0986 /labs/mignot/DGN/DGN_fastq/rawdata/LD0986/batch3.Levinson

579 LD0230 /labs/mignot/DGN/DGN_fastq/rawdata/LD0230/Levinson_Library_Pool_013_E02_TGACCA_TGACCA_L005.fastq.gz
580 LD0369 /labs/mignot/DGN/DGN_fastq/rawdata/LD0369/LD0369.fastq.gz
581 LD1144 /labs/mignot/DGN/DGN_fastq/rawdata/LD1144/Levinson_Library_Pool_339_CTTGTA_CTTGTA_L003.fastq.gz
582 LD0702 /labs/mignot/DGN/DGN_fastq/rawdata/LD0702/Levinson_Library_Pool_120_GATCAG_GATCAG_L005.fastq.gz
583 LD1265 /labs/mignot/DGN/DGN_fastq/rawdata/LD1265/batch5.Levinson_Library_Pool_279_GGCTAC_GGCTAC_L001.fastq.gz
584 LD0154 /labs/mignot/DGN/DGN_fastq/rawdata/LD0154/LD0154.fastq.gz
585 LD1259 /labs/mignot/DGN/DGN_fastq/rawdata/LD1259/Levinson_Library_Pool_311_CTTGTA_CTTGTA_L005.fastq.gz
586 LD0582 /labs/mignot/DGN/DGN_fastq/rawdata/LD0582/Levinson_Library_Pool_091_GGCTAC_GGCTAC_L003.fastq.gz
587 LD0648 /labs/mignot/DGN/DGN_fastq/rawdata/LD0648/Levinson_Library_Pool_108_ACAGTG_ACAGTG_L001.fastq.gz
588 LD1187 /labs/mignot/DGN/DGN_fastq/rawdata/LD1187/Levinson_Library_Pool_257_ACAGTG_ACAGTG_L008.fast

698 LD0440 /labs/mignot/DGN/DGN_fastq/rawdata/LD0440/Levinson_Library_Pool_060_TAGCTT_TAGCTT_L003.fastq.gz
699 LD0649 /labs/mignot/DGN/DGN_fastq/rawdata/LD0649/batch3.Levinson_Library_Pool_109_ACAGTG_ACAGTG_L007.fastq.gz
700 LD1001 /labs/mignot/DGN/DGN_fastq/rawdata/LD1001/Levinson_Library_Pool_190_GGCTAC_L006.fastq.gz
701 LD0665 /labs/mignot/DGN/DGN_fastq/rawdata/LD0665/batch3.Levinson_Library_Pool_109_GCCAAT_GCCAAT_L007.fastq.gz
702 LD1094 /labs/mignot/DGN/DGN_fastq/rawdata/LD1094/batch3.Levinson_Library_Pool_209_ACTTGA_L001.fastq.gz
703 LD0475 /labs/mignot/DGN/DGN_fastq/rawdata/LD0475/LD0475.fastq.gz
704 LD0856 /labs/mignot/DGN/DGN_fastq/rawdata/LD0856/batch3.Levinson_Library_Pool_156_GGCTAC_L004.fastq.gz
705 LD0514 /labs/mignot/DGN/DGN_fastq/rawdata/LD0514/batch5.Levinson_Library_Pool_304_GATCAG_GATCAG_L002.fastq.gz
706 LD1039 /labs/mignot/DGN/DGN_fastq/rawdata/LD1039/Levinson_Library_Pool_198_TTAGGC_TTAGGC_L006.fastq.gz
707 LD1263 /labs/mignot/DGN/DGN_fastq/rawdata/LD1263/batch5.L

830 LD0718 /labs/mignot/DGN/DGN_fastq/rawdata/LD0718/Levinson_Library_Pool_122_GGCTAC_GGCTAC_L007.fastq.gz
831 LD0285 /labs/mignot/DGN/DGN_fastq/rawdata/LD0285/batch5.Levinson_Library_Pool_304_ACTTGA_ACTTGA_L002.fastq.gz
832 LD1175 /labs/mignot/DGN/DGN_fastq/rawdata/LD1175/Levinson_Library_Pool_254_TTAGGC_TTAGGC_L057.fastq.gz
833 LD0633 /labs/mignot/DGN/DGN_fastq/rawdata/LD0633/batch3.Levinson_Library_Pool_107_TGACCA_L004.fastq.gz
834 LD1138 /labs/mignot/DGN/DGN_fastq/rawdata/LD1138/Levinson_Library_Pool_221_GGCTAC_L005.fastq.gz
835 LD0292 /labs/mignot/DGN/DGN_fastq/rawdata/LD0292/Levinson_Library_Pool_023_G03_GATCAG_GATCAG_L007.fastq.gz
836 LD0604 /labs/mignot/DGN/DGN_fastq/rawdata/LD0604/Levinson_Library_Pool_099_ATCACG_ATCACG_L003.fastq.gz
837 LD0992 /labs/mignot/DGN/DGN_fastq/rawdata/LD0992/Levinson_Library_Pool_191_TAGCTT_L007.fastq.gz
838 LD0463 /labs/mignot/DGN/DGN_fastq/rawdata/LD0463/Levinson_Library_Pool_061_CTTGTA_CTTGTA_L004.fastq.gz
839 LD0473 /labs/mignot/DGN/DGN_fastq/ra

In [8]:
## VERSION WHERE WE SPLIT THE SUBJECTS INTO 0, 1, 2 BASED ON GENOTYPE OF A PARTICULAR SNP

# for genotype in ['zero']:
#     root = "/labs/mignot/DGN/BLAST/{}".format(genotype)
#     for i, folder_name in enumerate(subjects_select[genotype]):
#         file_path = folder_name_file_path_dict[folder_name]
#         save_path = os.path.join(root, folder_name)
#         print(i, folder_name, file_path)
# #         if os.path.exists("{}.out".format(save_path)):
# #             print("skipping above...")
# #             continue
#         with open("{}_{}.sh".format(folder_name, genotype), "w") as f:
#             f.write("#!/bin/bash\n#SBATCH --job-name=default\n#SBATCH --nodes=1\n#SBATCH --ntasks=1\n#SBATCH --cpus-per-task=8\n#SBATCH --partition=batch\n#SBATCH --account=mignot\n#SBATCH --time=10:00:00\n")    
            
#             command1 = "gunzip < {} > {}.fastq\n".format(file_path, save_path)
#             f.write(command1)
#             command2 = "paste - - - - < {}.fastq | cut -f 1,2 | sed 's/^@/>/' | tr '\t' '\n' > {}.fasta\n".format(save_path, save_path)
#             f.write(command2)
#             command3 = "rm {}.fastq\n".format(save_path)
#             f.write(command3)
#             command4 = "blastn -num_threads 8 -db blastdb/imgt_human_TCR_VDJ_F_ORF_inframe_P_cleaned.fasta -query {}.fasta -outfmt 6 -out {}.out\n".format(save_path, save_path)
#             f.write(command4)
#             command5 = "rm {}.fasta\n".format(save_path)
#             f.write(command5)

0 LD0014 /labs/mignot/DGN/DGN_fastq/rawdata/LD0014/Levinson_Library_Pool_351_CAGATC_L007.fastq.gz
1 LD0041 /labs/mignot/DGN/DGN_fastq/rawdata/LD0041/Levinson_Library_Pool_343_CTTGTA_CTTGTA_L007.fastq.gz
2 LD0038 /labs/mignot/DGN/DGN_fastq/rawdata/LD0038/Levinson_Library_Pool_354_GATCAG_L007.fastq.gz
3 LD0084 /labs/mignot/DGN/DGN_fastq/rawdata/LD0084/LD0084.fastq.gz
4 LD0033 /labs/mignot/DGN/DGN_fastq/rawdata/LD0033/Levinson_Library_Pool_354_CAGATC_L007.fastq.gz
5 LD0157 /labs/mignot/DGN/DGN_fastq/rawdata/LD0157/LD0157.fastq.gz
6 LD0045 /labs/mignot/DGN/DGN_fastq/rawdata/LD0045/Levinson_Library_Pool_345_TTAGGC_L001.fastq.gz
7 LD0120 /labs/mignot/DGN/DGN_fastq/rawdata/LD0120/Levinson_Library_Pool_235_CAGATC_CAGATC_L003.fastq.gz
8 LD0011 /labs/mignot/DGN/DGN_fastq/rawdata/LD0011/Levinson_Library_Pool_348_TGACCA_L004.fastq.gz
9 LD0013 /labs/mignot/DGN/DGN_fastq/rawdata/LD0013/LD0013.fastq.gz
10 LD0058 /labs/mignot/DGN/DGN_fastq/rawdata/LD0058/LD0058.fastq.gz
11 LD0001 /labs/mignot/DGN/DGN_

104 LD0482 /labs/mignot/DGN/DGN_fastq/rawdata/LD0482/LD0482.fastq.gz
105 LD0221 /labs/mignot/DGN/DGN_fastq/rawdata/LD0221/LD0221.fastq.gz
106 LD0520 /labs/mignot/DGN/DGN_fastq/rawdata/LD0520/Levinson_Library_Pool_074_GCCAAT_GCCAAT_L002.fastq.gz
107 LD0302 /labs/mignot/DGN/DGN_fastq/rawdata/LD0302/Levinson_Library_Pool_298_GATCAG_GATCAG_L004.fastq.gz
108 LD0323 /labs/mignot/DGN/DGN_fastq/rawdata/LD0323/Levinson_Library_Pool_300_GATCAG_GATCAG_L006.fastq.gz
109 LD0233 /labs/mignot/DGN/DGN_fastq/rawdata/LD0233/Levinson_Library_Pool_015_G02_TGACCA_TGACCA_L007.fastq.gz
110 LD0161 /labs/mignot/DGN/DGN_fastq/rawdata/LD0161/LD0161.fastq.gz
111 LD0145 /labs/mignot/DGN/DGN_fastq/rawdata/LD0145/Levinson_Library_Pool_236_GATCAG_GATCAG_L004.fastq.gz
112 LD0122 /labs/mignot/DGN/DGN_fastq/rawdata/LD0122/Levinson_Library_Pool_019_C03_GATCAG_GATCAG_L003.fastq.gz
113 LD0109 /labs/mignot/DGN/DGN_fastq/rawdata/LD0109/LD0109.fastq.gz
114 LD0181 /labs/mignot/DGN/DGN_fastq/rawdata/LD0181/LD0181.fastq.gz
115 L

208 LD0263 /labs/mignot/DGN/DGN_fastq/rawdata/LD0263/Levinson_Library_Pool_018_B03_CAGATC_CAGATC_L002.fastq.gz
209 LD0519 /labs/mignot/DGN/DGN_fastq/rawdata/LD0519/Levinson_Library_Pool_073_GCCAAT_GCCAAT_L001.fastq.gz
210 LD0738 /labs/mignot/DGN/DGN_fastq/rawdata/LD0738/LD0738.fastq.gz
211 LD0720 /labs/mignot/DGN/DGN_fastq/rawdata/LD0720/Levinson_Library_Pool_348_GGCTAC_L004.fastq.gz
212 LD0436 /labs/mignot/DGN/DGN_fastq/rawdata/LD0436/Levinson_Library_Pool_056_GATCAG_GATCAG_L007.fastq.gz
213 LD0434 /labs/mignot/DGN/DGN_fastq/rawdata/LD0434/LD0434.fastq.gz
214 LD0289 /labs/mignot/DGN/DGN_fastq/rawdata/LD0289/LD0289.fastq.gz
215 LD0345 /labs/mignot/DGN/DGN_fastq/rawdata/LD0345/Levinson_Library_Pool_034_CGATGT_CGATGT_L002.fastq.gz
216 LD0421 /labs/mignot/DGN/DGN_fastq/rawdata/LD0421/LD0421.fastq.gz
217 LD0686 /labs/mignot/DGN/DGN_fastq/rawdata/LD0686/Levinson_Library_Pool_117_ACTTGA_ACTTGA_L002.fastq.gz
218 LD0348 /labs/mignot/DGN/DGN_fastq/rawdata/LD0348/LD0348.fastq.gz
219 LD0354 /labs

317 LD1179 /labs/mignot/DGN/DGN_fastq/rawdata/LD1179/Levinson_Library_Pool_258_TGACCA_TGACCA_L001.fastq.gz
318 LD1248 /labs/mignot/DGN/DGN_fastq/rawdata/LD1248/batch5.Levinson_Library_Pool_276_TAGCTT_TAGCTT_L006.fastq.gz
319 LD0562 /labs/mignot/DGN/DGN_fastq/rawdata/LD0562/Levinson_Library_Pool_083_GATCAG_GATCAG_L003.fastq.gz
320 LD0314 /labs/mignot/DGN/DGN_fastq/rawdata/LD0314/Levinson_Library_Pool_030_F04_GGCTAC_GGCTAC_L006.fastq.gz
321 LD0333 /labs/mignot/DGN/DGN_fastq/rawdata/LD0333/Levinson_Library_Pool_034_ATCACG_ATCACG_L002.fastq.gz
322 LD0416 /labs/mignot/DGN/DGN_fastq/rawdata/LD0416/LD0416.fastq.gz
323 LD0451 /labs/mignot/DGN/DGN_fastq/rawdata/LD0451/Levinson_Library_Pool_061_GGCTAC_GGCTAC_L004.fastq.gz
324 LD0380 /labs/mignot/DGN/DGN_fastq/rawdata/LD0380/LD0380.fastq.gz
325 LD0457 /labs/mignot/DGN/DGN_fastq/rawdata/LD0457/Levinson_Library_Pool_057_CTTGTA_CTTGTA_L008.fastq.gz
326 LD0410 /labs/mignot/DGN/DGN_fastq/rawdata/LD0410/LD0410.fastq.gz
327 LD0609 /labs/mignot/DGN/DGN_f

421 LD0797 /labs/mignot/DGN/DGN_fastq/rawdata/LD0797/Levinson_Library_Pool_137_GCCAAT_GCCAAT_L001.fastq.gz
422 LD0879 /labs/mignot/DGN/DGN_fastq/rawdata/LD0879/Levinson_Library_Pool_165_ATCACG_L005.fastq.gz
423 LD0790 /labs/mignot/DGN/DGN_fastq/rawdata/LD0790/Levinson_Library_Pool_135_ACAGTG_ACAGTG_L007.fastq.gz
424 LD0841 /labs/mignot/DGN/DGN_fastq/rawdata/LD0841/batch5.Levinson_Library_Pool_309_TAGCTT_TAGCTT_L007.fastq.gz
425 LD0816 /labs/mignot/DGN/DGN_fastq/rawdata/LD0816/LD0816.fastq.gz
426 LD0954 /labs/mignot/DGN/DGN_fastq/rawdata/LD0954/batch3.Levinson_Library_Pool_183_CAGATC_CAGATC_L006.fastq.gz
427 LD0659 /labs/mignot/DGN/DGN_fastq/rawdata/LD0659/LD0659.fastq.gz
428 LD0859 /labs/mignot/DGN/DGN_fastq/rawdata/LD0859/batch3.Levinson_Library_Pool_159_GGCTAC_L007.fastq.gz
429 LD1113 /labs/mignot/DGN/DGN_fastq/rawdata/LD1113/batch5.Levinson_Library_Pool_309_GGCTAC_GGCTAC_L007.fastq.gz
430 LD0811 /labs/mignot/DGN/DGN_fastq/rawdata/LD0811/LD0811.fastq.gz
431 LD0979 /labs/mignot/DGN/DG

523 LD0833 /labs/mignot/DGN/DGN_fastq/rawdata/LD0833/Levinson_Library_Pool_151_GATCAG_GATCAG_L007.fastq.gz
524 LD0793 /labs/mignot/DGN/DGN_fastq/rawdata/LD0793/Levinson_Library_Pool_141_ACAGTG_ACAGTG_L005.fastq.gz
525 LD0782 /labs/mignot/DGN/DGN_fastq/rawdata/LD0782/Levinson_Library_Pool_141_TGACCA_TGACCA_L005.fastq.gz
526 LD1153 /labs/mignot/DGN/DGN_fastq/rawdata/LD1153/Levinson_Library_Pool_249_ATCACG_ATCACG_L008.fastq.gz
527 LD0878 /labs/mignot/DGN/DGN_fastq/rawdata/LD0878/Levinson_Library_Pool_164_ATCACG_L004.fastq.gz
528 LD0846 /labs/mignot/DGN/DGN_fastq/rawdata/LD0846/batch3.Levinson_Library_Pool_156_TAGCTT_L004.fastq.gz
529 LD0913 /labs/mignot/DGN/DGN_fastq/rawdata/LD0913/Levinson_Library_Pool_172_TGACCA_TGACCA_L004.fastq.gz
530 LD0987 /labs/mignot/DGN/DGN_fastq/rawdata/LD0987/batch3.Levinson_Library_Pool_187_TAGCTT_TAGCTT_L003.fastq.gz
531 LD0944 /labs/mignot/DGN/DGN_fastq/rawdata/LD0944/Levinson_Library_Pool_169_GCCAAT_GCCAAT_L001.fastq.gz
532 LD0960 /labs/mignot/DGN/DGN_fastq

621 LD1104 /labs/mignot/DGN/DGN_fastq/rawdata/LD1104/Levinson_Library_Pool_210_GATCAG_GATCAG_L002.fastq.gz
622 LD1143 /labs/mignot/DGN/DGN_fastq/rawdata/LD1143/Levinson_Library_Pool_218_CTTGTA_L002.fastq.gz
623 LD1210 /labs/mignot/DGN/DGN_fastq/rawdata/LD1210/batch5.Levinson_Library_Pool_267_CAGATC_CAGATC_L002.fastq.gz
624 LD1313 /labs/mignot/DGN/DGN_fastq/rawdata/LD1313/Levinson_Library_Pool_289_TGACCA_TGACCA_L003.fastq.gz
625 LD1176 /labs/mignot/DGN/DGN_fastq/rawdata/LD1176/Levinson_Library_Pool_255_TTAGGC_TTAGGC_L006.fastq.gz
626 LD1185 /labs/mignot/DGN/DGN_fastq/rawdata/LD1185/Levinson_Library_Pool_263_TGACCA_TGACCA_L006.fastq.gz
627 LD1254 /labs/mignot/DGN/DGN_fastq/rawdata/LD1254/batch5.Levinson_Library_Pool_280_TAGCTT_TAGCTT_L002.fastq.gz
628 LD1267 /labs/mignot/DGN/DGN_fastq/rawdata/LD1267/Levinson_Library_Pool_356_TGACCA_TGACCA_L001.fastq.gz
629 LD1061 /labs/mignot/DGN/DGN_fastq/rawdata/LD1061/Levinson_Library_Pool_208_TGACCA_TGACCA_L008.fastq.gz
630 LD1174 /labs/mignot/DGN/DG

### Now submit the jobs to slurm

In [31]:
import time

In [32]:
# Submit one Slurm job per selected subject by handing that subject's shell
# script to `sbatch`.
#
# After the loop:
#   count              -- number of subjects iterated over
#   failed_submissions -- subject ids whose `sbatch` call exited non-zero
#
# `sbatch` is invoked with an argument list (no shell involved), so the script
# path is passed through verbatim. No throttling is applied between
# submissions. If the `sbatch` executable itself cannot be found,
# subprocess.run raises FileNotFoundError and the loop stops immediately.
import subprocess

count = 0
failed_submissions = []
for folder_name in subjects_select:
    count += 1
    script = "/home/ashteng/TCR_usages/new_BLAST_shs/{}.sh".format(folder_name)
    print(folder_name)
    # PIPE + universal_newlines (rather than capture_output/text) keeps this
    # runnable on Python 3.5/3.6 as well.
    submission = subprocess.run(
        ["sbatch", script],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    # Echo sbatch's own message (e.g. "Submitted batch job <id>") into the cell.
    print(submission.stdout, end="")
    if submission.returncode != 0:
        # Record the failure instead of letting it scroll past unnoticed.
        failed_submissions.append(folder_name)
        print("sbatch failed for {} (exit code {}): {}".format(
            folder_name, submission.returncode, submission.stderr.strip()))

# One-line summary so failed subjects can be resubmitted without reading
# through the whole output.
if failed_submissions:
    print("{} of {} submissions failed: {}".format(
        len(failed_submissions), count, failed_submissions))

LD0244
Submitted batch job 15870565
LD0044
Submitted batch job 15870566
LD0662
Submitted batch job 15870567
LD1097
Submitted batch job 15870568
LD0903
Submitted batch job 15870569
LD0581
Submitted batch job 15870570
LD0266
Submitted batch job 15870571
LD0251
Submitted batch job 15870572
LD1077
Submitted batch job 15870573
LD0097
Submitted batch job 15870574
LD0028
Submitted batch job 15870575
LD0684
Submitted batch job 15870576
LD0923
Submitted batch job 15870577
LD0089
Submitted batch job 15870578
LD0366
Submitted batch job 15870579
LD0915
Submitted batch job 15870580
LD0345
Submitted batch job 15870581
LD1217
Submitted batch job 15870582
LD0965
Submitted batch job 15870583
LD0457
Submitted batch job 15870584
LD1235
Submitted batch job 15870585
LD0354
Submitted batch job 15870586
LD0090
Submitted batch job 15870587
LD0609
Submitted batch job 15870588
LD0024
Submitted batch job 15870589
LD0208
Submitted batch job 15870590
LD0808
Submitted batch job 15870591
LD1017
Submitted batch job 1

LD0970
Submitted batch job 15870793
LD0772
Submitted batch job 15870794
LD0867
Submitted batch job 15870795
LD0572
Submitted batch job 15870796
LD0201
Submitted batch job 15870797
LD0289
Submitted batch job 15870798
LD0491
Submitted batch job 15870799
LD1236
Submitted batch job 15870800
LD1220
Submitted batch job 15870801
LD0013
Submitted batch job 15870802
LD0075
Submitted batch job 15870803
LD0492
Submitted batch job 15870804
LD1042
Submitted batch job 15870805
LD0766
Submitted batch job 15870806
LD0018
Submitted batch job 15870807
LD0545
Submitted batch job 15870808
LD1153
Submitted batch job 15870809
LD1272
Submitted batch job 15870810
LD0812
Submitted batch job 15870811
LD0652
Submitted batch job 15870812
LD0278
Submitted batch job 15870813
LD0653
Submitted batch job 15870814
LD0498
Submitted batch job 15870815
LD0041
Submitted batch job 15870816
LD0124
Submitted batch job 15870817
LD0158
Submitted batch job 15870818
LD0061
Submitted batch job 15870819
LD0513
Submitted batch job 1

LD1087
Submitted batch job 15871021
LD0247
Submitted batch job 15871022
LD0954
Submitted batch job 15871023
LD0255
Submitted batch job 15871024
LD0371
Submitted batch job 15871025
LD0557
Submitted batch job 15871026
LD0674
Submitted batch job 15871027
LD0133
Submitted batch job 15871028
LD0724
Submitted batch job 15871029
LD0210
Submitted batch job 15871030
LD0213
Submitted batch job 15871031
LD0647
Submitted batch job 15871032
LD0796
Submitted batch job 15871033
LD0411
Submitted batch job 15871034
LD0359
Submitted batch job 15871035
LD0387
Submitted batch job 15871036
LD0363
Submitted batch job 15871037
LD0084
Submitted batch job 15871038
LD1287
Submitted batch job 15871039
LD1286
Submitted batch job 15871040
LD0986
Submitted batch job 15871041
LD0592
Submitted batch job 15871042
LD0209
Submitted batch job 15871043
LD1211
Submitted batch job 15871044
LD0738
Submitted batch job 15871045
LD0737
Submitted batch job 15871046
LD0427
Submitted batch job 15871047
LD1141
Submitted batch job 1

LD0598
Submitted batch job 15871249
LD0748
Submitted batch job 15871250
LD0775
Submitted batch job 15871251
LD0843
Submitted batch job 15871252
LD0507
Submitted batch job 15871253
LD0248
Submitted batch job 15871254
LD1099
Submitted batch job 15871255
LD1251
Submitted batch job 15871256
LD0494
Submitted batch job 15871257
LD0873
Submitted batch job 15871258
LD0801
Submitted batch job 15871259
LD1178
Submitted batch job 15871260
LD1280
Submitted batch job 15871261
LD0392
Submitted batch job 15871262
LD0440
Submitted batch job 15871263
LD0649
Submitted batch job 15871264
LD1001
Submitted batch job 15871265
LD0665
Submitted batch job 15871266
LD1094
Submitted batch job 15871267
LD0475
Submitted batch job 15871268
LD0856
Submitted batch job 15871269
LD0514
Submitted batch job 15871270
LD1039
Submitted batch job 15871271
LD1263
Submitted batch job 15871272
LD0974
Submitted batch job 15871273
LD1085
Submitted batch job 15871274
LD0976
Submitted batch job 15871275
LD0094
Submitted batch job 1

In [20]:
# NOTE(review): this whole cell is commented out and its execution count (20)
# is out of sequence with the cells around it. It is a variant of the
# submission loop that indexes `subjects_select` by genotype ('zero') and
# submits "<subject>_<genotype>.sh" scripts from the working directory.
# Presumably left over from an earlier run -- confirm and delete if unused.
# count = 0
# for genotype in ['zero']:
#     for folder_name in subjects_select[genotype]:
#         count += 1
# #         if count % 5000 == 0:
# #             time.sleep(3600)
#         script = "{}_{}.sh".format(folder_name, genotype)
#         print(folder_name)
#         !sbatch {script}