In [1]:
%%capture
import sys
import os

try:
    import pyclonal
except ImportError:
    pyclonal_path = os.path.abspath(os.path.join('..'))
    if pyclonal_path not in sys.path:
        sys.path.append(pyclonal_path)

from pyclonal.io import combineFiles

## Supported formats

The following formats are supported out of the box:

- `CHANGEO`
- `CHANGEO` without `SAMPLE` field
- `MIXCR`
- `VDJtools`
- `MITCR`
- `ImmunoSeq`

Each format defines the column names that contain the data to be imported: read count, CDR3 amino acid sequence, and (for the `CHANGEO` format) sample name. The other formats take the sample name from the file name, under the assumption that one file corresponds to one sample. Each format also defines a unique set of column names (`FMT_COLS`) that is used for format autodetection.

```python
    # Data columns per format: name -> (read-count column, CDR3 amino-acid
    # sequence column, sample-name column). None in the last slot means the
    # sample name is taken from the file name rather than from a column.
    # NOTE(review): the 'mitcr' column names here use underscores while the
    # 'mitcr' entry in FMT_COLS uses spaces — confirm which spelling matches
    # real files.
    FORMATS = {
            'mixcr': ('cloneCount', 'aaSeqCDR3', None),
            'changeo': ('DUPCOUNT', 'CLONE_CDR3_AA', 'SAMPLE'),
            'changeof': ('DUPCOUNT', 'CLONE_CDR3_AA', None),
            'vdjtools': ('count', 'cdr3aa', None),
            'mitcr': ('Read_count', 'CDR3_amino_acid_sequence', None),
            'immunoseq': ('count (templates/reads)', 'aminoAcid', None)
            }


    # Column names used to autodetect each format.
    # NOTE(review): there is no 'changeof' entry here — presumably that format
    # is not autodetected and must be requested explicitly; confirm.
    FMT_COLS = {
            "mixcr": ["clonalSequenceQuality", "minQualFR1", "allDAlignments"],
            "changeo": ["SEQUENCE_ID", "JUNCTION_LENGTH", "CLONE_CDR3_AA"],
            "vdjtools": ["freq", "cdr3nt", "cdr3aa"],
            "immunoseq": ["aminoAcid", "frequencyCount", "cdr3Length"],
            "mitcr": ["Read count", "CDR3 amino acid sequence", "V segments"],
            }
```

A small selection of example data files in different formats can be found in `../sample_input_files`.

You can create your own format by specifying sets of data columns and `FMT_COLS`. See examples below.

In [2]:
ls -lah ../sample_input_files/

total 28K
drwxr-xr-x 6 ilya ilya 4.0K Aug  8 15:08 [0m[01;34m.[0m/
drwxr-xr-x 9 ilya ilya 4.0K Aug  8 15:11 [01;34m..[0m/
drwxr-xr-x 2 ilya ilya 4.0K Aug  8 15:08 [01;34mchangeo[0m/
drwxr-xr-x 2 ilya ilya 4.0K Aug  7 11:23 [01;34mImmunoSeq[0m/
-rw-r--r-- 1 ilya ilya 1.1K Aug  8 10:12 metadata_demo.csv
drwxr-xr-x 2 ilya ilya 4.0K Aug  8 15:08 [01;34mMixcr[0m/
drwxr-xr-x 2 ilya ilya 4.0K Aug  8 15:08 [01;34mvdjtools[0m/


## `CHANGEO` format

In [3]:
# Import the files in the changeo directory whose names match `pattern`; the log
# below shows each file being recognised as a changeo file. The call returns two
# tables: clone_df and seq_df.
clone_df, seq_df = combineFiles('../sample_input_files/changeo', pattern='D*.tsv')

../sample_input_files/changeo/D255.changeo_small_demo.tsv looks like a changeo file
../sample_input_files/changeo/D280.changeo_small_demo.tsv looks like a changeo file
../sample_input_files/changeo/D287.changeo_small_demo.tsv looks like a changeo file
../sample_input_files/changeo/D299.changeo_small_demo.tsv looks like a changeo file
../sample_input_files/changeo/D233.changeo_small_demo.tsv looks like a changeo file


In [4]:
# Table with one row per integer index and a single `Sequence` column
# holding the amino-acid sequence.
seq_df

Unnamed: 0_level_0,Sequence
Index,Unnamed: 1_level_1
0,CAISEVIARRYEQY
1,CASSTTLGGDGYT
2,CASSYKLTWGEHYGYT
3,CASSSGTGILGEQY
4,CASKSGGGYNEQF
5,CATRSEAPY
6,CASGPSTTEAF
7,CASSTRLLNTIY
8,CSVEEYNEQF
9,CASSERGTGELF


In [5]:
# Count table: one row per sample, one column per integer clone label
# (presumably the same index used by seq_df — confirm).
clone_df

Unnamed: 0_level_0,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,...,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
D233_1,1.0,8.0,9.0,1.0,10.0,2.0,4.0,6.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D233_2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D233_3,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D233_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D233_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D233_6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D233_7,2.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D233_8,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D255_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D255_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## `MIXCR` format

In [6]:
ls -lah ../sample_input_files/Mixcr/

total 4.4M
drwxr-xr-x 2 ilya ilya 4.0K Aug  8 15:08 [0m[01;34m.[0m/
drwxr-xr-x 6 ilya ilya 4.0K Aug  8 15:08 [01;34m..[0m/
-rw-r--r-- 1 ilya ilya  39K Aug  6 15:32 combined.tab
-rw-r--r-- 1 ilya ilya 717K Aug  6 11:34 default_mixcr_output_format.txt
-rw-r--r-- 1 ilya ilya 336K Aug  6 15:20 p1s1.txt
-rw-r--r-- 1 ilya ilya  89K Aug  6 15:20 p1s2.txt
-rw-r--r-- 1 ilya ilya 306K Aug  6 15:20 p2s1.txt
-rw-r--r-- 1 ilya ilya  96K Aug  6 15:20 p2s2.txt
-rw-r--r-- 1 ilya ilya 717K Aug  6 15:20 p3s1.txt
-rw-r--r-- 1 ilya ilya 505K Aug  6 15:20 p3s2.txt
-rw-r--r-- 1 ilya ilya 553K Aug  6 15:20 p4s1.txt
-rw-r--r-- 1 ilya ilya 158K Aug  6 15:20 p4s2.txt
-rw-r--r-- 1 ilya ilya 556K Aug  6 15:20 p5s1.txt
-rw-r--r-- 1 ilya ilya 356K Aug  6 15:20 p5s2.txt


In [7]:
# Import the per-sample MiXCR exports matching `pattern`; the log below shows
# each file being recognised as a mixcr file. Row labels in the resulting
# clone_df (p1s1, p1s2, ...) correspond to the file names without extension.
clone_df, seq_df = combineFiles('../sample_input_files/Mixcr', pattern='p*.txt')

../sample_input_files/Mixcr/p4s1.txt looks like a mixcr file
../sample_input_files/Mixcr/p1s2.txt looks like a mixcr file
../sample_input_files/Mixcr/p5s1.txt looks like a mixcr file
../sample_input_files/Mixcr/p3s2.txt looks like a mixcr file
../sample_input_files/Mixcr/p5s2.txt looks like a mixcr file
../sample_input_files/Mixcr/p1s1.txt looks like a mixcr file
../sample_input_files/Mixcr/p2s1.txt looks like a mixcr file
../sample_input_files/Mixcr/p4s2.txt looks like a mixcr file
../sample_input_files/Mixcr/p3s1.txt looks like a mixcr file
../sample_input_files/Mixcr/p2s2.txt looks like a mixcr file


In [8]:
# Count table for the MiXCR import: rows are samples, columns are integer
# clone labels (presumably indices into seq_df — confirm).
clone_df

Unnamed: 0_level_0,1587,1601,5619,4339,1720,5620,1701,1825,5621,5622,...,5609,5610,5611,5612,5613,5614,5615,5616,5617,5618
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p1s1,49.0,37.0,26.0,25.0,22.0,18.0,19.0,17.0,16.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p1s2,276.0,13.0,0.0,0.0,2.0,0.0,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p2s1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p2s2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p3s1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p3s2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p4s1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p4s2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p5s1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p5s2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Index -> amino-acid sequence table for the MiXCR import. Some sequences in
# the output contain `*` or `_` characters.
seq_df

Unnamed: 0_level_0,Sequence
Index,Unnamed: 1_level_1
0,CASSREPGGLNTEAFF
1,CAMSPGDGGSQGNLIF
2,CMQALQTPFTF
3,CASS*GTGD_IVADTQYF
4,CARDQDGPGGTIDYW
5,CASSREPTTLNTEAFF
6,CQQYASSPRTF
7,CAVASNDYKLSF
8,CQQYDIAPATF
9,CAVNQCW_NNRKLIW


## `VDJtools` format

In [10]:
# Import the VDJtools example; only one file matches `pattern`, so the
# resulting clone_df has a single sample row named after that file.
clone_df, seq_df = combineFiles('../sample_input_files/vdjtools', pattern='vdj*.txt')

../sample_input_files/vdjtools/vdjtools_format.txt looks like a vdjtools file


In [11]:
# Count table for the VDJtools import: a single sample row, one column per
# integer clone label.
clone_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
vdjtools_format,35,23,21,19,16,15,12,11,11,9,...,1,1,1,1,1,1,1,1,1,1


In [12]:
# Index -> amino-acid sequence table for the VDJtools import.
seq_df

Unnamed: 0_level_0,Sequence
Index,Unnamed: 1_level_1
0,CQHYDTLPIFTF
1,CASSLEEGEAFF
2,CQQYGSSPRTF
3,CASSLPRWATYNEQFF
4,CASRDSYSPLHF
5,CASSFARGGYEQYF
6,CAHRRPPIWPFDYW
7,CASSRTPLRGAGELFF
8,CSSYTSSSTWVF
9,CASSLARGAGEQFF


## `ImmunoSeq` format

In [13]:
# Import the ImmunoSeq exports matching `pattern`; the log below shows each
# file being recognised as an immunoseq file. Sample names in clone_df
# correspond to the file names without extension.
clone_df, seq_df = combineFiles('../sample_input_files/ImmunoSeq', pattern='*.tsv')

../sample_input_files/ImmunoSeq/D238_ILN_CD8_NAIVE.tsv looks like a immunoseq file
../sample_input_files/ImmunoSeq/D201_ILN_CD8_TEM.tsv looks like a immunoseq file
../sample_input_files/ImmunoSeq/D238_LLN_CD8_NAIVE.tsv looks like a immunoseq file
../sample_input_files/ImmunoSeq/D201_ILN_CD8_NAIVE.tsv looks like a immunoseq file


In [14]:
# Count table for the ImmunoSeq import: rows are samples, columns are integer
# clone labels. Column 1 carries by far the largest counts in every sample.
clone_df

Unnamed: 0_level_0,11396,11397,906,11398,11399,11400,11401,1,11402,11403,...,11386,11387,11388,11389,11390,11391,11392,11393,11394,11395
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
D201_ILN_CD8_NAIVE,2.0,1.0,2.0,2.0,2.0,2.0,2.0,8183,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D201_ILN_CD8_TEM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3015,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D238_ILN_CD8_NAIVE,0.0,0.0,1.0,0.0,0.0,0.0,0.0,619,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D238_LLN_CD8_NAIVE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,563,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
# Index -> amino-acid sequence table for the ImmunoSeq import.
# NOTE(review): index 1 has an empty sequence in the output below —
# presumably rows without an amino-acid sequence are pooled under one index;
# confirm whether they should be filtered before analysis.
seq_df

Unnamed: 0_level_0,Sequence
Index,Unnamed: 1_level_1
0,CASSPGAGGEQYF
1,
2,CASRLDNEQYF
3,CASSQRTSYEQYF
4,CASSFGGASYEQYF
5,CASSQIWDKAYEQYF
6,CASSYSSEQYF
7,CASSYSPGDYEQYF
8,CASSLRRFYEQYF
9,CASSSSTGPRGTQHF


## Specifying custom data format

To specify and use a custom data format, use the `pyclonal.io.FmtReader` class like so:

```python
import glob
from pyclonal.io import FmtReader

# Define the custom format: name -> (count column, sequence column, sample column).
# None as the sample column means the sample name is taken from the file name.
MyFORMAT = {'myfmt': ('myCountCol', 'mySeqCol', None),}
# Columns used to autodetect the format; the key must be the same format name
# that is used in MyFORMAT.
MyFMT_COLS = {'myfmt': ['mycol1', 'mycol2', 'mycol3'],}

# get the list of files you want to import
myfiles = glob.glob('datadir/*.tsv')

# create the reader and import files
reader = FmtReader(myfiles, fmt=MyFORMAT, fmt_cols=MyFMT_COLS)
clone_df, seq_df = reader.process_files()
```