Step 1 - Import Python libraries

In [None]:
%pip install IPython
%pip install pandas

from IPython import get_ipython
import pandas as pd
import glob 
import os 

Step 2 - Load and read your data file (Note: run Step 2 and Step 3 once for each sample file in order to convert all of them to the pyTCR standardized format)

In [None]:
# Detect whether this notebook is being executed by Google Colab:
# the check looks for 'google.colab' in the string form of the IPython shell.
shell_description = str(get_ipython())
isInGoogle = 'google.colab' in shell_description

# Google Drive is mounted only when running on Colab
if isInGoogle:
    from google.colab import drive
    drive.mount('/content/drive')

The `filePath` variable in the following code cell should be changed to the location of your file. The following options are supported:
1. A `filePath` from Google Drive (to run on a cloud environment)
2. A `filePath` from your local computer (to run on a local environment; other cloud environments should work as expected)

In [None]:
# Location of the input file (a Google Drive path or a local path)
filePath = "/content/sample_file"

# File name without its directory; reused later to name the converted .csv file
targetFilename = os.path.basename(filePath)

# Read the tab-delimited file with the C parser.
# low_memory=False: parse the file in one go instead of in internal chunks.
df_samples = pd.read_csv(
    filePath,
    sep="\t",
    engine="c",
    low_memory=False,
)

# Preview the first rows
df_samples.head()

Step 3 - Convert data to the pyTCR standardized format:
| column | name | description                                    |
|--:|:---------|:------------------------------------------------|
| 1   | `sample`  | The name of the sample                       |
| 2   | `freq`    | The share of clonotypes in the sample        |
| 3   | `#count`  | The number of reads                          |
| 4   | `cdr3aa`  | CDR3 amino acid clonotype                    |
| 5   | `cdr3nt`  | CDR3 nucleotide                              |
| 6   | `v`       | V gene                                       |
| 7   | `d`       | D gene                                       |
| 8   | `j`       | J gene                                       |
| ... | optional fields | any other fields intended for your use |

- Modify the `required_columns` below to match the column names from your data that are equivalent to pyTCR's columns in the same order as described above
- The following code will create a new `.csv` file with the correct pyTCR column names and place it in the current directory
- Set `optional_columns` to an empty list (`optional_columns = []`) if your data doesn't have any optional columns

In [None]:
# If you have MiXCR results, please run this code cell
#
# Strip the parenthesised part from each "...HitsWithScore" column.
# The pattern is greedy, so everything from the first "(" to the last ")"
# in a value is removed.
#
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the values unchanged.
for hits_column in ('allVHitsWithScore', 'allDHitsWithScore', 'allJHitsWithScore'):
    df_samples[hits_column] = df_samples[hits_column].str.replace(
        r"\(.*\)", "", regex=True
    )

In [None]:
# Enter the column names from your data that represent the required pyTCR columns
# (same order as the pyTCR columns: sample, freq, #count, cdr3aa, cdr3nt, v, d, j)
required_columns = [
    'sample_name', 'frequency', 'templates',
    'amino_acid', 'rearrangement', 'v_resolved', 'd_resolved', 'j_resolved',
]

# Extra columns to carry over under their original names;
# use an empty list if your data has none
optional_columns = ['hospitalized']

# pyTCR standard names, position-matched with `required_columns`
pytcr_columns = ['sample', 'freq', '#count', 'cdr3aa', 'cdr3nt', 'v', 'd', 'j']

if len(required_columns) != len(pytcr_columns):
    raise ValueError(
        f"required_columns must list exactly {len(pytcr_columns)} columns, "
        f"got {len(required_columns)}"
    )

selected_columns = required_columns + optional_columns

# DataFrame.filter silently drops labels that are not present, which would
# otherwise only surface as an opaque length-mismatch error when renaming.
# Report the missing names explicitly instead.
missing_columns = [name for name in selected_columns if name not in df_samples.columns]
if missing_columns:
    raise ValueError(f"Columns not found in the input data: {missing_columns}")

df_new = df_samples.filter(items=selected_columns)

# Rename the columns to pyTCR standard names
df_new.columns = pytcr_columns + optional_columns

# Write the converted sample to the current directory; missing values become '.'
df_new.to_csv(f'./{targetFilename}.csv', na_rep='.', index=False)

Step 4 - Combine all sample files

Add a new column `sample` to each `.csv` file in the current directory with the filename as the value
- This is useful for converting data in other formats that do not contain a column with a sample name

In [None]:
# Name of the combined output file. It also ends in .csv, so it has to be
# excluded from the inputs; otherwise re-running this cell would merge the
# previous result back in and duplicate every row.
combined_filename = "combined_data.csv"

# Sorted so that the row order of the combined file is reproducible
globbed_files = sorted(
    path for path in glob.glob("*.csv") if path != combined_filename
)

if not globbed_files:
    raise ValueError("No .csv sample files found in the current directory")

data = []

for csv_path in globbed_files:
    dataframe = pd.read_csv(csv_path)
    # Sample name = file name (without directory) up to its first dot
    dataframe['sample'] = os.path.basename(csv_path).split('.')[0]
    data.append(dataframe)

combined_data = pd.concat(data)
combined_data.to_csv(combined_filename, index=False)

# Read the result back, using its first column as the index, for display
df = pd.read_csv(combined_filename, index_col=[0])

df

Convert .csv file to .tsv file

In [None]:
targetFileExtension = 'tsv'

# Relative path: Step 4 writes combined_data.csv to the current directory,
# so read it from there instead of a hard-coded /content location
# (which only exists on Google Colab).
sourceFile = "combined_data.csv"

df = pd.read_csv(sourceFile, low_memory=False, engine="c")

# Path of the source file without its .csv extension
file = os.path.splitext(sourceFile)[0]

newFile = f'{file}.{targetFileExtension}'

# Save new file to current directory (tab-separated, missing values as '.')
df.to_csv(newFile, sep='\t', na_rep='.', index=False)