
Import Python libraries

In [None]:
from IPython import get_ipython
import pandas as pd
import glob 
import os 


Upload all the sample files

In [None]:
# Specify the path to your file, once for a local run and once for Google Colab.
# Keeping both lets the notebook run unchanged in either environment.
localFilePath = "/Users/jaden/Downloads/complete_COVID_samples.tsv"
colabFilePath = "/content/drive/MyDrive/complete_COVID_samples.tsv"

# The running shell's string form contains 'google.colab' when hosted on Colab.
isInGoogle = 'google.colab' in str(get_ipython())

if isInGoogle:
    # Drive has to be mounted before anything under /content/drive is readable.
    from google.colab import drive
    drive.mount('/content/drive')

# Pick the path that matches the environment. Previously the local path was
# read even after mounting Drive on Colab, where a /Users/... path is not present.
filePath = colabFilePath if isInGoogle else localFilePath

# File name only (extension included); used to build output file names.
targetFilename = os.path.basename(filePath)

# Tab-separated input. low_memory=False parses the file in a single pass so
# dtypes are inferred per whole column instead of per chunk.
df_samples = pd.read_table(filePath, low_memory=False, engine="c")

df_samples.head()

Convert data to the pyTCR standardized format:
| column | name | description                                    |
|--:|:---------|:------------------------------------------------|
| 1   | `sample`  | The name of the sample                       |
| 2   | `freq`    | The share of clonotypes in the sample        |
| 3   | `#count`  | The number of reads                          |
| 4   | `cdr3aa`  | CDR3 amino acid clonotype                    |
| 5   | `cdr3nt`  | CDR3 nucleotide                              |
| 6   | `v`       | V gene                                       |
| 7   | `d`       | D gene                                       |
| 8   | `j`       | J gene                                       |
| ... | optional fields | any other fields intended for your use |

- Modify the `required_columns` below to match the column names from your data that are equivalent to pyTCR's columns in the same order as described above.
- The following code will create a new `.tsv` file with the correct pyTCR column names and place it in the current directory.

In [None]:
# Enter the column names from your data that represent the required pyTCR columns,
# in the same order as `pytcr_columns` below (they are paired by position).
required_columns = [
    'sample', 'frequency', 'templates',
    'amino_acid', 'rearrangement', 'v_resolved', 'd_resolved', 'j_resolved'
]

# Extra columns from your data to carry over under their existing names.
optional_columns = ['hospitalized']

# pyTCR standard names for the required columns.
pytcr_columns = [
    'sample', 'freq', '#count', 'cdr3aa',
    'cdr3nt', 'v', 'd', 'j'
]

if len(required_columns) != len(pytcr_columns):
    raise ValueError(
        f"required_columns must list exactly {len(pytcr_columns)} names "
        f"(one per pyTCR column); got {len(required_columns)}."
    )

# Check up front that every requested column exists. DataFrame.filter silently
# drops unknown labels, which previously surfaced only as an opaque
# "Length mismatch" error when the new column names were assigned.
selected_columns = required_columns + optional_columns
missing_columns = [col for col in selected_columns if col not in df_samples.columns]
if missing_columns:
    raise ValueError(f"Columns not found in the input data: {missing_columns}")

# Copy the selection so renaming never touches df_samples.
df_new = df_samples[selected_columns].copy()

# rename the columns to pyTCR standard names
df_new.columns = pytcr_columns + optional_columns

# Options: 'tsv', 'txt'
ext = 'tsv'

# NOTE: targetFilename still carries the source file's extension, so the result
# is named e.g. "complete_COVID_samples.tsv.tsv". The name is left unchanged so
# the output can never overwrite an input file sitting in the current directory.
df_new.to_csv(f'./{targetFilename}.{ext}', sep='\t', na_rep='.', index=False)

Add a `sample` column to each file, using the file's name (without extension) as its value, then combine all files into one

In [None]:
# Name of the merged output. It matches the "*.csv" pattern itself, so it is
# excluded from the inputs; otherwise re-running this cell would fold the
# previous result back in and duplicate every row.
combined_filename = "combined_data.csv"

# Sorted so the row order of the combined file is the same on every run
# (glob returns files in arbitrary order).
globbed_files = sorted(
    path for path in glob.glob("*.csv")
    if os.path.basename(path) != combined_filename
)

if not globbed_files:
    # pd.concat([]) would raise a ValueError too, but without saying why.
    raise ValueError("No input .csv files found in the current directory.")

data = []

for csv_path in globbed_files:
    dataframe = pd.read_csv(csv_path)
    # splitext strips only the final extension, so "patient.01.csv" is labelled
    # "patient.01" rather than being cut at the first dot.
    dataframe['sample'] = os.path.splitext(os.path.basename(csv_path))[0]
    data.append(dataframe)

# ignore_index gives the combined frame one continuous row index instead of
# repeating each file's 0..n-1 labels.
combined_data = pd.concat(data, ignore_index=True)
combined_data.to_csv(combined_filename, index=False)

# Read the result back as a check. The file was written without an index
# column, so no index_col is given: the first column is real data.
df = pd.read_csv(combined_filename)
df

Utility - Convert .csv file to .tsv or .txt file

In [None]:
# Options: 'tsv', 'txt'
targetFileExtension = 'tsv'

# Parse the input as comma-separated text.
# NOTE(review): this rebinds df_samples, replacing the frame loaded by the
# earlier cell; re-run that cell before using df_samples for anything else.
df_samples = pd.read_csv(filePath, low_memory=False, engine="c")

# Strip only the final extension so names containing extra dots survive
# ("run.2021.csv" -> "run.2021"); splitting on the first dot cut them short.
file = os.path.splitext(targetFilename)[0]
newFile = f'{file}.new.{targetFileExtension}'
# Saves new file to current directory.
df_samples.to_csv(newFile, sep='\t', na_rep='.', index=False)
