Step 1 - Install and import Python libraries

In [None]:
%pip install IPython
%pip install matplotlib
%pip install pandas
%pip install seaborn
%pip install scipy

from IPython import get_ipython
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook
pd.set_option('mode.chained_assignment', None)

Step 2 - Load and read your data file
- pyTCR accepts a single `.csv` file that should contain all the samples.
  - The following cell attempts to detect whether you are running the notebook in a Google Colab cloud environment or in a local environment, and then loads the data at the specified path.
- The `filePath` variable in the following code cell should be changed to the location of your file. The following options are supported:
  1. A `filePath` from Google Drive (to run on a cloud environment)
  2. A `filePath` from your local computer (to run in a local environment; other cloud environments should also work as expected)
- Adjust the `optional_fields` according to your data

In [None]:
# Detect a Google Colab runtime from the IPython shell description and,
# when one is found, mount Google Drive
shell_description = str(get_ipython())
isInGoogle = 'google.colab' in shell_description

if isInGoogle:
    # Imported only after Colab was detected
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
# Location of the combined CSV file: a local path, or a Google Drive path
# such as "/content/drive/MyDrive/combined_data.csv"
filePath = "../combined_data.csv"

# Parse with the C engine; low_memory=False lets pandas infer each column's
# dtype from the whole file instead of chunk by chunk
df = pd.read_csv(
    filePath,
    engine="c",
    low_memory=False,
)

# Optional metadata columns available in the data (adjust to your file)
optional_fields = ['hospitalized']

# Preview the first rows of the loaded table
df.head()

Motif analysis 1.1 - Amino acid spectratype

In [None]:
# Add CDR3 amino acid length as a new column
# (a missing 'cdr3aa' value gives a missing length, and groupby drops those rows)
df['aa_length'] = df['cdr3aa'].str.len()

# Sum the 'freq' column per sample / hospitalization status / CDR3 length.
# Named aggregation is used because the dict-renaming form
# `['freq'].agg({'spectratype': 'sum'})` raises SpecificationError
# ("nested renamer is not supported") on pandas >= 1.0.
df_aa_spectratype = (
    df.groupby(['sample', 'hospitalized', 'aa_length'], as_index=False)
      .agg(spectratype=('freq', 'sum'))
)

df_aa_spectratype

Motif analysis 1.2 - Show the most frequent amino acid length in each sample

In [None]:
# For every sample, locate the row label holding its largest spectratype value,
# then pull exactly those rows out of the spectratype table
peak_row_labels = df_aa_spectratype.groupby('sample')['spectratype'].idxmax()
df_aa_max_spectratype = df_aa_spectratype.loc[peak_row_labels]

df_aa_max_spectratype

Motif analysis 2.1 - Amino acid motif count (result table)

In [None]:
# Define the function to count amino acid motifs (k is the length of the motif)
def aamotif(k, aa_list):
    """Count every overlapping length-k substring (motif) across sequences.

    Parameters
    ----------
    k : int
        Motif length; must be at least 1.
    aa_list : iterable
        Sequences to scan (for example the ``cdr3aa`` column). Entries that
        are not strings, such as the NaN/None pandas uses for missing values,
        are skipped instead of raising ``TypeError``.

    Returns
    -------
    dict
        Maps each motif to the number of times it occurs, counting every
        start position in every sequence. Sequences shorter than ``k``
        contribute nothing.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    aamotifCount = {}
    for aa in aa_list:
        # Missing sequences arrive as float NaN / None and have no length
        if not isinstance(aa, str):
            continue
        for i in range(len(aa) - k + 1):
            motif = aa[i:i + k]
            aamotifCount[motif] = aamotifCount.get(motif, 0) + 1
    return aamotifCount

# Motif length used for the amino acid motif count (6 is used as an example)
AA_MOTIF_LENGTH = 6

# Sample names in order of first appearance. Iterating over `set(...)` gave no
# guaranteed order, so the row order of the result could change between runs.
samples = df['sample'].dropna().unique()

# Collect one motif-count table per sample and concatenate them once at the end.
# `DataFrame.append` was removed in pandas 2.0, and growing a dataframe inside a
# loop re-copies all previously collected rows on every iteration.
aa_motif_frames = []
for sample in samples:

    # CDR3 amino acid sequences that belong to this sample
    sample_cdr3aa = df.loc[df['sample'] == sample, 'cdr3aa']

    motif_counts = aamotif(AA_MOTIF_LENGTH, sample_cdr3aa)
    if not motif_counts:
        # No motif of this length in the sample; it contributes no rows
        continue

    df_temp = pd.DataFrame(list(motif_counts.items()), columns=['motif', 'count'])
    df_temp['sample'] = sample
    aa_motif_frames.append(df_temp)

if aa_motif_frames:
    df_aa_motif = pd.concat(aa_motif_frames, ignore_index=True)
else:
    # Nothing was counted: fall back to an empty table with the expected columns
    df_aa_motif = pd.DataFrame(columns=['motif', 'count', 'sample'])

# Add the hospitalization information as a column. This runs once, after the
# loop; inside the loop it was recomputed for every sample and `df_aa_motif_1`
# stayed undefined when there were no samples.
df_aa_motif_1 = df_aa_motif.merge(df_aa_max_spectratype[['sample', 'hospitalized']], on='sample')

df_aa_motif_1

In [None]:
# For each sample, keep the row of its most abundant amino acid motif
top_aa_motif_labels = df_aa_motif_1.groupby('sample')['count'].idxmax()
df_aa_motif_most = df_aa_motif_1.loc[top_aa_motif_labels]

df_aa_motif_most

Motif analysis 2.2 - Amino acid motif count (plots)

In [None]:
# Thresholds for the plot below (tune them to the size of your dataset)
AA_MOTIF_MIN_COUNT = 9999    # a motif must be counted MORE than this many times in a sample
AA_MOTIF_MIN_SAMPLES = 2     # a motif must appear in MORE than this many samples per group

# Drop the helper column left behind by an earlier run of this cell; without this,
# re-running the cell makes the merge below emit 'number_of_samples_x' / '_y' columns
df_aa_motif_1 = df_aa_motif_1.drop(columns='number_of_samples', errors='ignore')

# Keep motif counts greater than 9999 (counts of 9999 or less are removed)
df_aa_motif_1 = df_aa_motif_1[df_aa_motif_1['count'] > AA_MOTIF_MIN_COUNT]

# Select the motifs that are present in more than 2 samples of the same
# hospitalization group (each remaining row is one sample/motif pair)
df_aa_motif_2 = df_aa_motif_1.groupby(['hospitalized', 'motif'], sort=False).size().reset_index(name='number_of_samples')
df_aa_motif_2 = df_aa_motif_2[df_aa_motif_2['number_of_samples'] > AA_MOTIF_MIN_SAMPLES]

# Inner merge: keeps only the rows whose (hospitalized, motif) pair passed the filter
df_aa_motif_1 = pd.merge(df_aa_motif_1, df_aa_motif_2, on=['hospitalized', 'motif'])
df_aa_motif_1

In [None]:
attribute = "hospitalized"

# `plt.subplots` returns a (figure, axes) tuple; unpack it instead of binding the
# whole tuple to `ax`, and draw on that axes explicitly rather than relying on
# matplotlib's implicit "current axes" state.
fig, ax = plt.subplots(figsize=(8, 5))

sns.stripplot(data=df_aa_motif_1, x='motif', y='count', hue=attribute,
              dodge=True, size=6, linewidth=1, ax=ax)

ax.set_xlabel('Amino acid motif', fontsize=20)
ax.set_ylabel('Count', fontsize=20)
ax.tick_params(axis='x', labelsize=18, labelrotation=90)
ax.tick_params(axis='y', labelsize=18)

# get_legend() returns None when no legend was drawn (presumably the case when
# no motif passed the filters), so guard before styling it
legend = ax.get_legend()
if legend is not None:
    plt.setp(legend.get_texts(), fontsize=20)
    plt.setp(legend.get_title(), fontsize=20)
sns.despine(ax=ax)

plt.show()

Motif analysis 3.1 - Nucleotide sequence motif count (result table)

In [None]:
# Define the function to count nucleotide motifs (k is the length of the motif)
def ntmotif(k, nt_list):
    """Count every overlapping length-k substring (motif) across sequences.

    Parameters
    ----------
    k : int
        Motif length; must be at least 1.
    nt_list : iterable
        Sequences to scan (for example the ``cdr3nt`` column). Entries that
        are not strings, such as the NaN/None pandas uses for missing values,
        are skipped instead of raising ``TypeError``.

    Returns
    -------
    dict
        Maps each motif to the number of times it occurs, counting every
        start position in every sequence. Sequences shorter than ``k``
        contribute nothing.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    ntmotifCount = {}
    for nt in nt_list:
        # Missing sequences arrive as float NaN / None and have no length
        if not isinstance(nt, str):
            continue
        for i in range(len(nt) - k + 1):
            motif = nt[i:i + k]
            ntmotifCount[motif] = ntmotifCount.get(motif, 0) + 1
    return ntmotifCount

# Motif length used for the nucleotide motif count (6 is used as an example)
NT_MOTIF_LENGTH = 6

# Sample names in order of first appearance. Iterating over `set(...)` gave no
# guaranteed order, so the row order of the result could change between runs.
samples = df['sample'].dropna().unique()

# Collect one motif-count table per sample and concatenate them once at the end.
# `DataFrame.append` was removed in pandas 2.0, and growing a dataframe inside a
# loop re-copies all previously collected rows on every iteration.
nt_motif_frames = []
for sample in samples:

    # CDR3 nucleotide sequences that belong to this sample
    sample_cdr3nt = df.loc[df['sample'] == sample, 'cdr3nt']

    motif_counts = ntmotif(NT_MOTIF_LENGTH, sample_cdr3nt)
    if not motif_counts:
        # No motif of this length in the sample; it contributes no rows
        continue

    df_temp = pd.DataFrame(list(motif_counts.items()), columns=['motif', 'count'])
    df_temp['sample'] = sample
    nt_motif_frames.append(df_temp)

if nt_motif_frames:
    df_nt_motif = pd.concat(nt_motif_frames, ignore_index=True)
else:
    # Nothing was counted: fall back to an empty table with the expected columns
    df_nt_motif = pd.DataFrame(columns=['motif', 'count', 'sample'])

# Add the hospitalization information as a column, using the one-row-per-sample
# spectratype table as the sample -> hospitalized lookup. This runs once, after
# the loop; inside the loop it was recomputed for every sample and
# `df_nt_motif_1` stayed undefined when there were no samples.
df_nt_motif_1 = df_nt_motif.merge(
    df_aa_max_spectratype[['sample', 'hospitalized']], on='sample')

df_nt_motif_1

In [None]:
# For each sample, keep the row of its most abundant nucleotide motif
top_nt_motif_labels = df_nt_motif_1.groupby('sample')['count'].idxmax()
df_nt_motif_most = df_nt_motif_1.loc[top_nt_motif_labels]

df_nt_motif_most

Motif analysis 3.2 - Nucleotide sequence motif count (plots)

In [None]:
# Thresholds for the plot below (tune them to the size of your dataset)
NT_MOTIF_MIN_COUNT = 150000  # a motif must be counted MORE than this many times in a sample
NT_MOTIF_MIN_SAMPLES = 2     # a motif must appear in MORE than this many samples per group

# Drop the helper column left behind by an earlier run of this cell; without this,
# re-running the cell makes the merge below emit 'number_of_samples_x' / '_y' columns
df_nt_motif_1 = df_nt_motif_1.drop(columns='number_of_samples', errors='ignore')

# Keep motif counts greater than 150000 (counts of 150000 or less are removed)
df_nt_motif_1 = df_nt_motif_1[df_nt_motif_1['count'] > NT_MOTIF_MIN_COUNT]

# Select the motifs that are present in more than 2 samples of the same
# hospitalization group (each remaining row is one sample/motif pair)
df_nt_motif_2 = df_nt_motif_1.groupby(['hospitalized', 'motif'], sort=False).size().reset_index(name='number_of_samples')
df_nt_motif_2 = df_nt_motif_2[df_nt_motif_2['number_of_samples'] > NT_MOTIF_MIN_SAMPLES]

# Inner merge: keeps only the rows whose (hospitalized, motif) pair passed the filter
df_nt_motif_1 = pd.merge(df_nt_motif_1, df_nt_motif_2, on=['hospitalized', 'motif'])
df_nt_motif_1

In [None]:
attribute = "hospitalized"

# `plt.subplots` returns a (figure, axes) tuple; unpack it instead of binding the
# whole tuple to `ax`, and draw on that axes explicitly rather than relying on
# matplotlib's implicit "current axes" state.
fig, ax = plt.subplots(figsize=(8, 5))

sns.stripplot(data=df_nt_motif_1, x='motif', y='count', hue=attribute,
              dodge=True, size=6, linewidth=1, ax=ax)

ax.set_xlabel('Nucleotide motif', fontsize=20)
ax.set_ylabel('Count', fontsize=20)
ax.tick_params(axis='x', labelsize=18, labelrotation=90)
ax.tick_params(axis='y', labelsize=18)

# get_legend() returns None when no legend was drawn (presumably the case when
# no motif passed the filters), so guard before styling it
legend = ax.get_legend()
if legend is not None:
    plt.setp(legend.get_texts(), fontsize=20)
    plt.setp(legend.get_title(), fontsize=20)
sns.despine(ax=ax)

plt.show()