Step 1 - Import Python libraries

In [None]:
from IPython import get_ipython
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Turn off pandas' chained-assignment check for the rest of the notebook.
pd.set_option('mode.chained_assignment', None)

Step 2 - Load and read the data file
1.   Upload data file from Google drive and run the notebook in the cloud
2.   Upload data file from local computer and run the notebook in the cloud
3.   Load data file from local computer and run the notebook locally

In [None]:
# Location of the input table; edit this to point at your copy of the data,
# either in Google Drive or on the local file system.
filePath = "/content/drive/MyDrive/complete_COVID_samples.tsv"

# True when the string form of the active IPython shell mentions google.colab
isInGoogle = 'google.colab' in str(get_ipython())

# Google Drive only has to be mounted when the notebook runs on Colab
if isInGoogle:
    from google.colab import drive
    drive.mount('/content/drive')

# parse the tab-separated file with the C parser, reading it in one pass
df = pd.read_csv(filePath, sep="\t", low_memory=False, engine="c")

df.head()


Step 2 - Option 2 - Upload data file from local computer


Step 2 - Option 3 - Run the notebook locally (see https://research.google.com/colaboratory/local-runtimes.html for how to connect Colab to a local runtime)

Motif analysis 1 - Amino acid spectratype

In [None]:
# add CDR3 amino acid length as a new column
df['aa_length'] = df['cdr3aa'].str.len()

# Sum 'freq' for every (sample, hospitalization, aa_length) combination and
# store the total in a column named 'spectratype'.
# Named aggregation is used because passing a renaming dict to a
# single-column groupby (``['freq'].agg({'spectratype': 'sum'})``) raises
# SpecificationError on pandas >= 1.0.
df_aa_spectratype = (
    df.groupby(['sample', 'hospitalization', 'aa_length'], as_index=False)
    .agg(spectratype=('freq', 'sum'))
)

df_aa_spectratype


Motif analysis 1.1 - Show the most frequent amino acid length in each sample

In [None]:
# row label of the largest 'spectratype' value within each sample
idx_top_length = df_aa_spectratype.groupby('sample')['spectratype'].idxmax()

# pull exactly those rows out of the spectratype table
df_aa_max_spectratype = df_aa_spectratype.loc[idx_top_length]

df_aa_max_spectratype


Motif analysis 1.2 - Amino acid motif count (result table)

In [None]:
# define the function to count amino acid motifs (k is the length of the motif)
def aamotif(k, aa_list):
    """Count every length-``k`` motif (substring) across the given sequences.

    Parameters
    ----------
    k : int
        Motif length.
    aa_list : iterable
        Sequences to scan, e.g. a pandas Series of CDR3 amino acid strings.
        Entries that have no length (such as ``None`` or a float NaN marking
        a missing value) are skipped instead of raising ``TypeError``.

    Returns
    -------
    dict
        Maps each motif to the number of times it was seen. Overlapping
        occurrences are counted, and sequences shorter than ``k`` contribute
        nothing.
    """
    aamotifCount = {}
    for aa in aa_list:
        try:
            seq_len = len(aa)
        except TypeError:
            # missing values (None / float NaN) have no length; skip them
            continue
        # slide a window of width k over the sequence
        for i in range(seq_len - k + 1):
            motif = aa[i:i + k]
            aamotifCount[motif] = aamotifCount.get(motif, 0) + 1
    return aamotifCount


# motif length analysed in this cell (6 is used as an example)
aa_motif_length = 6

# sample names in order of first appearance (deterministic, unlike a set)
samples = list(df['sample'].dropna().unique())

# one motif-count table per sample; they are concatenated once after the loop
aa_motif_frames = []

for sample in samples:

    # CDR3 amino acid sequences of this sample, without missing values
    sample_cdr3aa = df.loc[df['sample'] == sample, 'cdr3aa'].dropna()

    # count the motifs and turn the {motif: count} mapping into a table
    motif_counts = aamotif(aa_motif_length, sample_cdr3aa)
    df_temp = pd.DataFrame(list(motif_counts.items()),
                           columns=['motif', 'count'])
    df_temp['sample'] = sample
    aa_motif_frames.append(df_temp)

# DataFrame.append was removed in pandas 2.0; a single pd.concat over the
# collected tables replaces it and avoids re-copying the result every iteration
if aa_motif_frames:
    df_aa_motif = pd.concat(aa_motif_frames, ignore_index=True)
else:
    df_aa_motif = pd.DataFrame(columns=['motif', 'count', 'sample'])

# add the hospitalization information as a column
# (done once, after all samples have been collected, not on every iteration)
df_aa_motif_1 = df_aa_motif.merge(
    df_aa_max_spectratype[['sample', 'hospitalization']], on='sample')

df_aa_motif_1


In [None]:
# for each sample, find the row label of its highest motif count ...
idx_top_aa_motif = df_aa_motif_1.groupby('sample')['count'].idxmax()

# ... and keep only those rows: the most abundant amino acid motif per sample
df_aa_motif_most = df_aa_motif_1.loc[idx_top_aa_motif]
df_aa_motif_most

Motif analysis 1.3 - Amino acid motif count (plots)

In [None]:
# keep only motifs whose count is strictly greater than 5000
df_aa_motif_1 = df_aa_motif_1[df_aa_motif_1['count'] > 5000]

# plt.subplots returns a (figure, axes) pair; unpack it and draw on that axes
# explicitly instead of binding the whole tuple to `ax`
fig, ax = plt.subplots(figsize=(20, 15))
sns.boxplot(data=df_aa_motif_1, x='motif',
            y='count', hue='hospitalization', ax=ax)

ax.set_xlabel('Motif', fontsize=30)
ax.set_ylabel('Count', fontsize=30)
# motif labels are rotated so they do not overlap along the x axis
plt.xticks(rotation=90, fontsize=20)
plt.yticks(fontsize=20)
plt.legend(fontsize='x-large', title_fontsize='50')


In [None]:
# keep only motifs whose count is strictly greater than 5000
# (re-applying the filter to an already filtered table changes nothing)
df_aa_motif_1 = df_aa_motif_1.loc[df_aa_motif_1['count'] > 5000]

# total count per motif, with one column per hospitalization value
aa_motif_totals = (
    df_aa_motif_1.groupby(['motif', 'hospitalization'])['count']
    .sum()
    .unstack()
)

# stacked bars: one bar per motif, one segment per hospitalization value
ax = aa_motif_totals.plot(kind='bar', stacked=True, figsize=(20, 15))
ax.set_xlabel('Motif', fontsize=30)
ax.set_ylabel('Count', fontsize=30)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(fontsize='x-large', title_fontsize='50')


Motif analysis 2.1 - nucleotide sequence motif analysis (result table)

In [None]:
# define the function to count nucleotide motifs (k is the length of the motif)
def ntmotif(k, nt_list):
    """Count every length-``k`` motif (substring) across the given sequences.

    Parameters
    ----------
    k : int
        Motif length.
    nt_list : iterable
        Sequences to scan, e.g. a pandas Series of CDR3 nucleotide strings.
        Entries that have no length (such as ``None`` or a float NaN marking
        a missing value) are skipped instead of raising ``TypeError``.

    Returns
    -------
    dict
        Maps each motif to the number of times it was seen. Overlapping
        occurrences are counted, and sequences shorter than ``k`` contribute
        nothing.
    """
    ntmotifCount = {}
    for nt in nt_list:
        try:
            seq_len = len(nt)
        except TypeError:
            # missing values (None / float NaN) have no length; skip them
            continue
        # slide a window of width k over the sequence
        for i in range(seq_len - k + 1):
            motif = nt[i:i + k]
            ntmotifCount[motif] = ntmotifCount.get(motif, 0) + 1
    return ntmotifCount


# nucleotide motif length analysed in this cell (6 is used as an example)
nt_motif_length = 6

# sample names in order of first appearance (deterministic, unlike a set)
samples = list(df['sample'].dropna().unique())

# one motif-count table per sample; they are concatenated once after the loop
nt_motif_frames = []

for sample in samples:

    # CDR3 nucleotide sequences of this sample, without missing values
    sample_cdr3nt = df.loc[df['sample'] == sample, 'cdr3nt'].dropna()

    # count the motifs and turn the {motif: count} mapping into a table
    motif_counts = ntmotif(nt_motif_length, sample_cdr3nt)
    df_temp = pd.DataFrame(list(motif_counts.items()),
                           columns=['motif', 'count'])
    df_temp['sample'] = sample
    nt_motif_frames.append(df_temp)

# DataFrame.append was removed in pandas 2.0; a single pd.concat over the
# collected tables replaces it and avoids re-copying the result every iteration
if nt_motif_frames:
    df_nt_motif = pd.concat(nt_motif_frames, ignore_index=True)
else:
    df_nt_motif = pd.DataFrame(columns=['motif', 'count', 'sample'])

# add the hospitalization information as a column
# (done once, after all samples have been collected, not on every iteration)
df_nt_motif_1 = df_nt_motif.merge(
    df_aa_max_spectratype[['sample', 'hospitalization']], on='sample')

df_nt_motif_1


In [None]:
# for each sample, find the row label of its highest motif count ...
idx_top_nt_motif = df_nt_motif_1.groupby('sample')['count'].idxmax()

# ... and keep only those rows: the most abundant nucleotide motif per sample
df_nt_motif_most = df_nt_motif_1.loc[idx_top_nt_motif]
df_nt_motif_most

Motif analysis 2.2 - Show the nucleotide motif counts across the samples with box and stacked bar plots

In [None]:
# keep only motifs whose count is greater than 149999
df_nt_motif_1 = df_nt_motif_1[df_nt_motif_1['count'] > 149999]

# plt.subplots returns a (figure, axes) pair; unpack it and draw on that axes
# explicitly instead of binding the whole tuple to `ax`
fig, ax = plt.subplots(figsize=(20, 15))
sns.boxplot(data=df_nt_motif_1, x='motif',
            y='count', hue='hospitalization', ax=ax)

ax.set_xlabel('Motif', fontsize=30)
ax.set_ylabel('Count', fontsize=30)
# motif labels are rotated so they do not overlap along the x axis
plt.xticks(rotation=90, fontsize=20)
plt.yticks(fontsize=20)
plt.legend(fontsize='x-large', title_fontsize='50')


In [None]:
# keep only motifs whose count is greater than 149999
# (re-applying the filter to an already filtered table changes nothing)
df_nt_motif_1 = df_nt_motif_1.loc[df_nt_motif_1['count'] > 149999]

# total count per motif, with one column per hospitalization value
nt_motif_totals = (
    df_nt_motif_1.groupby(['motif', 'hospitalization'])['count']
    .sum()
    .unstack()
)

# stacked bars: one bar per motif, one segment per hospitalization value
ax = nt_motif_totals.plot(kind='bar', stacked=True, figsize=(20, 15))

ax.set_xlabel('Motif', fontsize=30)
ax.set_ylabel('Count', fontsize=30)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(fontsize='x-large', title_fontsize='50')
