Step 1 - Import python libraries

In [None]:
from IPython import get_ipython
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats

pd.options.mode.chained_assignment = None

Step 2 - Load and read your data file
- pyTCR accepts a single `.tsv` file that should contain all the samples.
  - The following cell attempts to detect whether you are running the notebook in a Google Colab cloud environment or in a local environment, and then loads the data at the specified path.
- The `filePath` variable in the following code cell should be changed to the location of your file. The following options are supported:
  1. A `filePath` from Google Drive (to run on a cloud environment)
  2. A `filePath` from your local computer (to run on a local environment, other cloud environments should work as expected)
- The `data_adapter` notebook can be used to convert the data into the correct format for pyTCR to read.

In [None]:
# Path to the input table: a Google Drive path when running on Colab, otherwise a local path
filePath = "/content/drive/MyDrive/complete_COVID_samples.tsv"

# Colab is recognised by 'google.colab' appearing in the string form of the active IPython shell
isInGoogle = str(get_ipython()).find('google.colab') != -1

# On Colab, mount Google Drive so that paths under /content/drive can be read
if isInGoogle:
    from google.colab import drive
    drive.mount('/content/drive')

# Read the tab-separated table that holds every sample
df = pd.read_table(filePath, engine="c", low_memory=False)

# Preview the first rows
df.head()


Clonality analysis 1 - the most frequent clonotype

In [None]:
# Row label of the highest-frequency clonotype within each sample
most_frequent_rows = df.groupby('sample')['freq'].idxmax()

# Keep only the descriptive columns for those rows
df_clonality_most = df.loc[most_frequent_rows, ['sample', 'hospitalization', 'cdr3aa', '#count']]
df_clonality_most

Clonality analysis 2 - the least frequent clonotype

In [None]:
# Row label of the lowest-frequency clonotype within each sample (idxmin keeps the first one on ties)
least_frequent_rows = df.groupby('sample')['freq'].idxmin()

# Keep only the descriptive columns for those rows
df_clonality_least = df.loc[least_frequent_rows, ['sample', 'hospitalization', 'cdr3aa', '#count']]
df_clonality_least

Clonality analysis 3.1 - 1-Pielou index

In [None]:
#number of clonotype rows recorded for every sample, in order of first appearance
df_diversity = (
    df.groupby(['sample'], sort=False)
    .size()
    .reset_index(name='clonotype_count')
)

#attach each sample's clonotype count to its rows
clonotypes_with_counts = df.merge(df_diversity, on=['sample'])

#per-clonotype term freq * ln(freq) / ln(clonotype_count);
#summed over a sample this equals minus the Pielou evenness
freq = clonotypes_with_counts['freq']
clonotypes_with_counts['clonality'] = (
    freq * np.log(freq) / np.log(clonotypes_with_counts['clonotype_count'])
)

#sum the terms per sample and add 1 to obtain the 1-Pielou index
df_clonality = (
    clonotypes_with_counts
    .groupby(['sample', 'hospitalization'])['clonality']
    .sum()
    .reset_index()
)
df_clonality['1_pielou'] = 1 + df_clonality['clonality']
df_clonality = df_clonality[['sample', 'hospitalization', '1_pielou']]

df_clonality

Clonality analysis 3.2 - Statistical analysis of 1-Pielou index

Clonality analysis 3.2.1 - Test if 1-Pielou index is normally distributed
1.   the null hypothesis here is normality
2.   if the p value is greater than 0.05, we cannot reject the null hypothesis (the data are consistent with a normal distribution). If the p value is smaller than 0.05, we reject the null hypothesis (the data are not normally distributed)

In [None]:
# Normality test on the per-sample 1-Pielou values; the result carries the statistic and the p value
pielou_values = df_clonality['1_pielou']
x = stats.normaltest(pielou_values)
x

Clonality analysis 3.2.2 - Mean or median of 1-Pielou index among groups
1.   if the dataset is normally distributed, calculate mean
2.   if the dataset is not normally distributed, calculate median

In [None]:
#mean 1-Pielou index within each hospitalization group
df_clonality_mean = df_clonality.groupby('hospitalization', as_index=False)['1_pielou'].mean()
df_clonality_mean

In [None]:
#median 1-Pielou index within each hospitalization group
df_clonality_median = df_clonality.groupby('hospitalization', as_index=False)['1_pielou'].median()
df_clonality_median

Clonality analysis 3.2.3 - Stat test
1.   If the dataset is normally distributed, use t-test (stats.ttest_ind)
*   change the group1, group2 to the groups/samples that you are interested in
2.   If the dataset is not normally distributed, use Wilcoxon rank-sum test (stats.ranksums)
*   change the group1, group2 to the groups/samples that you are interested in

In [None]:
# Work on a copy so the per-sample table itself is left untouched
df1 = df_clonality.copy()

# Split the samples into the two groups being compared
df_group1 = df1.loc[df1['hospitalization'].eq(True)]
df_group2 = df1.loc[df1['hospitalization'].eq(False)]

# Wilcoxon rank-sum test between the two groups' 1-Pielou values
stats.ranksums(df_group1['1_pielou'], df_group2['1_pielou'])

Clonality analysis 4.1 - Clonal proportion

In [None]:
#share of the repertoire the leading clonotypes may occupy (10% here; can be adjusted by the user)
clonal_proportion_threshold = 0.1

#sample names in order of first appearance, so the result order is the same on every run
#(missing sample names are skipped: no row can be matched to them)
samples = list(df['sample'].dropna().unique())

#one summary record per sample; the result dataframe is built once after the loop
clonal_proportion_records = []

for sample in samples:

    #store the rows related to the sample
    df_temp = df.loc[df['sample'] == sample]

    #cumulative frequency of the sample's clonotypes, ranked from most to least frequent
    accum_freq = df_temp['freq'].sort_values(ascending=False).cumsum()

    #flag the leading clonotypes whose accumulated frequency stays within the threshold
    within_threshold = (accum_freq >= 0) & (accum_freq <= clonal_proportion_threshold)

    #the number of flagged clonotypes is the rank of the last clonotype that still fits;
    #it is 0 when the most frequent clonotype alone already exceeds the threshold,
    #so such a sample is reported instead of silently disappearing from the result
    clonal_proportion_records.append({
        'sample': sample,
        'hospitalization': df_temp['hospitalization'].iloc[0],
        'clonality_portion': int(within_threshold.sum()),
    })

#show the results
df_clonality_portion = pd.DataFrame(
    clonal_proportion_records,
    columns=['sample', 'hospitalization', 'clonality_portion'],
)
df_clonality_portion

Clonality analysis 4.2 - Statistical analysis of clonality portion

Clonality analysis 4.2.1 - Test if the clonality portion is normally distributed
1.   the null hypothesis here is normality
2.   if the p value is greater than 0.05, we cannot reject the null hypothesis (the data are consistent with a normal distribution). If the p value is smaller than 0.05, we reject the null hypothesis (the data are not normally distributed)

In [None]:
# Normality test on the per-sample clonality portion; the result carries the statistic and the p value
clonality_portion_values = df_clonality_portion['clonality_portion']
x = stats.normaltest(clonality_portion_values)
x

Clonality analysis 4.2.2 - Mean or median of clonality portion among groups
1.   if the dataset is normally distributed, calculate mean
2.   if the dataset is not normally distributed, calculate median

In [None]:
#mean clonality portion within each hospitalization group
df_clonality_portion_mean = df_clonality_portion.groupby('hospitalization', as_index=False)['clonality_portion'].mean()
df_clonality_portion_mean

In [None]:
#calculate the median clonality portion within each hospitalization group
#(kept under its own name so the mean computed in the previous cell is not overwritten)
df_clonality_portion_median = df_clonality_portion.groupby('hospitalization')['clonality_portion'].median().reset_index()
df_clonality_portion_median

Clonality analysis 4.2.3 - Stat test
1.   if the dataset is normally distributed, use t-test (stats.ttest_ind)
*   change the group1, group2 to the groups/samples that you are interested in
2.   if the dataset is not normally distributed, use Wilcoxon rank-sum test (stats.ranksums)
*   change the group1, group2 to the groups/samples that you are interested in

In [None]:
# Work on a copy so the per-sample table itself is left untouched
df2 = df_clonality_portion.copy()

# Split the samples into the two groups being compared
df_group1 = df2.loc[df2['hospitalization'].eq(True)]
df_group2 = df2.loc[df2['hospitalization'].eq(False)]

# Wilcoxon rank-sum test between the two groups' clonality portions
stats.ranksums(df_group1['clonality_portion'], df_group2['clonality_portion'])

Clonality analysis 4.3 - clonality portion plot per sample
1.   x-axis and y-axis labels, figsize, fontsize are customizable

In [None]:
# plt.subplots returns a (figure, axes) pair; unpack it and draw on that axes explicitly
fig, ax = plt.subplots(figsize=(10, 10))

# One bar per sample, coloured by hospitalization status
sns.barplot(data=df_clonality_portion, x='sample', y='clonality_portion', hue='hospitalization', ax=ax)

ax.set_xlabel('sample', fontsize=20)
ax.set_ylabel('number of clonotypes', fontsize=20)

# Sample names are rotated so they do not overlap
plt.xticks(fontsize=10, rotation=90)
plt.yticks(fontsize=20)

# Render the figure without echoing the tick objects as cell output
plt.show()

Clonality analysis 4.4 -  clonality portion violin plot per group
1.   x-axis and y-axis labels, figsize, fontsize are customizable
2.   change the violin plot (sns.violinplot) to the plot type that you are interested in, including strip plot (sns.stripplot), swarm plot (sns.swarmplot), box plot (sns.boxplot), boxen plot (sns.boxenplot), point plot (sns.pointplot), and bar plot (sns.barplot)

In [None]:
# plt.subplots returns a (figure, axes) pair; unpack it and draw on that axes explicitly
fig, ax = plt.subplots(figsize=(10, 10))

# Distribution of the clonality portion within each hospitalization group
sns.violinplot(x='hospitalization', y='clonality_portion', data=df_clonality_portion, ax=ax)

ax.set_xlabel('hospitalization', fontsize=20)
ax.set_ylabel('number of clonotypes', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.show()

Clonality analysis 5 - Relative abundance (in all clonotypes)

In [None]:
#define clonotype groups based on frequency
def clonotype_group (row):
    """Return the frequency class of a clonotype row.

    `row` must support `row['freq']`. Classes are half-open intervals
    (lower bound exclusive, upper bound inclusive):
    (0.01, 1] 'Hyperexpanded', (0.001, 0.01] 'Large', (0.0001, 0.001] 'Medium',
    (0.00001, 0.0001] 'Small', (0, 0.00001] 'Rare'.
    Returns None when the frequency is not in (0, 1], including NaN.
    """
    freq = row['freq']

    #anything outside (0, 1] belongs to no class
    if not (freq > 0 and freq <= 1):
        return None

    #lower bounds from the largest class downwards; the first bound exceeded decides the class
    lower_bounds = (
        (0.01, 'Hyperexpanded'),
        (0.001, 'Large'),
        (0.0001, 'Medium'),
        (0.00001, 'Small'),
    )
    for lower_bound, label in lower_bounds:
        if freq > lower_bound:
            return label

    return 'Rare'

#label every clonotype row with its frequency class (adds a column to df)
df['clonotype_group'] = df.apply(clonotype_group, axis=1)

#total frequency of each class within each sample; the summed value stays in the 'freq' column
df_relative_abundance = (
    df.groupby(['sample', 'hospitalization', 'clonotype_group'])['freq']
    .sum()
    .reset_index()
)
df_relative_abundance

In [None]:
# Stacking order of the frequency classes, from most to least expanded
label_order = ['Hyperexpanded', 'Large', 'Medium', 'Small', 'Rare']

# Samples as rows, frequency classes as columns. reindex keeps the column order fixed and
# adds a zero-filled column for any class that does not occur in the data, where selecting
# the columns with [label_order] would raise a KeyError.
abundance_by_group = (
    df_relative_abundance
    .groupby(['sample', 'clonotype_group'])['freq']
    .sum()
    .unstack(fill_value=0)
    .reindex(columns=label_order, fill_value=0)
)

ax = abundance_by_group.plot(kind='bar', stacked=True)
ax.set_xlabel('sample',fontsize=20)
ax.set_ylabel('clonotype frequency',fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.gcf().set_size_inches(20, 10)

Clonality analysis 6 - Relative abundance in top clonotypes
1.   the top clonotypes are defined as the 100 most frequent clonotypes of each sample in the analysis below

In [None]:
#define groups based on clone counts 
def reads_group_top (row):
    """Return the read-count bin label of a clonotype row.

    `row` must support `row['#count']`. Bins are inclusive on both ends:
    '1-10', '11-100', '101-1000', '1001-5000'.
    Returns None for counts below 1 or above 5000.
    """
    count = row['#count']

    #the first bin starts at 1 (inclusive), as its label says
    if count >= 1 and count <= 10:
        return '1-10'
    if count >= 11 and count <= 100:
        return '11-100'
    if count >= 101 and count <= 1000:
        return '101-1000'
    if count >= 1001 and count <= 5000:
        return '1001-5000'

    #no bin is defined outside 1-5000
    return None

#within each sample rows are ordered by ascending frequency, so the last 100 are the most frequent
top_rows = df.sort_values(['sample', 'freq'], axis=0).groupby('sample').tail(100)

#label every selected clonotype with its read-count bin
df_top = top_rows.assign(reads_group=top_rows.apply(reads_group_top, axis=1))

#show the result
df_top_result = df_top[['sample','hospitalization','reads_group']]
df_top_result

In [None]:
# Read-count bins in ascending numeric order; the default alphabetical column order
# would stack them as '1-10', '1001-5000', '101-1000', '11-100'
reads_group_order = ['1-10', '11-100', '101-1000', '1001-5000']

# Number of clonotypes per sample (rows) and read-count bin (columns)
clonotypes_per_bin = df_top.groupby(['sample', 'reads_group'])['#count'].count().unstack()

# Known bins first in ascending order, followed by any label not listed above
ordered_columns = [label for label in reads_group_order if label in clonotypes_per_bin.columns]
ordered_columns += [label for label in clonotypes_per_bin.columns if label not in reads_group_order]

ax = clonotypes_per_bin[ordered_columns].plot(kind='bar', stacked=True)
ax.set_xlabel('sample',fontsize=20)
ax.set_ylabel('number of clonotypes',fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.gcf().set_size_inches(20, 10)

Clonality analysis 7 - Relative abundance in rare clonotypes
1.   the rare clonotypes are defined as the 100 least frequent clonotypes of each sample in the analysis below

In [None]:
#define groups based on clone counts 
def reads_group_rare (row):
    """Return the read-count bin label of a clonotype row.

    `row` must support `row['#count']`. A count of exactly 1 maps to '1'; the
    remaining bins are inclusive on both ends: '2-3', '4-10', '11-30',
    '31-100', '101-200'. Returns None for any count that falls in no bin.
    """
    count = row['#count']

    #singletons get their own label
    if count == 1:
        return '1'

    #(lowest count, highest count, label) of the remaining bins
    bins = (
        (2, 3, '2-3'),
        (4, 10, '4-10'),
        (11, 30, '11-30'),
        (31, 100, '31-100'),
        (101, 200, '101-200'),
    )
    for lowest, highest, label in bins:
        if lowest <= count <= highest:
            return label

    return None

#within each sample rows are ordered by ascending frequency, so the first 100 are the least frequent
rare_rows = df.sort_values(['sample', 'freq'], axis=0).groupby('sample').head(100)

#label every selected clonotype with its read-count bin
df_rare = rare_rows.assign(reads_group=rare_rows.apply(reads_group_rare, axis=1))

#show the result
df_rare_result = df_rare[['sample','hospitalization','reads_group']]
df_rare_result

In [None]:
# Read-count bins in ascending numeric order; the default alphabetical column order
# would stack them as '1', '101-200', '11-30', '2-3', '31-100', '4-10'
reads_group_order = ['1', '2-3', '4-10', '11-30', '31-100', '101-200']

# Number of clonotypes per sample (rows) and read-count bin (columns)
clonotypes_per_bin = df_rare.groupby(['sample', 'reads_group'])['#count'].count().unstack()

# Known bins first in ascending order, followed by any label not listed above
ordered_columns = [label for label in reads_group_order if label in clonotypes_per_bin.columns]
ordered_columns += [label for label in clonotypes_per_bin.columns if label not in reads_group_order]

ax = clonotypes_per_bin[ordered_columns].plot(kind='bar', stacked=True)
ax.set_xlabel('sample',fontsize=20)
ax.set_ylabel('number of clonotypes',fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.gcf().set_size_inches(20, 10)