## Processes the .out files generated by BLAST to get gene counts

In [1]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import pickle
%matplotlib inline

In [2]:
path = "/labs/mignot/DGN/BLAST"
zero_path = os.path.join(path, "zero")
one_path = os.path.join(path, "one")
two_path = os.path.join(path, "two")

In [3]:
# f = open("/home/ashteng/allJCodes.txt", "r")
# all_j_codes = set([x.strip() for x in f.readlines()])

In [4]:
genotype_name_path_dict = {"GG": zero_path} #"GG": zero_path, "GC": one_path, 
genotype_name_dose_dict = {"GG": 0, "GC": 1, "CC": 2}

In [5]:
#df = pd.DataFrame(columns=["patid", "genotype", "dose", "transcript", "count"])
out_file = open("VDJCounts_GG.txt", "w")
out_file.write("patid,genotype,dose,transcript,count\n")
for genotype in genotype_name_path_dict.keys(): # zero, one, two
    print(genotype)
    genotype_path = genotype_name_path_dict[genotype]
    for file in os.listdir(genotype_path): # for each person
        print(file)
        patid, extension = file[:-4], file[-4:]
        if extension == ".out":
            if os.path.exists("{}.fasta".format(patid)): # skip .out files where the fasta is still present, unfinished
                continue 
            filepath = os.path.join(genotype_path, file)
            # load each person's .out file into a separate dataframe
            raw_df = pd.read_csv(filepath, delimiter="\t", names=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'])
            # group by read id and only extract the maximum score read across families
            # we only want one hit for each RNAseq transcript (61584 rows --> 18868 rows)
            raw_df = raw_df.loc[raw_df.groupby(["qseqid"], sort=False)["pident"].idxmax()]
            # for each row in an .out file, we only care about which transcript was hit (i.e. sseqid).
            # so for each person, we count all of the sseqid hits from unique transcripts and put into a dictionary
            transcript_count_dict = raw_df.groupby("sseqid")['sseqid'].count().to_dict()
            for transcript in transcript_count_dict:
                out_file.write("{},{},{},{},{}\n".format(patid,genotype,genotype_name_dose_dict[genotype],transcript,transcript_count_dict[transcript]))
                #df = df.append({"patid": patid, "genotype": genotype, "dose": genotype_name_dose_dict[genotype], "transcript": transcript, "count": transcript_count_dict[transcript]}, ignore_index=True)
        break
# #             all_j_codes_dict = {x:0 for x in all_j_codes} # each person has a counts distribution over all J codes
# #             for index, row in raw_df.iterrows():
# #                 match = row['sseqid']
# #                 if match in all_j_codes_dict:
# #                     all_j_codes_dict[match] += 1
# #                 else:
# #                     print("Error: unknown match {} from {}.".format(match, filepath))
# #             for j_code in all_j_codes_dict:
# #                 df = df.append({"patid": patid, "genotype": genotype, "dose": genotype_name_dose_dict[genotype], "transcript": j_code, "count": all_j_codes_dict[j_code]}, ignore_index=True)

GG
LD1157.out


In [None]:
assert len(all_j_codes_dict) == len(df['transcript'].unique()) == len(df[df['patid'] == 'LD0420'])
df.to_csv("allJCounts2.txt")

In [None]:
df = pd.read_csv("allJCounts2.txt", index_col=0)

In [None]:
df = df[df['transcript'].str.startswith("TRA")] # only keep TRAJ

In [None]:
transcript_family_subtype_df = df["transcript"].str.split("*", n = 1, expand = True).rename(columns={0:"family", 1:"subtype"})

In [None]:
df["family"] = transcript_family_subtype_df["family"]
df["subtype"] = transcript_family_subtype_df["subtype"]

In [None]:
df

In [None]:
# For comparison with Sharon
patid_family_counts_df = df[['patid', 'family', 'count']].groupby(['patid', 'family']).sum().reset_index()
pickle.dump(patid_family_counts_df, open("patid_family_count_df.pkl", "wb"))

In [None]:
patid_genotype_df = df[['patid', 'genotype', 'dose']].drop_duplicates()
patid_genotype_df.head()
patid_genotype_df.to_csv("patid_genotype.csv")

In [None]:
len(df.patid.unique())

In [None]:
total_j_count_df = df[["patid", "count"]].groupby("patid").sum().rename(columns={"count":"total_J_count"}).reset_index()
total_j_count_df.head()

In [None]:
df_with_total = pd.merge(df, total_j_count_df, how="inner", on="patid")
df_with_total["ratio"] = df_with_total["count"]/df_with_total["total_J_count"]
df_with_total['patid'] = df_with_total['patid'].apply(lambda x: int(x[2:]))

In [None]:
df_with_total.head()

In [None]:
all_j_ratios_df = df_with_total[["patid", "transcript", "ratio"]]

In [None]:
all_j_ratios_df.to_csv("allJRatios.csv")

Add covariates to DF

In [None]:
PCA_path = "/labs/mignot/GenRED.II.autosomalClean.Final.mds"
PCA_df = pd.read_csv(PCA_path, delim_whitespace=True)
PCA_df = PCA_df.drop("IID", axis=1)
PCA_df['patid'] = PCA_df['FID'].apply(lambda x: x.split("_")[-1])
PCA_df = PCA_df[PCA_df['patid'].str.startswith("LD")]
PCA_df['patid'] = PCA_df['patid'].apply(lambda x: int(x[2:]))
PCA_df = PCA_df.drop(["FID", "SOL"], axis=1)
PCA_df.head()

In [None]:
df_with_total.head()

In [None]:
patid_genotype_df['patid'] = patid_genotype_df['patid'].apply(lambda x: int(x[2:]))

In [None]:
def getPlotDf(query_transcripts, plot_column="ratio"):
    # query_transcripts can be single, or list of multiple, which will be summed together
    summed_plot_column_name = "summed_{}".format(plot_column)
    
    # filter out rows that match query_transcripts
    filtered_transcripts_df = df_with_total[df_with_total['transcript'].isin(query_transcripts)]
    
    # sum up all row values by groupby patid
    summed_filtered_transcripts_df = filtered_transcripts_df[['patid', plot_column]].groupby('patid').sum().reset_index().rename(columns={plot_column:summed_plot_column_name})
    
    # add metadata about genotypes and dose
    plot_df = pd.merge(summed_filtered_transcripts_df, patid_genotype_df)
    
    # normalize summed_count column within each class, creating new column normalized_summed_count
    normalized_summed_plot_column_name = 'normalized_{}'.format(summed_plot_column_name)
    plot_df[normalized_summed_plot_column_name] = (plot_df[summed_plot_column_name] - plot_df[summed_plot_column_name].mean()) / plot_df[summed_plot_column_name].std()

    # remove outliers (two sides) using the normalized_summed_count
    quantile_low, quantile_high = plot_df[normalized_summed_plot_column_name].quantile([0.025, 0.975]).values
    outliers_removed_plot_df = plot_df[(plot_df[normalized_summed_plot_column_name] > quantile_low) & (plot_df[normalized_summed_plot_column_name] < quantile_high)]
    
#     # remove outliers with hard threshold
#     outliers_removed_plot_df = plot_df[plot_df[summed_plot_column_name] <= 0.025]
    
    # join in covariates
    plot_df_with_covariates = pd.merge(outliers_removed_plot_df, PCA_df, on="patid", how="inner")
    
    return plot_df_with_covariates

In [None]:
def drawFamilyBoxPlots(plot_df, plot_column_name):    
    fig, ax = plt.subplots()
    sns.boxplot(x="genotype", y=plot_column_name, data=plot_df, order=["GG", "GC", "CC"], ax=ax)
    sns.regplot(x="dose", y=plot_column_name, data=plot_df, x_estimator=np.median, color="yellow", ax=ax, label="Median Estimator")
    #sns.scatterplot(x="dose", y=summed_plot_column, data=plot_df)
    plt.legend()
    plt.title("BLAST {} {}".format(query_transcripts, plot_column_name))

In [None]:
def drawFamilyViolinPlots(plot_df, plot_column_name):
    fig, ax = plt.subplots()
    sns.violinplot(x="genotype", y=plot_column_name, data=plot_df)
    plt.title("BLAST {} {}".format(query_transcripts, plot_column_name))

In [None]:
def regress(plot_df, plot_column_name, covariates=True):
    if covariates:
        X = plot_df.loc[:, ["dose", "C1", "C2", "C3", "C4"]].values
    else:
        X = plot_df.loc[:, ["dose"]].values
    y = plot_df[plot_column_name].values
    XX = sm.add_constant(X)
    mod = sm.OLS(y, XX)
    res = mod.fit()
    print('Parameters: ', res.params)
    print('R2: ', res.rsquared)
    print('p values:', res.pvalues)
    print(res.summary())

In [None]:
df_with_total.drop_duplicates('patid').groupby("genotype").count() # final cohort

In [None]:
query_transcripts = ["TRAJ24*01"]
plot_df = getPlotDf(query_transcripts, "ratio")
plot_column_name = "summed_ratio"
regress(plot_df, plot_column_name)
drawFamilyBoxPlots(plot_df, plot_column_name)
drawFamilyViolinPlots(plot_df, plot_column_name)

In [None]:
query_transcripts = ["TRAJ28*01"]
plot_df = getPlotDf(query_transcripts, "ratio")
plot_column_name = "summed_ratio"
regress(plot_df, plot_column_name)
drawFamilyBoxPlots(plot_df, plot_column_name)
drawFamilyViolinPlots(plot_df, plot_column_name)

In [None]:
plot_df

In [None]:
# Non Linear Model Testing

In [None]:
GG_arr = plot_df[plot_df['genotype'] == "GG"]['summed_ratio']
GC_arr = plot_df[plot_df['genotype'] == "GC"]['summed_ratio']
CC_arr = plot_df[plot_df['genotype'] == "CC"]['summed_ratio']
haveC_arr = np.concatenate((GC_arr.values, CC_arr.values))

In [None]:
import scipy

In [None]:
scipy.stats.mannwhitneyu(GG_arr, CC_arr)

In [None]:
scipy.stats.mannwhitneyu(GG_arr, GC_arr)

In [None]:
scipy.stats.mannwhitneyu(GC_arr, CC_arr)

In [None]:
scipy.stats.mannwhitneyu(GG_arr, haveC_arr)