In [1]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import pickle
%matplotlib inline

In [2]:
# Directory that is scanned for per-patient BLAST result files (<patid>.out).
# Defaults to the original cluster location; set the DGN_BLAST_DIR environment
# variable to point the notebook at a different copy of the data.
out_path = os.environ.get("DGN_BLAST_DIR") or "/labs/mignot/DGN/BLAST"

In [4]:
# Tally, per patient, the reads whose best-scoring BLAST hits all fall in a single
# gene family, and stream the tallies to DGN_family_counts_df.csv in long format
# (patid, family, count). Reads whose best hits tie across several families are
# not counted here; their family lists are collected in
# patid_multifamily_reads_dict (patid -> list of family lists).
blast_columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
                 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']

patid_multifamily_reads_dict = dict()

# the context manager flushes and closes the CSV even if one input fails to parse
with open("DGN_family_counts_df.csv", "w") as out_file:
    out_file.write("patid,family,count\n")

    for file in os.listdir(out_path): # for each person
        patid, extension = file[:-4], file[-4:]
        if extension != ".out":
            continue
        print(file)
        filepath = os.path.join(out_path, file)
        # load each person's .out file into a separate dataframe
        raw_df = pd.read_csv(filepath, delimiter="\t", names=blast_columns)

        # for every read, get the max percent identity score
        max_pident = raw_df.groupby(["qseqid"], sort=False)['pident'].max().to_frame().reset_index()
        # add max percent identity score into df
        raw_df = pd.merge(max_pident, raw_df, on='qseqid', suffixes=("_max", ""))
        # only keep rows where the row's score is the max score of the group;
        # .copy() makes the result an independent frame so the column added below
        # is not written onto a filtered selection (avoids SettingWithCopyWarning)
        raw_df = raw_df[raw_df["pident_max"] == raw_df['pident']].copy()
        # the family is the part of the subject id before the first "*"
        raw_df['family'] = raw_df['sseqid'].str.split("*").str[0]
        # for each read, get all the unique families (of max score)
        read_unique_fam_series = raw_df.groupby(["qseqid"], sort=False)['family'].unique()

        # where there is only one max family, add the read count to that family;
        # otherwise remember the tied families for this patient
        transcript_count_dict = dict()
        for fams in read_unique_fam_series.values:
            if len(fams) == 1:
                fam = fams[0]
                transcript_count_dict[fam] = transcript_count_dict.get(fam, 0) + 1
            else:
                patid_multifamily_reads_dict.setdefault(patid, []).append(list(fams))

        for transcript in transcript_count_dict:
            out_file.write("{},{},{}\n".format(patid, transcript, transcript_count_dict[transcript]))

LD1157.out
LD0105.out
LD1145.out
LD0359.out
LD1187.out
LD1326.out
LD0806.out
LD1192.out
LD0811.out
LD0993.out
LD0561.out
LD1044.out
LD0070.out
LD0270.out
LD1148.out
LD0788.out
LD1283.out
LD0313.out
LD1110.out
LD0442.out
LD0210.out
LD0351.out
LD0427.out
LD1092.out
LD0027.out
LD0932.out
LD1127.out
LD0147.out
LD0691.out
LD0889.out
LD0730.out
LD0148.out
LD0579.out
LD0684.out
LD0409.out
LD0543.out
LD1303.out
LD0377.out
LD0826.out
LD1306.out
LD1020.out
LD0021.out
LD0478.out
LD1309.out
LD1227.out
LD0165.out
LD0275.out
LD0512.out
LD0122.out
LD0347.out
LD1305.out
LD0134.out
LD0061.out
LD1007.out
LD0205.out
LD0833.out
LD1285.out
LD1266.out
LD1364.out
LD0963.out
LD0531.out
LD1193.out
LD0877.out
LD0368.out
LD0041.out
LD0714.out
LD0171.out
LD0773.out
LD0934.out
LD0461.out
LD0009.out
LD1131.out
LD1139.out
LD0106.out
LD1225.out
LD0044.out
LD0873.out
LD0413.out
LD1159.out
LD1290.out
LD0158.out
LD1214.out
LD0679.out
LD0468.out
LD0721.out
LD0388.out
LD1183.out
LD0189.out
LD1248.out
LD0706.out
LD0067.out

LD0323.out
LD0890.out
LD0598.out
LD0533.out
LD1155.out
LD0244.out
LD1069.out
LD0437.out
LD1293.out
LD1135.out
LD0090.out
LD0565.out
LD0190.out
LD1153.out
LD0450.out
LD0581.out
LD0396.out
LD1103.out
LD0410.out
LD0142.out
LD0830.out
LD0823.out
LD0217.out
LD0736.out
LD0768.out
LD0474.out
LD0334.out
LD1210.out
LD0342.out
LD0451.out
LD1203.out
LD1174.out
LD1236.out
LD0662.out
LD0534.out
LD0157.out
LD0174.out
LD0959.out
LD1190.out
LD0742.out
LD0865.out
LD0544.out
LD0104.out
LD0923.out
LD0891.out
LD1181.out
LD1268.out
LD1149.out
LD0117.out
LD0977.out
LD1243.out
LD1090.out
LD0447.out
LD1232.out
LD1211.out
LD0175.out
LD1356.out
LD0458.out
LD1059.out
LD0681.out
LD0054.out
LD0013.out
LD0012.out
LD0209.out
LD0484.out
LD0255.out
LD0799.out
LD0720.out
LD0673.out
LD1038.out
LD0790.out
LD1254.out
LD1037.out
LD0878.out
LD0619.out
LD0034.out
LD0979.out
LD0748.out
LD1298.out
LD0325.out
LD0732.out
LD0103.out
LD1094.out
LD0615.out
LD0620.out
LD0490.out
LD0163.out
LD0952.out
LD0786.out
LD0355.out
LD0206.out

In [6]:
import pickle

In [7]:
# pickle.dump(patid_multifamily_reads_dict, open("patid_multifamily_reads_dict.pkl", "wb"))

In [186]:
# Reload the long-format counts (the patid / family / count columns are used by the pivot that follows)
family_counts_df = pd.read_csv("DGN_family_counts_df.csv")

In [187]:
# Reshape long -> wide: one row per patid, one column per family;
# (patid, family) pairs that are absent from the long table become 0.0
family_counts_table = (
    family_counts_df
    .pivot(index="patid", columns="family", values="count")
    .fillna(0.0)
)

In [188]:
# Persist the wide counts table (index is written as the first CSV column)
family_counts_table.to_csv("DGN_family_counts_table.csv")

In [138]:
# Nested lookup {row label: {column label: value}} built from the counts table
patid_family_counts_dict = family_counts_table.T.to_dict()

In [139]:
# Rebuild the counts of the two problematic families from the ambiguous reads:
# reset both to zero for every patient, then add one read for every multi-family
# group that is exactly one of the two ordered pairs below.
for problematic_fam in ['TRBV24-1', 'TRBV6-2']:
    for patid in patid_family_counts_dict:
        patid_family_counts_dict[patid][problematic_fam] = 0

for patid in patid_family_counts_dict:
    # a patient only has an entry in patid_multifamily_reads_dict if at least one
    # of their reads was ambiguous, so fall back to "no groups" instead of KeyError
    for group in patid_multifamily_reads_dict.get(patid, []):
        # NOTE: the comparison is order-sensitive on purpose; groups listing the
        # same two families in the other order are not counted here.
        if group == ['TRBV24/OR9-2', 'TRBV24-1']:
            patid_family_counts_dict[patid]['TRBV24-1'] += 1
        elif group == ['TRBV6-3', 'TRBV6-2']:
            patid_family_counts_dict[patid]['TRBV6-2'] += 1

In [157]:
# Turn the nested counts dict back into a table whose index is named "patid",
# write it out, then refresh the dict from the table that was just saved.
family_counts_table = pd.DataFrame.from_dict(patid_family_counts_dict).T.rename_axis("patid")
family_counts_table.to_csv("DGN_family_counts_table.csv")
patid_family_counts_dict = family_counts_table.T.to_dict()

In [203]:
# Display the TRBV6-2 column of the counts table
family_counts_table['TRBV6-2']

patid
LD0001    159.0
LD0002    137.0
LD0003    257.0
LD0006    279.0
LD0007    313.0
          ...  
LD1357    291.0
LD1361    211.0
LD1362     74.0
LD1364    460.0
LD1366    314.0
Name: TRBV6-2, Length: 895, dtype: float64

In [166]:
# Wide -> long: one (patid, family, count) row per cell of the counts table
family_counts_df = pd.melt(
    family_counts_table.reset_index(),
    id_vars="patid",
    var_name="family",
    value_name="count",
)

In [167]:
# Write the long-format counts without the positional row index, so the file keeps
# the same patid,family,count layout as the other writers of this CSV.
family_counts_df.to_csv("DGN_family_counts_df.csv", index=False)

In [193]:
# Refresh the nested {row label: {column label: value}} lookup from the current counts table
patid_family_counts_dict = family_counts_table.T.to_dict()

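Compute usage again: reload the per-patient family usage table and use it to distribute the remaining multi-family reads across their candidate families.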

In [180]:
# Load the usage table with the patid column as its index
family_usage_table = pd.read_csv("DGN_family_usage_table.csv", index_col="patid")

In [204]:
# Display the TRBV6-2 column of the usage table
family_usage_table['TRBV6-2']

patid
LD0001    0.021130
LD0002    0.012016
LD0003    0.032606
LD0006    0.025812
LD0007    0.024314
            ...   
LD1357    0.036366
LD1361    0.014479
LD1362    0.008357
LD1364    0.041494
LD1366    0.022475
Name: TRBV6-2, Length: 895, dtype: float64

In [181]:
# Nested lookup {patid: {family: usage}} built from the usage table
patid_family_usage_dict = family_usage_table.T.to_dict()

In [184]:
# Build, per patient, the list of ambiguous family groups that still need to be
# distributed. The two exact ordered pairs are skipped; in every other group
# TRBV6-3 is relabelled as TRBV6-2 and TRBV24/OR9-2 as TRBV24-1 (without
# introducing a duplicate when the target family is already present).
patid_multifamily_reads_dict2 = dict()
for patid in patid_family_usage_dict:
    patid_multifamily_reads_dict2[patid] = []
    # patients without any ambiguous read have no entry in the source dict
    for group in patid_multifamily_reads_dict.get(patid, []):
        if group != ['TRBV24/OR9-2', 'TRBV24-1'] and group != ['TRBV6-3', 'TRBV6-2']:
            # work on a copy: the lists in patid_multifamily_reads_dict must stay
            # untouched so that this cell can be re-run and the dict reused
            relabelled = list(group)
            if 'TRBV6-3' in relabelled:
                relabelled.remove("TRBV6-3")
                if "TRBV6-2" not in relabelled: # group had 6-3 but not 6-2
                    relabelled.append("TRBV6-2")
            if 'TRBV24/OR9-2' in relabelled:
                relabelled.remove('TRBV24/OR9-2')
                if 'TRBV24-1' not in relabelled: # group had 24/OR9-2 but not 24-1
                    relabelled.append('TRBV24-1')
            patid_multifamily_reads_dict2[patid].append(relabelled)

In [194]:
# Distribute every remaining ambiguous read across its candidate families in
# proportion to the patient's usage of those families, so each read adds a total
# of 1.0 to the counts.
# NOTE: this cell accumulates with "+=", so running it twice adds the shares twice.
for patid in patid_multifamily_reads_dict2:
    for group in patid_multifamily_reads_dict2[patid]:
        if not group:
            continue
        usages = np.array([patid_family_usage_dict[patid][fam] for fam in group], dtype=float)
        total_usage = np.sum(usages)
        if np.isfinite(total_usage) and total_usage > 0:
            usages_normalized = usages / total_usage
        else:
            # No usable usage information for any candidate (sum is 0 or NaN):
            # dividing would yield NaN shares and turn the running counts into NaN.
            # Split the read evenly instead so it is still counted exactly once.
            usages_normalized = np.full(len(group), 1.0 / len(group))
        for i, fam in enumerate(group):
            patid_family_counts_dict[patid][fam] += usages_normalized[i] # fractional count

  after removing the cwd from sys.path.


In [102]:
# Spot-check a single usage value: patient LD0001, family TRBV24-1
patid_family_usage_dict['LD0001']['TRBV24-1']

0.00096140640021975

In [208]:
# Rebuild a table (one row per outer key of the dict) from the updated nested counts
family_counts_table_new = pd.DataFrame.from_dict(patid_family_counts_dict).T

In [210]:
# Display the TRBV6-2 column of the rebuilt counts table
family_counts_table_new['TRBV6-2']

LD0001    198.598031
LD0002    170.353486
LD0003    318.144538
LD0006    347.627408
LD0007    395.200035
             ...    
LD1357    374.401798
LD1361    258.050943
LD1362     91.758688
LD1364    577.230057
LD1366    373.012724
Name: TRBV6-2, Length: 895, dtype: float64

In [213]:
# Label the index so that reset_index() / to_csv() emit a "patid" column
family_counts_table_new.index = family_counts_table_new.index.rename("patid")

In [215]:
# Wide -> long: one (patid, family, count) row per cell of the rebuilt counts table
family_counts_df_new = pd.melt(
    family_counts_table_new.reset_index(),
    id_vars="patid",
    var_name="family",
    value_name="count",
)

In [220]:
# Save the rebuilt wide counts table, writing 0.0 wherever a value is missing.
# NOTE(review): fillna(0.0) also replaces a count that became NaN through earlier
# arithmetic with 0, discarding whatever had been accumulated for it — confirm that
# zeroing (rather than fixing the source of the NaN) is what is intended here.
family_counts_table_new.fillna(0.0).to_csv("DGN_family_counts_table.csv")

In [217]:
# Save the long-format counts; the positional row index is not written
family_counts_df_new.to_csv("DGN_family_counts_df.csv", index=False)

In [16]:
# Check whether TRAV8-5 is one of the counts table's columns
'TRAV8-5' in family_counts_table.columns

False

In [None]:
        # Scratch fragment (indented as if it belonged inside a per-file loop):
        # an alternative way of counting, per family, the reads whose best hits
        # all belong to one family. Expects raw_df to hold one BLAST result table.
        c = raw_df

        # best percent identity per read
        max_s = c.groupby(["qseqid"], sort=False)['pident'].max().to_frame().reset_index()

        c = pd.merge(max_s, c, on='qseqid', suffixes=("_max", ""))

        # keep only the best-scoring hits of each read; .copy() makes the result an
        # independent frame so the column assignment below does not write onto a
        # filtered selection (avoids SettingWithCopyWarning)
        c = c[c["pident_max"] == c['pident']].copy()

        # family = part of the subject id before the first "*"
        fam_df = c['sseqid'].str.split("*", expand=True).rename(columns={0:"family", 1:"subtype"})
        c.loc[:,'family'] = fam_df['family']

        # True for reads whose best hits cover exactly one family
        unique = c.groupby(["qseqid"], sort=False)['family'].nunique() == 1

        # attach the flag to every hit row as 'family_unique' and keep flagged rows
        c = pd.merge(unique, c, on='qseqid', suffixes=('_unique', ''))

        c = c[c['family_unique']]

        # one family per remaining read
        fams_lst = c.groupby(["qseqid"], sort=False)['family'].first().values

        # number of unambiguous reads per family
        transcript_count_dict = dict()
        for fam in fams_lst:
            transcript_count_dict[fam] = transcript_count_dict.get(fam, 0) + 1