In [None]:
!pip install dask
!pip install sklearn

In [None]:
import dask.dataframe as dd
import pandas as pd
pd.options.display.max_columns = 999
import glob

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import re
import numpy as np

In [None]:
%%time
metadata = pd.read_csv('data/AdaptiveISB_Metadata.csv', usecols = ['repertoire_id', 'sex', 'age_min', 'ethnicity',
       'disease_length', 'disease_stage', 'intervention', 'medical_history',
       'collection_time_point_relative'])
metadata.collection_time_point_relative = metadata.collection_time_point_relative.str.rsplit(" d", n=0, expand=True).rename(columns = {0:'collection_time_point_relative'}).drop(1, axis=1)

In [None]:
# One-off conversion of the combined TSV into a Parquet dataset so that later
# cells can load one part file at a time with only the columns they need.
raw_data = dd.read_csv('data/adaptive-ISB-combined.tsv', sep='\t')
raw_data = raw_data.repartition(partition_size="20GB")
# Write into the directory that the per-call cells glob
# ('data/adaptive_combined/part.*.parquet'). The previous target,
# 'data/adaptive_combined.parquet', put the part files in a different
# directory, so those globs matched nothing on a fresh run.
raw_data.to_parquet('data/adaptive_combined')

# V_call

In [None]:
%%time
df1 = pd.DataFrame()
for parq_file in glob.glob('data/adaptive_combined/part.*.parquet'):
    
    data = pd.read_parquet(parq_file,columns=['v_call','repertoire_id']).drop_duplicates().reset_index(drop=True)
    count_vec = CountVectorizer( analyzer='word', tokenizer=lambda x: re.split(r', |or\s', x),
                                ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)
    count_train = count_vec.fit(data.v_call)
    bag_of_words = count_vec.transform(data.v_call)
    rr = pd.DataFrame(bag_of_words.toarray(), columns = count_vec.get_feature_names()).drop([''],axis=1)
    cols = rr.columns
    print(parq_file)
    df = pd.concat([data[['repertoire_id']],rr],axis=1)
    df = df.groupby('repertoire_id')[cols].sum().reset_index()
    df1 = pd.concat([df1, df], sort=False).fillna(0)
    cols = df1.drop('repertoire_id',axis=1).columns
    df1 = df1.groupby('repertoire_id')[cols].sum().reset_index()
    del data
    del bag_of_words
    del rr
    del df
df2.to_parquet('data/adaptiveisb1_v_call.parquet')

In [None]:
%%time
data = pd.read_parquet('data/adaptiveisb1_v_call.parquet')
cols = data.drop('repertoire_id',axis=1).columns
data = data.groupby('repertoire_id')[cols].sum().reset_index()
dff = pd.merge(metadata, data, on='repertoire_id', how='inner')
dff.to_parquet('data/v_call.parquet')

# D_call

In [None]:
%%time
df2 = pd.DataFrame()
for parq_file in glob.glob('data/adaptive_combined/part.*.parquet'):
    
    data = pd.read_parquet(parq_file,columns=['d_call','repertoire_id']).drop_duplicates().reset_index(drop=True)
    count_vec = CountVectorizer( analyzer='word', tokenizer=lambda x: re.split(r', |or\s', x),
                                ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)
    data.d_call.fillna(value='None', inplace=True)
    count_train = count_vec.fit(data.d_call)
    bag_of_words = count_vec.transform(data.d_call)
    rr = pd.DataFrame(bag_of_words.toarray(), columns = count_vec.get_feature_names()).drop(['','none'],axis=1)
    cols = rr.columns
    print(parq_file)
    df = pd.concat([data[['repertoire_id']],rr],axis=1)
    df = df.groupby('repertoire_id')[cols].sum().reset_index()
    df2 = pd.concat([df2, df], sort=False).fillna(0)
    cols = df2.drop('repertoire_id',axis=1).columns
    df2 = df2.groupby('repertoire_id')[cols].sum().reset_index()
    del data
    del bag_of_words
    del rr
    del df

In [None]:
# Persist the per-repertoire D-call count table held in `df2`.
d_call_counts_path = 'data/adaptiveisb2_d_call.parquet'
df2.to_parquet(d_call_counts_path)

In [None]:
# Reload the D-call count table from disk and collapse it to one row per
# repertoire_id by summing every count column.
data = pd.read_parquet('data/adaptiveisb2_d_call.parquet')
cols = data.columns.drop('repertoire_id')
data = data.groupby('repertoire_id')[cols].sum().reset_index()

In [None]:
# Attach metadata to the D-call counts. The inner join keeps only repertoires
# present in both frames. Store the joined table.
dff = metadata.merge(data, how='inner', on='repertoire_id')
dff.to_parquet('data/d_call.parquet')

# J_call

In [None]:
%%time
df2 = pd.DataFrame()
for parq_file in glob.glob('data/adaptive_combined/part.*.parquet'):
    
    data = pd.read_parquet(parq_file,columns=['j_call','repertoire_id']).drop_duplicates().reset_index(drop=True)
    count_vec = CountVectorizer( analyzer='word', tokenizer=lambda x: re.split(r', |or\s', x),
                                ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)
    data.j_call.fillna(value='None', inplace=True)
    count_train = count_vec.fit(data.j_call)
    bag_of_words = count_vec.transform(data.j_call)
    rr = pd.DataFrame(bag_of_words.toarray(), columns = count_vec.get_feature_names()).drop([''],axis=1)
    cols = rr.columns
    print(parq_file)
    df = pd.concat([data[['repertoire_id']],rr],axis=1)
    df = df.groupby('repertoire_id')[cols].sum().reset_index()
    df2 = pd.concat([df2, df], sort=False).fillna(0)
    cols = df2.drop('repertoire_id',axis=1).columns
    df2 = df2.groupby('repertoire_id')[cols].sum().reset_index()
    del data
    del bag_of_words
    del rr
    del df

In [None]:
# Persist the per-repertoire J-call count table held in `df2`.
# The doubled '.parquet' suffix is kept as-is because the following cell reads
# this exact path.
j_call_counts_path = 'data/adaptiveisb2_j_call.parquet.parquet'
df2.to_parquet(j_call_counts_path)

In [None]:
# Reload the J-call count table, collapse it to one row per repertoire_id and
# attach the metadata columns. The inner join keeps only repertoires present in
# both frames.
data = pd.read_parquet('data/adaptiveisb2_j_call.parquet.parquet')
cols = data.columns.drop('repertoire_id')
data = data.groupby('repertoire_id')[cols].sum().reset_index()
dff = metadata.merge(data, how='inner', on='repertoire_id')

In [None]:
# Store the metadata-joined J-call table.
j_call_output_path = 'data/j_call.parquet'
dff.to_parquet(j_call_output_path)