# Feature Counts Distribution

Get mean and standard deviation for the feature counts used by Conor.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
results_dir = "/home/eulalio/BMI212/2019_data/"

# read in features counts - downloaded from BQ
features_counts_file = "{}/traige_features_counts_long.csv".format(results_dir)
features_counts = pd.read_csv(features_counts_file)

In [3]:
features_counts.shape

(10594979, 6)

In [4]:
features_counts.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
0,JCe55538,131257458220,2018-08-28 00:35:00 UTC,Meds,DOCUSATE SODIUM 250 MG PO CAPS,1
1,JCe55538,131257458220,2018-08-28 00:35:00 UTC,Meds,LACTOBACILLUS RHAMNOSUS GG 15 BILLION CELL PO ...,7
2,JCe55538,131257458220,2018-08-28 00:35:00 UTC,Diagnosis,Z90.411,1
3,JCe55538,131257458220,2018-08-28 00:35:00 UTC,Meds,PROCHLORPERAZINE MALEATE 5 MG PO TABS,1
4,JCe55538,131257458220,2018-08-28 00:35:00 UTC,Meds,MORPHINE 4 MG/ML INJ SYRG,2


In [5]:
grouped_mean = features_counts.groupby(['feature_type', 'features'])[['values']].describe()

In [6]:
grouped_mean = grouped_mean.reset_index()

In [7]:
grouped_mean.head()

Unnamed: 0_level_0,feature_type,features,values,values,values,values,values,values,values,values
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max
0,Diagnosis,A01.00,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1,Diagnosis,A02.0,32.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,Diagnosis,A02.1,13.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
3,Diagnosis,A02.24,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
4,Diagnosis,A02.8,7.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [8]:
savefile = "{}/feature_counts_individual.csv".format(results_dir)

In [9]:
grouped_mean.to_csv(savefile, index=False)

# Group by only the feature types

In [10]:
features_counts.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
0,JCe55538,131257458220,2018-08-28 00:35:00 UTC,Meds,DOCUSATE SODIUM 250 MG PO CAPS,1
1,JCe55538,131257458220,2018-08-28 00:35:00 UTC,Meds,LACTOBACILLUS RHAMNOSUS GG 15 BILLION CELL PO ...,7
2,JCe55538,131257458220,2018-08-28 00:35:00 UTC,Diagnosis,Z90.411,1
3,JCe55538,131257458220,2018-08-28 00:35:00 UTC,Meds,PROCHLORPERAZINE MALEATE 5 MG PO TABS,1
4,JCe55538,131257458220,2018-08-28 00:35:00 UTC,Meds,MORPHINE 4 MG/ML INJ SYRG,2


In [11]:
feature_mean = features_counts.groupby(['feature_type', 'pat_enc_csn_id_coded'])[['values']].sum().reset_index()

In [12]:
feature_mean

Unnamed: 0,feature_type,pat_enc_csn_id_coded,values
0,Diagnosis,131062667066,68
1,Diagnosis,131062745090,91
2,Diagnosis,131062747648,248
3,Diagnosis,131062788358,5
4,Diagnosis,131063044001,44
...,...,...,...
402530,vitals_train,131282654066,52
402531,vitals_train,131282667784,29
402532,vitals_train,131282673144,16
402533,vitals_train,131282708895,22


In [13]:
featuretype_count = feature_mean.groupby(['feature_type'])[['values']].describe().reset_index()

In [14]:
featuretype_count

Unnamed: 0_level_0,feature_type,values,values,values,values,values,values,values,values
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
0,Diagnosis,37253.0,74.1427,79.091173,1.0,18.0,48.0,103.0,643.0
1,Imaging,39900.0,23.757995,32.618299,1.0,5.0,12.0,29.0,670.0
2,Lab,42879.0,168.061429,315.365219,1.0,20.0,53.0,176.0,7971.0
3,Meds,42125.0,127.258113,245.908809,1.0,9.0,35.0,139.0,6096.0
4,Microbiology,1382.0,2.476122,1.407725,1.0,2.0,2.0,2.0,24.0
5,Microbiology Culture,16883.0,4.928508,2.492813,1.0,4.0,4.0,6.0,34.0
6,Procedures,18699.0,5.719985,10.912486,1.0,2.0,3.0,6.0,928.0
7,demo,41654.0,309.115283,33.383988,164.0,287.0,308.0,330.0,552.0
8,labs_results_test,39226.0,34.883776,12.943731,1.0,29.0,32.0,39.0,243.0
9,labs_results_train,39226.0,34.883776,12.943731,1.0,29.0,32.0,39.0,243.0


In [15]:
feature_mean[feature_mean.feature_type == 'Microbiology']

Unnamed: 0,feature_type,pat_enc_csn_id_coded,values
162157,Microbiology,131063329976,8
162158,Microbiology,131063482758,10
162159,Microbiology,131063531343,2
162160,Microbiology,131063642984,2
162161,Microbiology,131063868113,2
...,...,...,...
163534,Microbiology,131282002169,2
163535,Microbiology,131282106090,4
163536,Microbiology,131282218216,2
163537,Microbiology,131282353799,2


In [16]:
grouped_micro = feature_mean.copy()

grouped_micro.loc[(grouped_micro['feature_type'] == 'Microbiology Culture'), 'feature_type'] = 'Microbiology'

In [25]:
feature_mean.feature_type.value_counts()

Lab                     42879
Meds                    42125
vitals_train            41654
vitals_test             41654
demo                    41654
Imaging                 39900
labs_results_train      39226
labs_results_test       39226
Diagnosis               37253
Procedures              18699
Microbiology Culture    16883
Microbiology             1382
Name: feature_type, dtype: int64

In [26]:
grouped_micro.feature_type.value_counts()

Lab                   42879
Meds                  42125
vitals_train          41654
vitals_test           41654
demo                  41654
Imaging               39900
labs_results_train    39226
labs_results_test     39226
Diagnosis             37253
Procedures            18699
Microbiology          18265
Name: feature_type, dtype: int64

In [17]:
grouped_micro.head()

Unnamed: 0,feature_type,pat_enc_csn_id_coded,values
0,Diagnosis,131062667066,68
1,Diagnosis,131062745090,91
2,Diagnosis,131062747648,248
3,Diagnosis,131062788358,5
4,Diagnosis,131063044001,44


In [18]:
grouped_micro.feature_type.unique()

array(['Diagnosis', 'Imaging', 'Lab', 'Meds', 'Microbiology',
       'Procedures', 'demo', 'labs_results_test', 'labs_results_train',
       'vitals_test', 'vitals_train'], dtype=object)

In [19]:
grouped_micro[grouped_micro.feature_type == 'Microbiology']

Unnamed: 0,feature_type,pat_enc_csn_id_coded,values
162157,Microbiology,131063329976,8
162158,Microbiology,131063482758,10
162159,Microbiology,131063531343,2
162160,Microbiology,131063642984,2
162161,Microbiology,131063868113,2
...,...,...,...
180417,Microbiology,131282431253,2
180418,Microbiology,131282440635,4
180419,Microbiology,131282507776,8
180420,Microbiology,131282667784,6


In [20]:
grouped_micro_count = grouped_micro.groupby(['feature_type'])[['values']].describe().reset_index()

In [21]:
grouped_micro_count

Unnamed: 0_level_0,feature_type,values,values,values,values,values,values,values,values
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
0,Diagnosis,37253.0,74.1427,79.091173,1.0,18.0,48.0,103.0,643.0
1,Imaging,39900.0,23.757995,32.618299,1.0,5.0,12.0,29.0,670.0
2,Lab,42879.0,168.061429,315.365219,1.0,20.0,53.0,176.0,7971.0
3,Meds,42125.0,127.258113,245.908809,1.0,9.0,35.0,139.0,6096.0
4,Microbiology,18265.0,4.742951,2.512846,1.0,4.0,4.0,6.0,34.0
5,Procedures,18699.0,5.719985,10.912486,1.0,2.0,3.0,6.0,928.0
6,demo,41654.0,309.115283,33.383988,164.0,287.0,308.0,330.0,552.0
7,labs_results_test,39226.0,34.883776,12.943731,1.0,29.0,32.0,39.0,243.0
8,labs_results_train,39226.0,34.883776,12.943731,1.0,29.0,32.0,39.0,243.0
9,vitals_test,41654.0,29.587483,28.979134,6.0,13.0,21.0,35.0,534.0
