In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Load the combined table from disk; index_col=0 makes the first CSV column
# the DataFrame index instead of a regular data column.
df_train = pd.read_csv("output_files/combined.csv", index_col=0)

In [9]:
# Lower-case the free-text reported terms so that tokens differing only in
# case map to the same vocabulary entry later on. The vectorised `.str`
# accessor is used instead of a per-element `apply`; it also passes missing
# values through as NaN rather than raising on them at this step.
df_train['REPORTED_TERM'] = df_train['REPORTED_TERM'].str.lower()

In [10]:
# Preview the first rows to eyeball the columns and the lower-cased REPORTED_TERM values.
df_train.head()

Unnamed: 0,ART_CODE,DESC_CODED,HLGT_NAME_COMPL,HLT_NAME_COMPL,INC_CODE,INC_CODE_J,LLT_NAME_COMPL,PT_NAME_COMPL,REPORTED_TERM,SOC_CODE,len
0,4356,Hyponatraemia,ELECTROLYTE AND FLUID BALANCE CONDITIONS,SODIUM IMBALANCE,10021038.0,10021036,HYPONATREMIA,HYPONATRAEMIA,hyponatremia,10027433.0,1
1,12574,Subacute cutaneous lupus erythematosus,EPIDERMAL AND DERMAL CONDITIONS,CONNECTIVE TISSUE DISORDERS,10057903.0,10057903,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,omeprazole induced subacute cutaneous lupus er...,10040785.0,4
2,1214,Blood bilirubin unconjugated increased,HEPATOBILIARY INVESTIGATIONS,LIVER FUNCTION ANALYSES,10021709.0,10021709,INDIRECT BILIRUBIN INCREASED,BLOOD BILIRUBIN UNCONJUGATED INCREASED,indirect bilirubin (74.7 micromol/l),10022891.0,4
3,8134,toxic epidermal necrolysis,EPIDERMAL AND DERMAL CONDITIONS,BULLOUS CONDITIONS,10044223.0,10044223,TOXIC EPIDERMAL NECROLYSIS,TOXIC EPIDERMAL NECROLYSIS,toxic epidermal necrolysis,10040785.0,3
4,1597,Bradycardia,CARDIAC ARRHYTHMIAS,RATE AND RHYTHM DISORDERS NEC,10006093.0,10006093,BRADYCARDIA,BRADYCARDIA,bradycardia,10007541.0,1


In [11]:
# Pull the reported terms out of the frame as a plain Python list of strings.
terms_list = df_train['REPORTED_TERM'].tolist()

In [13]:
# Concatenate every reported term into one space-separated string (the corpus).
all_text = ' '.join(terms_list)

In [18]:
# Tokenise the corpus on runs of whitespace; punctuation stays attached to its token.
words = all_text.split()

#### Encoding Words

In [21]:
from collections import Counter

In [22]:
# Tally how often each token occurs in the corpus.
counts = Counter(words)
# Vocabulary ordered from most to least frequent token; tokens with equal
# counts keep the order in which they were first seen.
vocab = [token for token, _ in counts.most_common()]
# Token ids start at 1, which leaves 0 unused by any token (pad_features
# uses 0 as its padding value).
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

In [26]:
# Encode every reported term as the list of integer ids of its tokens,
# keeping the terms in the same order as terms_list.
reported_terms_ints = [
    [vocab_to_int[token] for token in reported_term.split()]
    for reported_term in terms_list
]

In [36]:
# Length histogram used to spot outliers: maps a token count to the number
# of encoded terms that have exactly that many tokens.
reported_term_lens = Counter(map(len, reported_terms_ints))

In [37]:
# A count under key 0 would mean some reported term produced no tokens at all.
print("Zero-length reviews: {}".format(reported_term_lens.get(0, 0)))
# The histogram's keys are the lengths, so the largest key is the longest term.
print("Maximum review length: {}".format(max(reported_term_lens.keys())))

Zero-length reviews: 0
Maximum review length: 43


In [38]:
# Number of encoded reported terms (one entry per element of terms_list).
len(reported_terms_ints)

63233

In [39]:
def pad_features(reported_terms_ints, seq_length):
    ''' Left-pad or truncate every encoded term to exactly seq_length columns.

        Args:
            reported_terms_ints: sequence of rows, each a sequence of integer
                token ids (rows may have different lengths, including zero).
            seq_length: number of columns in the returned matrix.

        Returns:
            Integer array of shape (len(reported_terms_ints), seq_length).
            Rows shorter than seq_length are padded with 0's on the left;
            longer rows keep only their first seq_length ids.
    '''

    # Start from an all-zero matrix so untouched cells already hold the padding value.
    features = np.zeros((len(reported_terms_ints), seq_length), dtype=int)

    for i, row in enumerate(reported_terms_ints):
        kept = np.asarray(row)[:seq_length]
        # An empty row stays all padding. It has to be skipped explicitly:
        # the slice `-0:` selects the whole row rather than an empty tail, and
        # assigning an empty array into it raises a broadcasting error.
        if kept.size == 0:
            continue
        # Right-align the ids so the padding zeros sit at the front of the row.
        features[i, -kept.size:] = kept

    return features

In [41]:
# Every encoded term is left-padded (or truncated) to this many columns. The
# longest term measured earlier in the notebook is 43 tokens, so 200 leaves
# ample headroom and most of each row is padding.
seq_length = 200

features = pad_features(reported_terms_ints, seq_length)

In [48]:
# Display the padded matrix; NumPy abbreviates large arrays, so only the corners are shown.
features

array([[    0,     0,     0, ...,     0,     0,   413],
       [    0,     0,     0, ...,  1952,     2,   332],
       [    0,     0,     0, ...,   286, 12175,  1640],
       ...,
       [    0,     0,     0, ...,    30,    51,   282],
       [    0,     0,     0, ...,  1858,    10,   182],
       [    0,     0,     0, ...,     0,     0,   428]])

In [47]:
# Row-count check: pad_features builds one row per entry of reported_terms_ints.
len(reported_terms_ints)

63233