In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's dark grid theme for every figure drawn in this notebook.
sns.set_style(style='darkgrid')

#### Reading Training Data

In [2]:
# Load the raw training rows from disk.
df_train = pd.read_csv(filepath_or_buffer='data/train.csv')

In [3]:
# Drop rows in which every column is missing. The name is rebound instead of
# mutating with `inplace=True`, so the cell is a plain input -> output step.
df_train = df_train.dropna(axis=0, how='all')

In [4]:
# Keep only the rows that actually carry an INC_CODE value.
df_train = df_train.loc[df_train['INC_CODE'].notnull()]

In [5]:
# Number of training rows left after the cleaning steps above.
no_train = len(df_train)

In [6]:
# Report the training-set size (label and count joined by print's single-space separator).
print("No of data points in training set: ", no_train, sep=" ")

No of data points in training set:  45226


In [7]:
# Cast the three code columns to plain integers in a single call.
# `astype` with a column -> dtype mapping returns a new frame, so nothing is
# assigned through attribute access (`df.COL = ...`) on a frame that came out
# of a boolean filter -- the pattern that can trigger SettingWithCopyWarning.
# As with the per-column casts, this raises if a column still holds values
# that cannot be represented as integers (e.g. NaN).
df_train = df_train.astype({'INC_CODE_J': int, 'ART_CODE': int, 'INC_CODE': int})

In [8]:
# Preview the first five cleaned training rows.
df_train.head(n=5)

Unnamed: 0,REPORTED_TERM,DESC_CODED,LLT_NAME_COMPL,PT_NAME_COMPL,HLT_NAME_COMPL,HLGT_NAME_COMPL,SOC_CODE,INC_CODE_J,ART_CODE,INC_CODE
0,HYPONATREMIA,Hyponatraemia,HYPONATREMIA,HYPONATRAEMIA,SODIUM IMBALANCE,ELECTROLYTE AND FLUID BALANCE CONDITIONS,10027433,10021036,10021036,10021038
1,OMEPRAZOLE INDUCED SUBACUTE CUTANEOUS LUPUS ER...,Subacute cutaneous lupus erythematosus,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,CONNECTIVE TISSUE DISORDERS,EPIDERMAL AND DERMAL CONDITIONS,10040785,10057903,10057903,10057903
2,INDIRECT BILIRUBIN (74.7 MICROMOL/L),Blood bilirubin unconjugated increased,INDIRECT BILIRUBIN INCREASED,BLOOD BILIRUBIN UNCONJUGATED INCREASED,LIVER FUNCTION ANALYSES,HEPATOBILIARY INVESTIGATIONS,10022891,10021709,10005370,10021709
3,TOXIC EPIDERMAL NECROLYSIS,toxic epidermal necrolysis,TOXIC EPIDERMAL NECROLYSIS,TOXIC EPIDERMAL NECROLYSIS,BULLOUS CONDITIONS,EPIDERMAL AND DERMAL CONDITIONS,10040785,10044223,10044223,10044223
4,BRADYCARDIA,Bradycardia,BRADYCARDIA,BRADYCARDIA,RATE AND RHYTHM DISORDERS NEC,CARDIAC ARRHYTHMIAS,10007541,10006093,10006093,10006093


#### Reading Test data

In [9]:
# Load the raw test rows from disk.
df_test = pd.read_csv(filepath_or_buffer='data/test.csv')

In [10]:
# Preview the first five raw test rows.
df_test.head(n=5)

Unnamed: 0,REPORTED_TERM,DESC_CODED,LLT_NAME_COMPL,PT_NAME_COMPL,HLGT_NAME_COMPL,HLT_NAME_COMPL,SOC_CODE,INC_CODE_J,ART_CODE,INC_CODE
0,GRADE 2 HAND-FOOT SYNDROME,Palmar-plantar erythrodysaesthesia syndrome,HAND AND FOOT SYNDROME,PALMAR-PLANTAR ERYTHRODYSAESTHESIA SYNDROME,EPIDERMAL AND DERMAL CONDITIONS,DERMATITIS ASCRIBED TO SPECIFIC AGENT,10040785,10019111,10033553,10019111.0
1,BRADYCARDIA,Bradycardia,BRADYCARDIA,BRADYCARDIA,CARDIAC ARRHYTHMIAS,RATE AND RHYTHM DISORDERS NEC,10007541,10006093,10006093,10006093.0
2,ACUTE ONSET IMBALANCE,Balance disorder,BALANCE DISORDER,BALANCE DISORDER,NEUROLOGICAL DISORDERS NEC,COORDINATION AND BALANCE DISTURBANCES,10029205,10049848,10049848,10049848.0
3,PSEUDOCIRRHOSIS,pseudocirrhosis,PSEUDOCIRRHOSIS,PSEUDOCIRRHOSIS,HEPATIC AND HEPATOBILIARY DISORDERS,HEPATIC FIBROSIS AND CIRRHOSIS,10019805,10076501,10076501,10076501.0
4,HYPERHIDROSIS,Hyperhidrosis,HYPERHIDROSIS,HYPERHIDROSIS,SKIN APPENDAGE CONDITIONS,APOCRINE AND ECCRINE GLAND DISORDERS,10040785,10020642,10020642,10020642.0


In [11]:
# Drop rows in which every column is missing. The name is rebound instead of
# mutating with `inplace=True`, so the frame that was previewed earlier is not
# silently changed underneath the reader.
df_test = df_test.dropna(axis=0, how='all')

In [12]:
# Number of test rows left after dropping fully-empty rows.
no_test = len(df_test)

In [13]:
# Report the test-set size (label and count joined by print's single-space separator).
print("No of data points in testing set: ", no_test, sep=" ")

No of data points in testing set:  19408


#### Reading MEDDRA data

In [15]:
# Load the MedDRA term table from disk.
df_med = pd.read_csv(filepath_or_buffer='data/meddra.csv')

In [16]:
# Normalise the MedDRA column names to snake_case so they can be used as
# attributes (`df_med.id`). Rebinding instead of `inplace=True` keeps the cell
# a plain input -> output step.
df_med = df_med.rename(
    columns={'ID': 'id', 'Term': 'term', 'Primary SOC': 'primary_soc'}
)

In [17]:
# Preview the first five MedDRA rows with the renamed columns.
df_med.head(n=5)

Unnamed: 0,id,term,primary_soc
0,10000002,11-beta-hydroxylase deficiency,10010331
1,10000005,17 ketosteroids urine,10022891
2,10000007,17 ketosteroids urine decreased,10022891
3,10000009,17 ketosteroids urine increased,10022891
4,10000011,17 ketosteroids urine normal,10022891


In [18]:
# Number of rows in the MedDRA term table.
no_terms = len(df_med)

In [19]:
# Report the size of the MedDRA table (label and count joined by print's single-space separator).
print("Total no of MEDDRA terms: ", no_terms, sep=" ")

Total no of MEDDRA terms:  21612


### Checking terms present in the train/test sets but missing from the MedDRA dictionary

In [20]:
# Count the training rows whose ART_CODE has no matching id in the MedDRA table.
print("Total train", no_train)
missing_train = int((~df_train['ART_CODE'].isin(df_med['id'])).sum())
print("No of Missing training set terms: ", missing_train)

Total train 45226
No of Missing training set terms:  993


In [21]:
# Count the test rows whose ART_CODE has no matching id in the MedDRA table.
print("Total test", no_test)
missing_test = int((~df_test['ART_CODE'].isin(df_med['id'])).sum())
print("No of Missing testing set terms: ", missing_test)

Total test 19408
No of Missing testing set terms:  381


In [85]:
# Training rows restricted to ART_CODE values that exist in the MedDRA table.
train_df = df_train.loc[df_train['ART_CODE'].isin(df_med['id'])]

In [86]:
# Display the (rows, columns) shape of the filtered training frame.
train_df.shape

(44233, 10)

In [87]:
# Persist the filtered training rows; the row index is written as well
# (pandas' default), so it comes back as an extra leading column on re-read.
train_df.to_csv(path_or_buf='output_files/train_modified.csv')

In [88]:
# Test rows restricted to ART_CODE values that exist in the MedDRA table.
test_df = df_test.loc[df_test['ART_CODE'].isin(df_med['id'])]

In [89]:
# Persist the filtered test rows; the row index is written as well
# (pandas' default), so it comes back as an extra leading column on re-read.
test_df.to_csv(path_or_buf='output_files/test_modified.csv')

In [23]:
# Number of distinct ART_CODE values seen in the training rows.
df_train['ART_CODE'].nunique()

4576

In [24]:
# Number of distinct ART_CODE values seen in the test rows.
df_test['ART_CODE'].nunique()

3167

In [29]:
# Test rows whose ART_CODE never occurs in the training rows.
df_test.loc[~df_test['ART_CODE'].isin(df_train['ART_CODE'])]

Unnamed: 0,REPORTED_TERM,DESC_CODED,LLT_NAME_COMPL,PT_NAME_COMPL,HLGT_NAME_COMPL,HLT_NAME_COMPL,SOC_CODE,INC_CODE_J,ART_CODE,INC_CODE
3,PSEUDOCIRRHOSIS,pseudocirrhosis,PSEUDOCIRRHOSIS,PSEUDOCIRRHOSIS,HEPATIC AND HEPATOBILIARY DISORDERS,HEPATIC FIBROSIS AND CIRRHOSIS,10019805,10076501,10076501,10076501.0
12,DILATED INTRAHEPATIC DUCT FOLLOWING BILIARY OB...,Dilatation intrahepatic duct acquired,DILATATION INTRAHEPATIC DUCT ACQUIRED,DILATATION INTRAHEPATIC DUCT ACQUIRED,BILE DUCT DISORDERS,STRUCTURAL AND OTHER BILE DUCT DISORDERS,10019805,10052383,10052383,10052383.0
56,TRACHEITIS,Tracheitis,TRACHEITIS,TRACHEITIS,INFECTIONS - PATHOGEN UNSPECIFIED,UPPER RESPIRATORY TRACT INFECTIONS,10021881,10044302,10044302,10044302.0
65,SPINAL SUBARACHNOID HEMORRHAGE IN THE CONTEXT ...,Metallosis of globe,SIDEROSIS OF GLOBE,METALLOSIS OF GLOBE,INJURIES NEC,EYE INJURIES NEC,10022117,10040665,10027440,10040665.0
70,LEFT LUMBO-CRURAL SCIATICA,Sciatica,SCIATICA,SCIATICA,SPINAL CORD AND NERVE ROOT DISORDERS,LUMBAR SPINAL CORD AND NERVE ROOT DISORDERS,10029205,10039674,10039674,10039674.0
99,"MILD CONJUNCTIVOCHALASIS IN BOTH EYES, MORE PR...",Conjunctivochalasis,CONJUNCTIVOCHALASIS,CONJUNCTIVOCHALASIS,"ANTERIOR EYE STRUCTURAL CHANGE, DEPOSIT AND DE...","CONJUNCTIVAL STRUCTURAL CHANGE, DEPOSIT AND DE...",10015919,10064132,10064132,10064132.0
103,ORBITAL SWELLING,orbital oedema,ORBITAL OEDEMA,ORBITAL OEDEMA,"OCULAR STRUCTURAL CHANGE, DEPOSIT AND DEGENERA...","ORBITAL STRUCTURAL CHANGE, DEPOSIT AND DEGENER...",10015919,10031051,10031051,10031051.0
163,HYPERSEGMENTED NEUTROPHILS (WITH 6 LOBES) DUE...,Neutrophil hypersegmented morphology present,NEUTROPHIL HYPERSEGMENTED MORPHOLOGY PRESENT,NEUTROPHIL HYPERSEGMENTED MORPHOLOGY PRESENT,HAEMATOLOGY INVESTIGATIONS (INCL BLOOD GROUPS),WHITE BLOOD CELL ANALYSES,10022891,10029374,10029374,10029374.0
199,SEVERE HUMORAL IMMUNODEFICIENCY,Humoral immune defect,DEFICIENCY OF HUMORAL IMMUNITY,HUMORAL IMMUNE DEFECT,IMMUNODEFICIENCY SYNDROMES,IMMUNODEFICIENCY DISORDERS NEC,10021428,10012134,10020464,10012134.0
208,LEARNING DIFFICULTIES,Learning disability,LEARNING DISABILITY,LEARNING DISABILITY,COGNITIVE AND ATTENTION DISORDERS AND DISTURBA...,LEARNING DISORDERS,10037175,10024092,10024092,10024092.0


In [63]:
# Stack the training and test rows into one frame.
# The two frames list their columns in a different order, so pandas has to
# align them by name. `sort=True` states the alignment explicitly: columns come
# out in alphabetical order, matching the preview rendered below this cell.
# Without it, older pandas emits a FutureWarning and newer pandas keeps the
# first frame's order, so the column layout would depend on the installed version.
# Original row labels are kept, so index values repeat across the two halves.
df_combined = pd.concat([df_train, df_test], axis=0, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [64]:
# Preview the first five rows of the stacked train + test frame.
df_combined.head(n=5)

Unnamed: 0,ART_CODE,DESC_CODED,HLGT_NAME_COMPL,HLT_NAME_COMPL,INC_CODE,INC_CODE_J,LLT_NAME_COMPL,PT_NAME_COMPL,REPORTED_TERM,SOC_CODE
0,10021036,Hyponatraemia,ELECTROLYTE AND FLUID BALANCE CONDITIONS,SODIUM IMBALANCE,10021038.0,10021036,HYPONATREMIA,HYPONATRAEMIA,HYPONATREMIA,10027433
1,10057903,Subacute cutaneous lupus erythematosus,EPIDERMAL AND DERMAL CONDITIONS,CONNECTIVE TISSUE DISORDERS,10057903.0,10057903,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,OMEPRAZOLE INDUCED SUBACUTE CUTANEOUS LUPUS ER...,10040785
2,10005370,Blood bilirubin unconjugated increased,HEPATOBILIARY INVESTIGATIONS,LIVER FUNCTION ANALYSES,10021709.0,10021709,INDIRECT BILIRUBIN INCREASED,BLOOD BILIRUBIN UNCONJUGATED INCREASED,INDIRECT BILIRUBIN (74.7 MICROMOL/L),10022891
3,10044223,toxic epidermal necrolysis,EPIDERMAL AND DERMAL CONDITIONS,BULLOUS CONDITIONS,10044223.0,10044223,TOXIC EPIDERMAL NECROLYSIS,TOXIC EPIDERMAL NECROLYSIS,TOXIC EPIDERMAL NECROLYSIS,10040785
4,10006093,Bradycardia,CARDIAC ARRHYTHMIAS,RATE AND RHYTHM DISORDERS NEC,10006093.0,10006093,BRADYCARDIA,BRADYCARDIA,BRADYCARDIA,10007541


In [67]:
# Keep only the rows whose ART_CODE exists in the MedDRA table.
df_combined = df_combined.loc[df_combined['ART_CODE'].isin(df_med['id'])]

In [69]:
# Distinct MedDRA ids available as targets.
no_of_medra_terms = df_med['id'].nunique()

# 1-based row position in df_med -> MedDRA id.
# NOTE(review): numbering starts at 1, presumably to keep 0 free (e.g. padding) -- confirm.
int_to_med = dict(enumerate(df_med['id'].values, start=1))

# Inverse lookup: MedDRA id -> 1-based position.
med_id_to_int = {code: position for position, code in int_to_med.items()}

In [71]:
# Replace each MedDRA id in ART_CODE with its integer index from `med_id_to_int`.
# `assign` builds a new frame rather than writing a column into a frame that
# was produced by a boolean filter (the chained-assignment pattern pandas warns
# about). The explicit dict lookup is kept so that an id missing from the
# mapping still raises KeyError instead of silently turning into NaN.
df_combined = df_combined.assign(
    ART_CODE=df_combined['ART_CODE'].map(lambda code: med_id_to_int[code])
)

In [72]:
# Preview the stacked frame after ART_CODE has been re-indexed.
df_combined.head(n=5)

Unnamed: 0,ART_CODE,DESC_CODED,HLGT_NAME_COMPL,HLT_NAME_COMPL,INC_CODE,INC_CODE_J,LLT_NAME_COMPL,PT_NAME_COMPL,REPORTED_TERM,SOC_CODE
0,4356,Hyponatraemia,ELECTROLYTE AND FLUID BALANCE CONDITIONS,SODIUM IMBALANCE,10021038.0,10021036,HYPONATREMIA,HYPONATRAEMIA,HYPONATREMIA,10027433
1,12574,Subacute cutaneous lupus erythematosus,EPIDERMAL AND DERMAL CONDITIONS,CONNECTIVE TISSUE DISORDERS,10057903.0,10057903,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,OMEPRAZOLE INDUCED SUBACUTE CUTANEOUS LUPUS ER...,10040785
2,1214,Blood bilirubin unconjugated increased,HEPATOBILIARY INVESTIGATIONS,LIVER FUNCTION ANALYSES,10021709.0,10021709,INDIRECT BILIRUBIN INCREASED,BLOOD BILIRUBIN UNCONJUGATED INCREASED,INDIRECT BILIRUBIN (74.7 MICROMOL/L),10022891
3,8134,toxic epidermal necrolysis,EPIDERMAL AND DERMAL CONDITIONS,BULLOUS CONDITIONS,10044223.0,10044223,TOXIC EPIDERMAL NECROLYSIS,TOXIC EPIDERMAL NECROLYSIS,TOXIC EPIDERMAL NECROLYSIS,10040785
4,1597,Bradycardia,CARDIAC ARRHYTHMIAS,RATE AND RHYTHM DISORDERS NEC,10006093.0,10006093,BRADYCARDIA,BRADYCARDIA,BRADYCARDIA,10007541


In [90]:
# Discard rows that have no PT_NAME_COMPL value.
df_combined = df_combined.loc[df_combined['PT_NAME_COMPL'].notnull()]

In [91]:
# Add a 'len' column holding the number of whitespace-separated tokens in
# PT_NAME_COMPL. `assign` returns a new frame, so no column is written into a
# frame that came out of a boolean filter (the chained-assignment pattern
# pandas warns about). The column name is passed through a dict so that it
# stays the literal string 'len'.
df_combined = df_combined.assign(
    **{'len': df_combined['PT_NAME_COMPL'].map(lambda term: len(term.split()))}
)

In [92]:
# Longest PT_NAME_COMPL entry, measured in whitespace-separated tokens.
df_combined['len'].values.max()

10

In [93]:
# Persist the combined, re-indexed frame; the row index is written as well
# (pandas' default), so it comes back as an extra leading column on re-read.
df_combined.to_csv(path_or_buf="output_files/combined.csv")