In [102]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'darkgrid' style for any plots drawn later in the notebook.
sns.set_style(style='darkgrid')

#### Reading Training Data

In [103]:
# Load the training split (path is relative to the notebook's working directory).
df_train = pd.read_csv(filepath_or_buffer='data/train.csv')

In [104]:
# Remove rows in which every column is missing; the frame is modified in place.
df_train.dropna(how='all', axis='index', inplace=True)

In [105]:
# Keep only the rows that carry an INC_CODE value.
df_train = df_train[df_train['INC_CODE'].notna()]

In [126]:
# Row count of the cleaned training frame.
no_train = len(df_train)

In [127]:
# Report how many training rows remain after cleaning.
print("No of data points in training set: ", no_train, sep=' ')

No of data points in training set:  45226


In [108]:
# Cast the three code columns to integers in one step.
# `astype` with a column -> dtype mapping returns a new frame, so this cell no
# longer writes into the filtered frame column by column and no longer relies
# on attribute-style column assignment (`df.COL = ...`), which pandas
# discourages because it silently creates a plain attribute when the column
# name is misspelled or absent.
# NOTE: rows with a missing INC_CODE were dropped in an earlier cell;
# INC_CODE_J and ART_CODE are assumed to be complete as well -- `astype(int)`
# raises if any NaN remains.
df_train = df_train.astype({
    'INC_CODE_J': int,
    'ART_CODE': int,
    'INC_CODE': int,
})

In [109]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,REPORTED_TERM,DESC_CODED,LLT_NAME_COMPL,PT_NAME_COMPL,HLT_NAME_COMPL,HLGT_NAME_COMPL,SOC_CODE,INC_CODE_J,ART_CODE,INC_CODE
0,HYPONATREMIA,Hyponatraemia,HYPONATREMIA,HYPONATRAEMIA,SODIUM IMBALANCE,ELECTROLYTE AND FLUID BALANCE CONDITIONS,10027433,10021036,10021036,10021038
1,OMEPRAZOLE INDUCED SUBACUTE CUTANEOUS LUPUS ER...,Subacute cutaneous lupus erythematosus,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,CONNECTIVE TISSUE DISORDERS,EPIDERMAL AND DERMAL CONDITIONS,10040785,10057903,10057903,10057903
2,INDIRECT BILIRUBIN (74.7 MICROMOL/L),Blood bilirubin unconjugated increased,INDIRECT BILIRUBIN INCREASED,BLOOD BILIRUBIN UNCONJUGATED INCREASED,LIVER FUNCTION ANALYSES,HEPATOBILIARY INVESTIGATIONS,10022891,10021709,10005370,10021709
3,TOXIC EPIDERMAL NECROLYSIS,toxic epidermal necrolysis,TOXIC EPIDERMAL NECROLYSIS,TOXIC EPIDERMAL NECROLYSIS,BULLOUS CONDITIONS,EPIDERMAL AND DERMAL CONDITIONS,10040785,10044223,10044223,10044223
4,BRADYCARDIA,Bradycardia,BRADYCARDIA,BRADYCARDIA,RATE AND RHYTHM DISORDERS NEC,CARDIAC ARRHYTHMIAS,10007541,10006093,10006093,10006093


#### Reading Test data

In [110]:
# Load the test split (path is relative to the notebook's working directory).
df_test = pd.read_csv(filepath_or_buffer='data/test.csv')

In [111]:
# Preview the first five test rows.
df_test.head(n=5)

Unnamed: 0,REPORTED_TERM,DESC_CODED,LLT_NAME_COMPL,PT_NAME_COMPL,HLGT_NAME_COMPL,HLT_NAME_COMPL,SOC_CODE,INC_CODE_J,ART_CODE,INC_CODE
0,GRADE 2 HAND-FOOT SYNDROME,Palmar-plantar erythrodysaesthesia syndrome,HAND AND FOOT SYNDROME,PALMAR-PLANTAR ERYTHRODYSAESTHESIA SYNDROME,EPIDERMAL AND DERMAL CONDITIONS,DERMATITIS ASCRIBED TO SPECIFIC AGENT,10040785,10019111,10033553,10019111.0
1,BRADYCARDIA,Bradycardia,BRADYCARDIA,BRADYCARDIA,CARDIAC ARRHYTHMIAS,RATE AND RHYTHM DISORDERS NEC,10007541,10006093,10006093,10006093.0
2,ACUTE ONSET IMBALANCE,Balance disorder,BALANCE DISORDER,BALANCE DISORDER,NEUROLOGICAL DISORDERS NEC,COORDINATION AND BALANCE DISTURBANCES,10029205,10049848,10049848,10049848.0
3,PSEUDOCIRRHOSIS,pseudocirrhosis,PSEUDOCIRRHOSIS,PSEUDOCIRRHOSIS,HEPATIC AND HEPATOBILIARY DISORDERS,HEPATIC FIBROSIS AND CIRRHOSIS,10019805,10076501,10076501,10076501.0
4,HYPERHIDROSIS,Hyperhidrosis,HYPERHIDROSIS,HYPERHIDROSIS,SKIN APPENDAGE CONDITIONS,APOCRINE AND ECCRINE GLAND DISORDERS,10040785,10020642,10020642,10020642.0


In [112]:
# Remove rows in which every column is missing; the frame is modified in place.
df_test.dropna(how='all', axis='index', inplace=True)

In [128]:
# Row count of the test frame after dropping fully-empty rows.
no_test = len(df_test)

In [129]:
# Report how many test rows remain after cleaning.
print("No of data points in testing set: ", no_test, sep=' ')

No of data points in testing set:  19408


#### Reading MEDDRA data

In [114]:
# Load the MedDRA term table (path is relative to the notebook's working directory).
df_med = pd.read_csv(filepath_or_buffer='data/meddra.csv')

In [115]:
# Normalise the MedDRA column names to snake_case; the frame is modified in place.
df_med.rename(
    columns={
        'ID': 'id',
        'Term': 'term',
        'Primary SOC': 'primary_soc',
    },
    inplace=True,
)

In [116]:
# Preview the first five MedDRA rows.
df_med.head(n=5)

Unnamed: 0,id,term,primary_soc
0,10000002,11-beta-hydroxylase deficiency,10010331
1,10000005,17 ketosteroids urine,10022891
2,10000007,17 ketosteroids urine decreased,10022891
3,10000009,17 ketosteroids urine increased,10022891
4,10000011,17 ketosteroids urine normal,10022891


In [131]:
# Number of rows (terms) in the MedDRA table.
no_terms = len(df_med)

In [132]:
# Report the size of the MedDRA table.
print("Total no of MEDDRA terms: ", no_terms, sep=' ')

Total no of MEDDRA terms:  21612


### Checking Terms Present in Train and Test but not in MEDDRA dict

In [137]:
# Count training rows whose ART_CODE does not appear among the MedDRA ids.
# Summing the boolean mask gives the same number as filtering the frame and
# reading its row count.
print("Total train", no_train)
missing_train = int((~df_train['ART_CODE'].isin(df_med['id'])).sum())
print("No of Missing training set terms: ", missing_train)

Total train 45226
No of Missing training set terms:  993


In [138]:
# Count test rows whose ART_CODE does not appear among the MedDRA ids.
# Summing the boolean mask gives the same number as filtering the frame and
# reading its row count.
print("Total test", no_test)
missing_test = int((~df_test['ART_CODE'].isin(df_med['id'])).sum())
print("No of Missing testing set terms: ", missing_test)

Total test 19408
No of Missing testing set terms:  381


In [139]:
# Select the training rows whose ART_CODE is absent from the MedDRA ids.
# Note: despite its generic name, `train_df` holds only these unmatched rows.
train_df = df_train[~df_train['ART_CODE'].isin(df_med['id'])]

In [140]:
# Write the unmatched training rows to the working directory (row index included).
train_df.to_csv(path_or_buf='train_modified.csv')

In [141]:
# Select the test rows whose ART_CODE is absent from the MedDRA ids.
# Note: despite its generic name, `test_df` holds only these unmatched rows.
test_df = df_test[~df_test['ART_CODE'].isin(df_med['id'])]

In [142]:
# Write the unmatched test rows to the working directory (row index included).
test_df.to_csv(path_or_buf='test_modified.csv')

In [30]:
# Number of distinct reported terms in the training set.
df_train['REPORTED_TERM'].nunique()

28907

In [31]:
# Number of distinct reported terms in the test set.
df_test['REPORTED_TERM'].nunique()

13750

In [59]:
# Number of distinct INC_CODE_J values in the training set.
df_train['INC_CODE_J'].nunique()

8277

In [60]:
# Number of distinct INC_CODE_J values in the test set.
df_test['INC_CODE_J'].nunique()

5218