# Smoking Status Classification

Spark NLP v 2.4.5

Spark NLP-JSL v 2.4.6

In [1]:
import os

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
from sparknlp.common import *
import sparknlp_jsl
spark = sparknlp_jsl.start('xxx')

In [2]:
import xmltodict

with open('data/smoker/smokers_surrogate_test_all_groundtruth_version2.xml', 'r') as f:
    text = f.read()
    
xml = xmltodict.parse(text)

In [13]:
xml['ROOT']['RECORD'][0].keys()

odict_keys(['@ID', 'SMOKING', 'TEXT'])

In [None]:
xml['ROOT']['RECORD'][0]['TEXT']

In [14]:
len(xml['ROOT']['RECORD'])

104

In [3]:
import pandas as pd

test_df = pd.DataFrame([(t['TEXT'],t['SMOKING']['@STATUS']) for t in xml['ROOT']['RECORD']], columns = ['text','label'])
test_df.label.value_counts()

UNKNOWN           63
NON-SMOKER        16
CURRENT SMOKER    11
PAST SMOKER       11
SMOKER             3
Name: label, dtype: int64

In [4]:
import xmltodict

with open('data/smoker/smokers_surrogate_train_all_version2.xml', 'r') as f:
    text = f.read()
    
xml = xmltodict.parse(text)

In [5]:
train_df = pd.DataFrame([(t['TEXT'],t['SMOKING']['@STATUS']) for t in xml['ROOT']['RECORD']], columns = ['text','label'])
train_df.label.value_counts()

UNKNOWN           252
NON-SMOKER         66
PAST SMOKER        36
CURRENT SMOKER     35
SMOKER              9
Name: label, dtype: int64

In [6]:
print (train_df[train_df.label=='PAST SMOKER']['text'].values[0][:100])

123547445
FIH
7111426
47933/f911
557344
11/19/1994 12:00:00 AM
Discharge Summary
Unsigned
DIS
Report


In [7]:
train_df.head()

Unnamed: 0,text,label
0,977146916\nHLGMC\n2878891\n022690\n01/27/1997 ...,CURRENT SMOKER
1,026738007\nCMC\n15319689\n3/25/1998 12:00:00 A...,CURRENT SMOKER
2,071962960\nBH\n4236518\n417454\n12/10/2001 12:...,CURRENT SMOKER
3,418520250\nNVH\n61562872\n3/11/1995 12:00:00 A...,CURRENT SMOKER
4,301443520\nCTMC\n49020928\n448922\n1/11/1990 1...,CURRENT SMOKER


In [8]:
test_df.head()

Unnamed: 0,text,label
0,156406283\nHLGMC\n7213645\n64723/51cy\n5/28/19...,PAST SMOKER
1,487197293\nPUOMC\n2466262\n408602\n1976045\n4/...,CURRENT SMOKER
2,176318078\nFIH\n4189699\n28872/d70h\n094756\n1...,CURRENT SMOKER
3,245096078\nFIH\n9174858\n12/February\n997359\n...,CURRENT SMOKER
4,237073320\nFIH\n9746390\n797120\n251605\n01/26...,CURRENT SMOKER


In [147]:
print (test_df.text[0][:1000])

156406283
HLGMC
7213645
64723/51cy
5/28/1993 12:00:00 AM
Discharge Summary
Unsigned
DIS
Report Status :
Unsigned
ADMISSION DATE :
5-28-93
DISCHARGE DATE :
6-4-93
HISTORY OF PRESENT ILLNESS :
The patient is a 58 year old right hand dominant white male with a long history of hypertension , changed his medications from Aldomet to Clonidine six weeks ago .
The patient has a history of adult onset diabetes mellitus , ankylosing spondylitis , status post myocardial infarction in '96 ( ? ) now with acute onset of left face and arm greater than leg hemiplegia and primary hemisensory loss on the left .
Briefly , he was talking to a friend at 5:30 p.m. the day prior to admission , when he had to grab his locker and sit down .
His voice became slurred and he had a mild central dull headache .
He was unable to move the left side of his body and felt numb on that side .
He was taken to Wayskemedcalltown Talmi and transferred to Heaonboburg Linpack Grant Medical Center with a computerized tomography

In [7]:
spark_train_df = spark.createDataFrame(train_df.append(test_df))

In [8]:
from pyspark.sql import functions as F

# create a monotonically increasing id 
spark_train_df = spark_train_df.withColumn("id", F.monotonically_increasing_id())


In [9]:
spark_train_df.show(1)

+--------------------+--------------+---+
|                text|         label| id|
+--------------------+--------------+---+
|977146916
HLGMC
2...|CURRENT SMOKER|  0|
+--------------------+--------------+---+
only showing top 1 row



In [24]:
spark_train_df.select('label').show(10)

+--------------+
|         label|
+--------------+
|CURRENT SMOKER|
|CURRENT SMOKER|
|CURRENT SMOKER|
|CURRENT SMOKER|
|CURRENT SMOKER|
|CURRENT SMOKER|
|CURRENT SMOKER|
|CURRENT SMOKER|
|CURRENT SMOKER|
|CURRENT SMOKER|
+--------------+
only showing top 10 rows



In [25]:

rules = '''
(no|non|not|never|negative)\W*(smoker|smoking|smoked|tobacco), xxx
denies\W*smoking, xxx
nonsmoker, xxx
(tobacco|smoke|smoking|nicotine)\W*(never|no), xxx
doesn\'t smoke, xxx
'''

with open('data/smoker/smoking_regex_rules.txt', 'w') as f:
    
    f.write(rules)


In [145]:
sparknlp_jsl.version()

'2.4.6'

In [11]:
entities = ['smoke', 'secondhand', 'thirdhand', 'pipes',
           'cigs', 'tobacco', 'cigarettes', 'cigar', 'cigars',
           'tobaco', 'cigarette', 'hookah', 'nutcrackers',
            'nicotine','nicotene', 'nicoderm', 'nictoine']

with open ('data/smoker/smoker_entities.txt', 'w') as f:
    for i in entities:
        f.write(i+'\n')


In [10]:
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')\
    .setCustomBounds(['\n'])

regex_matcher = RegexMatcher()\
    .setInputCols('sentence')\
    .setStrategy("MATCH_ALL")\
    .setOutputCol("nonsmoker_regex_matches")\
    .setExternalRules(path='data/smoker/smoking_regex_rules.txt', delimiter=',')
    
token = Tokenizer()\
    .setInputCols(['sentence'])\
    .setOutputCol('token')

entity_extractor = TextMatcher() \
    .setInputCols(["sentence",'token'])\
    .setOutputCol("smoker_entities")\
    .setEntities('data/smoker/smoker_entities.txt')

embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical', 'en', 'clinical/models')\
      .setInputCols(["sentence", "token"])\
      .setOutputCol("embeddings")

clinical_ner = NerDLModel.pretrained('ner_clinical', 'en', 'clinical/models')\
      .setInputCols(["sentence", "token", "embeddings"]) \
      .setOutputCol("clinical_ner")

clinical_converter = NerConverter()\
  .setInputCols(["sentence", "token", "clinical_ner"])\
  .setOutputCol("clinical_ner_chunk")

bionlp_model = NerDLModel.pretrained('ner_bionlp', 'en', 'clinical/models')\
     .setInputCols(["sentence", "token", "embeddings"])\
     .setOutputCol("bio_ner")

bionlp_converter = NerConverter()\
  .setInputCols(["sentence", "token", "bio_ner"])\
  .setOutputCol("bio_ner_chunk")
                        
posology_ner_model = NerDLModel.pretrained('ner_posology_large', 'en', 'clinical/models')\
     .setInputCols(["sentence", "token", "embeddings"])\
     .setOutputCol("posology_ner")
                                
posology_converter = NerConverter()\
  .setInputCols(["sentence", "token", "posology_ner"])\
  .setOutputCol("posology_ner_chunk")

risk_ner_model = NerDLModel.pretrained('ner_risk_factors', 'en', 'clinical/models')\
     .setInputCols(["sentence", "token", "embeddings"])\
     .setOutputCol("risk_ner")
                                
risk_converter = NerConverter()\
  .setInputCols(["sentence", "token", "risk_ner"])\
  .setOutputCol("risk_ner_chunk")\
.setWhiteList(['SMOKER'])

risk_assertion_dl = sparknlp_jsl.annotators.AssertionDLModel.pretrained('assertion_dl', 'en', 'clinical/models')\
  .setInputCols(["sentence", "risk_ner_chunk", "embeddings"])\
  .setOutputCol("assertion")

cell_ner_model = NerDLModel.pretrained('ner_cellular', 'en', 'clinical/models')\
     .setInputCols(["sentence", "token", "embeddings"])\
     .setOutputCol("cell_ner")
                                
cell_converter = NerConverter()\
  .setInputCols(["sentence", "token", "cell_ner"])\
  .setOutputCol("cell_ner_chunk")


ner_pipeline = Pipeline(
    stages = [
        document,
        sentence,
        regex_matcher,
        token,
        entity_extractor,
        embeddings,
        clinical_ner,
        clinical_converter,
        bionlp_model,
        bionlp_converter,
        posology_ner_model,
        posology_converter,
        risk_ner_model,
        risk_converter,
        risk_assertion_dl,
        cell_ner_model,
        cell_converter,
    ])

empty_data = spark.createDataFrame([[""]]).toDF("text")

model = ner_pipeline.fit(empty_data)

print ('Spark NLP pipeline is built')

ner_clinical download started this may take some time.
Approximate size to download 13.8 MB
[OK!]
ner_bionlp download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
ner_posology_large download started this may take some time.
Approximate size to download 13.8 MB
[OK!]
ner_risk_factors download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
ner_cellular download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
Spark NLP pipeline is built


In [11]:
lm = LightPipeline(model)

In [14]:
match_df = model.transform(spark_train_df)

In [None]:
lm.fullAnnotate(match_df.select('text').take(1)[0][0][:200])

In [None]:
lm.fullAnnotate('He is a nonsmoker. He quit cigar a year ago')

In [17]:
#ann_results = lm.fullAnnotate(list(text_df['text'])[0])

In [159]:
lm.annotate('He is a nonsmoker. He quit cigar a year ago')

{'posology_ner': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 'document': ['He is a nonsmoker. He quit cigar a year ago'],
 'bio_ner': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 'posology_ner_chunk': [],
 'assertion': ['associated_with_someone_else'],
 'bio_ner_chunk': [],
 'clinical_ner': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 'nonsmoker_regex_matches': ['nonsmoker', 'nonsmoker'],
 'token': ['He',
  'is',
  'a',
  'nonsmoker',
  '.',
  'He',
  'quit',
  'cigar',
  'a',
  'year',
  'ago'],
 'clinical_ner_chunk': [],
 'embeddings': ['He',
  'is',
  'a',
  'nonsmoker',
  '.',
  'He',
  'quit',
  'cigar',
  'a',
  'year',
  'ago'],
 'risk_ner': ['O',
  'O',
  'O',
  'B-SMOKER',
  'I-SMOKER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 'risk_ner_chunk': ['nonsmoker.'],
 'smoker_entities': ['cigar'],
 'cell_ner_chunk': [],
 'cell_ner': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 'sentence': ['He is a nonsmoker.', 'He quit cigar a 

In [None]:
match_df.select('text').take(1)

In [None]:
match_df.show(2)

In [166]:
match_df.select("smoker_entities.metadata").take(1)

[Row(metadata=[{'sentence': '0', 'chunk': '0'}])]

In [None]:
match_df.select('sentence').take(1)

In [92]:
match_df.select('id','label','assertion.result', 'nonsmoker_regex_matches.result', 'smoker_entities.result').show(3)

+---+--------------+------+------+-------+
| id|         label|result|result| result|
+---+--------------+------+------+-------+
|  0|CURRENT SMOKER|    []|    []|[cigar]|
|  1|CURRENT SMOKER|    []|    []|     []|
|  2|CURRENT SMOKER|    []|    []|     []|
+---+--------------+------+------+-------+
only showing top 3 rows



In [93]:
from pyspark.sql import functions as F 

pandas_df = match_df.select('id','label','nonsmoker_regex_matches','smoker_entities','assertion',
                F.explode(F.arrays_zip('clinical_ner_chunk.result',"clinical_ner_chunk.metadata",
                                                    'bio_ner_chunk.result',"bio_ner_chunk.metadata",
                                                    'posology_ner_chunk.result',"posology_ner_chunk.metadata",
                                                    'risk_ner_chunk.result',"risk_ner_chunk.metadata",
                                                    'cell_ner_chunk.result',"cell_ner_chunk.metadata",
                                      )).alias("cols")) \
    .select('id','label','nonsmoker_regex_matches.result','smoker_entities.result','assertion.result',
            F.expr("cols['0']").alias("clinical_token"),
            F.expr("cols['1'].entity").alias("clinical_entity"),
            F.expr("cols['2']").alias("bionlp_token"),
            F.expr("cols['3'].entity").alias("bionlp_entity"),
            F.expr("cols['4']").alias("posology_token"),
           F.expr("cols['5'].entity").alias("posology_entity"),
             F.expr("cols['6']").alias("risk_token"),
            F.expr("cols['7'].entity").alias("risk_entity"),
            F.expr("cols['8']").alias("cell_token"),
           F.expr("cols['9'].entity").alias("cell_entity")).toPandas()

In [95]:
pandas_df.head()

Unnamed: 0,id,label,result,result.1,result.2,clinical_token,clinical_entity,bionlp_token,bionlp_entity,posology_token,posology_entity,risk_token,risk_entity,cell_token,cell_entity
0,0,CURRENT SMOKER,[],[cigar],[],CARCINOMA,PROBLEM,HLGMC,Cell,ARF32 FA,DRUG,,,HLGMC,cell_line
1,0,CURRENT SMOKER,[],[cigar],[],Carcinoma of the colon,PROBLEM,ARF32,Gene_or_gene_product,GIRRESNET,DRUG,,,DIS,DNA
2,0,CURRENT SMOKER,[],[cigar],[],Urinary tract infection,PROBLEM,FA,Organism_substance,liter,DOSAGE,,,ARF32 FA,DNA
3,0,CURRENT SMOKER,[],[cigar],[],cirrhosis of the liver,PROBLEM,NAME,Simple_chemical,normal saline,DRUG,,,GIRRESNET,DNA
4,0,CURRENT SMOKER,[],[cigar],[],colon cancer,PROBLEM,DIEDREO A,Gene_or_gene_product,thiamine,DRUG,,,DIEDREO A,DNA


In [102]:

pandas_df.to_pickle('data/smoker_features_df.pickle')

In [50]:
import pandas as pd

pandas_df= pd.read_pickle('data/smoker_features_df.pickle')

In [97]:
pandas_df.columns

Index(['id', 'label', 'result', 'result', 'result', 'clinical_token',
       'clinical_entity', 'bionlp_token', 'bionlp_entity', 'posology_token',
       'posology_entity', 'risk_token', 'risk_entity', 'cell_token',
       'cell_entity'],
      dtype='object')

In [98]:
pandas_df.columns = ['id', 'label', 'nonsmoker_regex_matches', 'smoker_entities', 'assertion', 'clinical_token',
       'clinical_entity', 'bionlp_token', 'bionlp_entity', 'posology_token',
       'posology_entity', 'risk_token', 'risk_entity', 'cell_token',
       'cell_entity']

In [99]:
pandas_df.shape

(44504, 15)

In [100]:
pandas_df.head()

Unnamed: 0,id,label,nonsmoker_regex_matches,smoker_entities,assertion,clinical_token,clinical_entity,bionlp_token,bionlp_entity,posology_token,posology_entity,risk_token,risk_entity,cell_token,cell_entity
0,0,CURRENT SMOKER,[],[cigar],[],CARCINOMA,PROBLEM,HLGMC,Cell,ARF32 FA,DRUG,,,HLGMC,cell_line
1,0,CURRENT SMOKER,[],[cigar],[],Carcinoma of the colon,PROBLEM,ARF32,Gene_or_gene_product,GIRRESNET,DRUG,,,DIS,DNA
2,0,CURRENT SMOKER,[],[cigar],[],Urinary tract infection,PROBLEM,FA,Organism_substance,liter,DOSAGE,,,ARF32 FA,DNA
3,0,CURRENT SMOKER,[],[cigar],[],cirrhosis of the liver,PROBLEM,NAME,Simple_chemical,normal saline,DRUG,,,GIRRESNET,DNA
4,0,CURRENT SMOKER,[],[cigar],[],colon cancer,PROBLEM,DIEDREO A,Gene_or_gene_product,thiamine,DRUG,,,DIEDREO A,DNA


In [101]:
pandas_df.assertion.value_counts()

[]                                      34696
[present]                                7576
[absent]                                 1302
[present, present]                        518
[present, present, present, present]      180
[absent, absent]                          138
[associated_with_someone_else]             94
Name: assertion, dtype: int64

In [103]:
pandas_df.nonsmoker_regex_matches.apply(lambda x: 1 if len(x)>0 else 0).sum()

2192

In [104]:
pandas_df.nonsmoker_regex_matches.apply(lambda x: len(x)).sum()

3398

In [105]:
#assertion_scores = {'absent':}

from collections import Counter

def get_assertion_stats(ass):
    
    ass_list =[]
    
    for s in ass:
        
        ass_list.extend(s)
    
    x = dict(Counter(ass))
    
    
    return x
    
adf = pandas_df.assertion.apply(lambda x: get_assertion_stats(x)).value_counts().reset_index()

dic = {'present':0,
'absent':0,
'associated_with_someone_else':0}

for i,row in adf.iterrows():
    try:
        k = list(row['index'].keys())[0]
        dic[k]= dic[k]+row['index'][k]*row['assertion']
    except:
        pass
    
dic

{'present': 9332, 'absent': 1578, 'associated_with_someone_else': 94}

In [106]:
pandas_df.assertion.apply(lambda x: get_assertion_stats(x)).value_counts()

{}                                     34696
{'present': 1}                          7576
{'absent': 1}                           1302
{'present': 2}                           518
{'present': 4}                           180
{'absent': 2}                            138
{'associated_with_someone_else': 1}       94
Name: assertion, dtype: int64

In [None]:
#{k: x.get(k, 0) + y.get(k, 0) for k in set(x) | set(y)}

In [107]:
pandas_df.nonsmoker_regex_matches = pandas_df.nonsmoker_regex_matches.apply(lambda x: len(x))
pandas_df.smoker_entities = pandas_df.smoker_entities.apply(lambda x: len(x))

In [108]:
pandas_df.smoker_entities.value_counts()

0    30556
1    12041
2     1522
3      205
7      180
Name: smoker_entities, dtype: int64

In [109]:
pandas_df.nonsmoker_regex_matches.value_counts()

0    42312
2     1206
1      986
Name: nonsmoker_regex_matches, dtype: int64

In [110]:
#clinical_entity = list(pandas_df.clinical_entity.unique())
#bionlp_entity = list(pandas_df.bionlp_entity.unique())
#posology_entity = list(pandas_df.posology_entity.unique())
#cell_entity = list(pandas_df.cell_entity.unique())
#risk_entity = list(pandas_df.risk_entity.unique())

pids = pandas_df['id'].unique()

parsed_dict = {}

#ents = {k:0 for k in clinical_entity+bionlp_entity+posology_entity}


xx=[]
for i in pids:

    temp_dict = pandas_df[pandas_df.id==i]['clinical_entity'].value_counts().to_dict()

    temp_dict.update(pandas_df[pandas_df.id==i]['bionlp_entity'].value_counts().to_dict())

    temp_dict.update(pandas_df[pandas_df.id==i]['posology_entity'].value_counts().to_dict())
    
    temp_dict.update(pandas_df[pandas_df.id==i]['cell_entity'].value_counts().to_dict())

    temp_dict.update(pandas_df[pandas_df.id==i]['risk_entity'].value_counts().to_dict())
    
    #temp_dict.update(pandas_df[pandas_df.id==i]['assertion'].apply(lambda x: get_assertion_stats(x)).value_counts())
    
    adf = pandas_df[pandas_df.id==i]['assertion'].apply(lambda x: get_assertion_stats(x)).value_counts().reset_index()

    dic = {'present':0,
    'absent':0,
    'associated_with_someone_else':0}

    for j,row in adf.iterrows():
        try:
            k = list(row['index'].keys())[0]
            dic[k]= dic[k]+row['index'][k]*row['assertion']
        except:
            pass
    
    
    temp_dict.update(dic)

    temp_dict['smoker_entities'] = pandas_df[pandas_df.id==i]['smoker_entities'].sum()
    
    temp_dict['nonsmoker_regex_matches'] = pandas_df[pandas_df.id==i]['nonsmoker_regex_matches'].sum()
        
    temp_dict['id']=i

    xx.append(temp_dict)

stats_df = pd.DataFrame(xx)

stats_df.columns = ['entity_{}'.format(c) for c in stats_df.columns]

stats_df = stats_df.rename(columns={'entity_pid':'pid'})


In [111]:
stats_df.entity_associated_with_someone_else.sum()

94

In [112]:
stats_df.head()

Unnamed: 0,entity_PROBLEM,entity_TEST,entity_TREATMENT,entity_Simple_chemical,entity_Gene_or_gene_product,entity_Organ,entity_Organism,entity_Organism_subdivision,entity_Cell,entity_Organism_substance,...,entity_DURATION,entity_FORM,entity_Tissue,entity_Cellular_component,entity_Immaterial_anatomical_entity,entity_SMOKER,entity_Anatomical_system,entity_RNA,entity_Amino_acid,entity_Developing_anatomical_structure
0,38,29.0,11.0,15.0,13.0,13.0,8.0,7.0,6.0,3.0,...,,,,,,,,,,
1,51,20.0,39.0,21.0,31.0,5.0,15.0,4.0,4.0,1.0,...,3.0,,,,,,,,,
2,37,142.0,43.0,31.0,87.0,,1.0,,9.0,,...,,7.0,,,,,,,,
3,84,56.0,56.0,38.0,6.0,15.0,14.0,5.0,3.0,3.0,...,1.0,1.0,1.0,,,,,,,
4,31,9.0,30.0,10.0,7.0,2.0,18.0,,1.0,1.0,...,,1.0,2.0,1.0,,,,,,


In [113]:
stats_df['entity_SMOKER'].sum()

86.0

In [114]:
stats_df.entity_smoker_entities.sum()

16960

In [115]:
stats_df.columns

Index(['entity_PROBLEM', 'entity_TEST', 'entity_TREATMENT',
       'entity_Simple_chemical', 'entity_Gene_or_gene_product', 'entity_Organ',
       'entity_Organism', 'entity_Organism_subdivision', 'entity_Cell',
       'entity_Organism_substance', 'entity_Cancer',
       'entity_Pathological_formation', 'entity_Multi-tissue_structure',
       'entity_DRUG', 'entity_DOSAGE', 'entity_DNA', 'entity_protein',
       'entity_cell_type', 'entity_cell_line', 'entity_present',
       'entity_absent', 'entity_associated_with_someone_else',
       'entity_smoker_entities', 'entity_nonsmoker_regex_matches', 'entity_id',
       'entity_STRENGTH', 'entity_FREQUENCY', 'entity_ROUTE',
       'entity_DURATION', 'entity_FORM', 'entity_Tissue',
       'entity_Cellular_component', 'entity_Immaterial_anatomical_entity',
       'entity_SMOKER', 'entity_Anatomical_system', 'entity_RNA',
       'entity_Amino_acid', 'entity_Developing_anatomical_structure'],
      dtype='object')

In [116]:
stats_df.shape

(502, 38)

In [117]:
pandas_df[['id','label']].drop_duplicates()

Unnamed: 0,id,label
0,0,CURRENT SMOKER
78,1,CURRENT SMOKER
188,2,CURRENT SMOKER
410,3,CURRENT SMOKER
606,4,CURRENT SMOKER
...,...,...
44359,60129542207,UNKNOWN
44367,60129542208,UNKNOWN
44402,60129542209,UNKNOWN
44453,60129542210,UNKNOWN


In [118]:
stats_df.entity_id.value_counts()

17179869242    1
42949673017    1
34359738409    1
42949672984    1
34359738373    1
              ..
17179869202    1
34359738370    1
17179869231    1
51539607567    1
0              1
Name: entity_id, Length: 502, dtype: int64

In [119]:
model_df = pandas_df[['id','label']].drop_duplicates().merge(stats_df, left_on='id', right_on='entity_id').fillna(0)

In [120]:
model_df.head()

Unnamed: 0,id,label,entity_PROBLEM,entity_TEST,entity_TREATMENT,entity_Simple_chemical,entity_Gene_or_gene_product,entity_Organ,entity_Organism,entity_Organism_subdivision,...,entity_DURATION,entity_FORM,entity_Tissue,entity_Cellular_component,entity_Immaterial_anatomical_entity,entity_SMOKER,entity_Anatomical_system,entity_RNA,entity_Amino_acid,entity_Developing_anatomical_structure
0,0,CURRENT SMOKER,38,29.0,11.0,15.0,13.0,13.0,8.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,CURRENT SMOKER,51,20.0,39.0,21.0,31.0,5.0,15.0,4.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,CURRENT SMOKER,37,142.0,43.0,31.0,87.0,0.0,1.0,0.0,...,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,CURRENT SMOKER,84,56.0,56.0,38.0,6.0,15.0,14.0,5.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,CURRENT SMOKER,31,9.0,30.0,10.0,7.0,2.0,18.0,0.0,...,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [121]:
from sklearn.model_selection import train_test_split

X=model_df.drop(['label','entity_id', 'id'], axis=1)  # Features
y=model_df['label']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 

In [122]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train, y_train)

y_pred=clf.predict(X_test)

In [123]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7722772277227723


In [124]:
print(metrics.classification_report(y_test, y_pred))

                precision    recall  f1-score   support

CURRENT SMOKER       0.50      0.09      0.15        11
    NON-SMOKER       0.70      0.89      0.78        18
   PAST SMOKER       0.25      0.12      0.17         8
        SMOKER       0.00      0.00      0.00         4
       UNKNOWN       0.83      1.00      0.91        60

      accuracy                           0.77       101
     macro avg       0.46      0.42      0.40       101
  weighted avg       0.69      0.77      0.71       101



  'precision', 'predicted', average, warn_for)


In [125]:
import pandas as pd
feature_imp = pd.Series(clf.feature_importances_,index=X.columns).sort_values(ascending=False)
feature_imp

entity_smoker_entities                    0.151526
entity_present                            0.062117
entity_SMOKER                             0.052913
entity_Organism                           0.050223
entity_TEST                               0.043517
entity_PROBLEM                            0.043249
entity_TREATMENT                          0.039590
entity_Multi-tissue_structure             0.037426
entity_Organism_subdivision               0.036004
entity_Organ                              0.035646
entity_Simple_chemical                    0.035174
entity_DRUG                               0.034169
entity_Gene_or_gene_product               0.030230
entity_Cancer                             0.027798
entity_protein                            0.025667
entity_ROUTE                              0.024937
entity_DNA                                0.024601
entity_FREQUENCY                          0.024025
entity_Cell                               0.023520
entity_STRENGTH                

In [126]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X_train, y_train)

y_pred=clf.predict(X_test)



In [127]:
print(metrics.classification_report(y_test, y_pred))

                precision    recall  f1-score   support

CURRENT SMOKER       0.25      0.09      0.13        11
    NON-SMOKER       0.70      0.78      0.74        18
   PAST SMOKER       0.40      0.25      0.31         8
        SMOKER       0.00      0.00      0.00         4
       UNKNOWN       0.83      0.98      0.90        60

      accuracy                           0.75       101
     macro avg       0.44      0.42      0.42       101
  weighted avg       0.68      0.75      0.71       101



## Creating TfIdf features and appending to scalar features

In [128]:
train_df_text = spark_train_df.toPandas()

In [129]:
model_df_text = train_df_text.merge(model_df.drop(['label','entity_id'], axis=1))

In [130]:
import string

In [131]:
model_df_text['text'] = model_df_text['text'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]).lower().replace('\n',' '))

In [132]:
model_df_text['text'].head()

0     hlgmc   // :: am carcinoma of the colon . uns...
1     cmc  // :: am mediastinal adenopathy rt. lung...
2     bh   // :: am discharge summary unsigned dis ...
3     nvh  // :: am discharge summary signed dis ad...
4     ctmc   // :: am discharge summary unsigned di...
Name: text, dtype: object

In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer
        
vect = TfidfVectorizer(max_features=1000,  min_df= 5, norm='l2', ngram_range=(1, 3), stop_words='english')

tfidf_matrix = vect.fit_transform(model_df_text['text'])

df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())

X = pd.concat([df, model_df_text], axis=1)

print ('train with tfidf:', X.shape)

train with tfidf: (502, 1040)


In [72]:
X.columns

Index(['abdomen', 'abdomen soft', 'abdomen soft nontender', 'abdominal',
       'abdominal pain', 'able', 'abnormalities', 'active', 'activity',
       'acute',
       ...
       'entity_DURATION', 'entity_FORM', 'entity_Tissue',
       'entity_Cellular_component', 'entity_Immaterial_anatomical_entity',
       'entity_SMOKER', 'entity_Anatomical_system', 'entity_RNA',
       'entity_Amino_acid', 'entity_Developing_anatomical_structure'],
      dtype='object', length=1040)

In [134]:
X.head()

Unnamed: 0,abdomen,abdomen soft,abdomen soft nontender,abdominal,abdominal pain,able,abnormalities,active,activity,acute,...,entity_DURATION,entity_FORM,entity_Tissue,entity_Cellular_component,entity_Immaterial_anatomical_entity,entity_SMOKER,entity_Anatomical_system,entity_RNA,entity_Amino_acid,entity_Developing_anatomical_structure
0,0.0,0.0,0.0,0.086369,0.0,0.0,0.0,0.0,0.0,0.043185,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.026001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.015077,0.021557,0.027824,0.0,0.0,0.0,0.0,0.058103,0.023205,0.0,...,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.015271,0.0,0.0,0.061268,0.0,0.0,0.0,0.0,0.0,0.020423,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
X['text'].head()

0     hlgmc   // :: am carcinoma of the colon . uns...
1     cmc  // :: am mediastinal adenopathy rt. lung...
2     bh   // :: am discharge summary unsigned dis ...
3     nvh  // :: am discharge summary signed dis ad...
4     ctmc   // :: am discharge summary unsigned di...
Name: text, dtype: object

In [135]:
X_train, X_test, y_train, y_test = train_test_split(X.drop(['text','id', 'label'], axis=1), X.label, test_size=0.2) 

In [136]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=1000)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train, y_train)

y_pred=clf.predict(X_test)

In [137]:
print(metrics.classification_report(y_test, y_pred))

                precision    recall  f1-score   support

CURRENT SMOKER       0.50      0.14      0.22         7
    NON-SMOKER       0.48      0.85      0.61        13
   PAST SMOKER       1.00      0.08      0.14        13
        SMOKER       0.00      0.00      0.00         3
       UNKNOWN       0.85      0.98      0.91        65

      accuracy                           0.76       101
     macro avg       0.57      0.41      0.38       101
  weighted avg       0.77      0.76      0.70       101



  'precision', 'predicted', average, warn_for)


In [114]:
print(metrics.classification_report(y_test, y_pred))

                precision    recall  f1-score   support

CURRENT SMOKER       1.00      0.12      0.22         8
    NON-SMOKER       0.63      0.92      0.75        13
   PAST SMOKER       0.00      0.00      0.00        10
        SMOKER       0.00      0.00      0.00         2
       UNKNOWN       0.85      1.00      0.92        68

      accuracy                           0.80       101
     macro avg       0.50      0.41      0.38       101
  weighted avg       0.73      0.80      0.73       101



  'precision', 'predicted', average, warn_for)


In [138]:
import pandas as pd
feature_imp = pd.Series(clf.feature_importances_,index=X.drop(['text','id', 'label'], axis=1).columns).sort_values(ascending=False)
feature_imp

entity_smoker_entities                    0.030146
social                                    0.025326
smoking                                   0.022264
social history                            0.021354
alcohol                                   0.019452
                                            ...   
entity_Developing_anatomical_structure    0.000020
entity_associated_with_someone_else       0.000019
arrived                                   0.000016
milligrams                                0.000008
entity_Amino_acid                         0.000000
Length: 1037, dtype: float64

## with feature selection (select best K)

In [139]:
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(f_classif, k = 20)

sub_X = selector.fit_transform(X.drop(['text','id', 'label'], axis=1), X.label)

In [140]:
sub_X

array([[4.96999365e-02, 1.93638896e-02, 5.00723305e-02, ...,
        0.00000000e+00, 7.80000000e+01, 0.00000000e+00],
       [4.00174746e-02, 1.55914477e-02, 8.06346387e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.32051088e-02, 9.04108128e-03, 2.33789811e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [5.02260961e-02, 1.95688898e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 4.01662551e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.02660089e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [141]:
X_train, X_test, y_train, y_test = train_test_split(sub_X, X.label, test_size=0.2) 

In [142]:
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train, y_train)

y_pred=clf.predict(X_test)

In [143]:
print(metrics.classification_report(y_test, y_pred))

                precision    recall  f1-score   support

CURRENT SMOKER       0.80      0.57      0.67         7
    NON-SMOKER       0.65      0.81      0.72        16
   PAST SMOKER       0.00      0.00      0.00         5
        SMOKER       0.00      0.00      0.00         2
       UNKNOWN       0.96      0.97      0.97        71

      accuracy                           0.85       101
     macro avg       0.48      0.47      0.47       101
  weighted avg       0.83      0.85      0.84       101



  'precision', 'predicted', average, warn_for)


In [139]:
print(metrics.classification_report(y_test, y_pred))

                precision    recall  f1-score   support

CURRENT SMOKER       0.75      0.50      0.60         6
    NON-SMOKER       0.57      0.92      0.71        13
   PAST SMOKER       0.33      0.14      0.20         7
        SMOKER       0.00      0.00      0.00         2
       UNKNOWN       1.00      1.00      1.00        52

      accuracy                           0.85        80
     macro avg       0.53      0.51      0.50        80
  weighted avg       0.83      0.85      0.83        80



  'precision', 'predicted', average, warn_for)


In [145]:
print(metrics.classification_report(y_test, y_pred))

                precision    recall  f1-score   support

CURRENT SMOKER       0.57      0.50      0.53         8
    NON-SMOKER       0.73      0.89      0.80        18
   PAST SMOKER       1.00      0.11      0.20         9
        SMOKER       0.00      0.00      0.00         1
       UNKNOWN       0.93      0.97      0.95        65

      accuracy                           0.83       101
     macro avg       0.65      0.49      0.50       101
  weighted avg       0.86      0.83      0.81       101



In [83]:
model_df_text.columns

Index(['text', 'label', 'id', 'entity_PROBLEM', 'entity_TEST',
       'entity_TREATMENT', 'entity_Simple_chemical',
       'entity_Gene_or_gene_product', 'entity_Organ', 'entity_Organism',
       'entity_Organism_subdivision', 'entity_Cell',
       'entity_Organism_substance', 'entity_Cancer',
       'entity_Pathological_formation', 'entity_Multi-tissue_structure',
       'entity_DRUG', 'entity_DOSAGE', 'entity_DNA', 'entity_protein',
       'entity_cell_type', 'entity_cell_line', 'entity_present',
       'entity_absent', 'entity_associated_with_someone_else',
       'entity_smoker_entities', 'entity_nonsmoker_regex_matches',
       'entity_STRENGTH', 'entity_FREQUENCY', 'entity_ROUTE',
       'entity_DURATION', 'entity_FORM', 'entity_Tissue',
       'entity_Cellular_component', 'entity_Immaterial_anatomical_entity',
       'entity_SMOKER', 'entity_Anatomical_system', 'entity_RNA',
       'entity_Amino_acid', 'entity_Developing_anatomical_structure'],
      dtype='object')

## Using smoke-related sentences for TfIDF

In [None]:
text_df = match_df.select('id','text').toPandas()

In [160]:
text_df.head()

Unnamed: 0,id,text
0,0,977146916\nHLGMC\n2878891\n022690\n01/27/1997 ...
1,1,026738007\nCMC\n15319689\n3/25/1998 12:00:00 A...
2,2,071962960\nBH\n4236518\n417454\n12/10/2001 12:...
3,3,418520250\nNVH\n61562872\n3/11/1995 12:00:00 A...
4,4,301443520\nCTMC\n49020928\n448922\n1/11/1990 1...


In [None]:
sent_dic={}

for m, row in text_df.iterrows():
    
    ann = lm.fullAnnotate(row['text'])

    for i in ann:

        sent_ids = [str(j.metadata['sentence']) for j in i['risk_ner_chunk']]

        sent_ids.extend([str(j.metadata['sentence']) for j in i['nonsmoker_regex_matches']])

        sent_ids.extend([str(j.metadata['sentence']) for j in i['smoker_entities']])

        sentences = [j.result for j in i['sentence'] if j.metadata['sentence'] in sent_ids]

        print (sentences)
        
        sent_dic[str(row['id'])] = sentences
    
    print (m)

In [144]:
model_df_text['smoke_sents'] = model_df_text['id'].apply(lambda x: ' '.join(sent_dic[str(x)]))

In [147]:
model_df_text.to_pickle('data/smoker_model_withText.pickle')

In [145]:
model_df_text.head()

Unnamed: 0,text,label,id,entity_PROBLEM,entity_TEST,entity_TREATMENT,entity_Simple_chemical,entity_Gene_or_gene_product,entity_Organ,entity_Organism,...,entity_FORM,entity_Tissue,entity_Cellular_component,entity_Immaterial_anatomical_entity,entity_SMOKER,entity_Anatomical_system,entity_RNA,entity_Amino_acid,entity_Developing_anatomical_structure,smoke_sents
0,hlgmc // :: am carcinoma of the colon . uns...,CURRENT SMOKER,0,38,29.0,11.0,15.0,13.0,13.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Drinks ginger brandy to excess , pipe and ciga..."
1,cmc // :: am mediastinal adenopathy rt. lung...,CURRENT SMOKER,1,51,20.0,39.0,21.0,31.0,5.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,bh // :: am discharge summary unsigned dis ...,CURRENT SMOKER,2,37,142.0,43.0,31.0,87.0,0.0,1.0,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,nvh // :: am discharge summary signed dis ad...,CURRENT SMOKER,3,84,56.0,56.0,38.0,6.0,15.0,14.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,ctmc // :: am discharge summary unsigned di...,CURRENT SMOKER,4,31,9.0,30.0,10.0,7.0,2.0,18.0,...,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [146]:
model_df_text['smoke_sents']

0      Drinks ginger brandy to excess , pipe and ciga...
1                                                       
2                                                       
3                                                       
4                                                       
                             ...                        
497                                                     
498                                                     
499                                                     
500                                                     
501                                                     
Name: smoke_sents, Length: 502, dtype: object

In [155]:
from sklearn.feature_extraction.text import TfidfVectorizer
        
vect = TfidfVectorizer(max_features=100,  min_df= 5, norm='l2', ngram_range=(1, 3), stop_words=None)

tfidf_matrix = vect.fit_transform(model_df_text['smoke_sents'])

df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())

X = pd.concat([df, model_df_text], axis=1)

print ('train with tfidf:', X.shape)

train with tfidf: (502, 141)


In [156]:
X_train, X_test, y_train, y_test = train_test_split(X.drop(['text','id', 'label','smoke_sents'], axis=1), X.label, test_size=0.2) 

In [157]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train, y_train)

y_pred=clf.predict(X_test)

In [158]:
print(metrics.classification_report(y_test, y_pred))

                precision    recall  f1-score   support

CURRENT SMOKER       0.40      0.17      0.24        12
    NON-SMOKER       0.81      0.94      0.87        18
   PAST SMOKER       1.00      0.18      0.31        11
        SMOKER       0.00      0.00      0.00         3
       UNKNOWN       0.78      1.00      0.88        57

      accuracy                           0.77       101
     macro avg       0.60      0.46      0.46       101
  weighted avg       0.74      0.77      0.71       101



  'precision', 'predicted', average, warn_for)
