In [195]:
import numpy as np
import pandas as pd
import nltk
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text  import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

In [196]:
os.chdir('/Users/klarow/Desktop/ComputationalLabs/Project/Data')

In [197]:
#pd.set_option('display.max_colwidth', -1)
#pd.set_option('display.max_colwidth', 10)

##  Combine 3 datasets - does not need to be run again

In [198]:
data = pd.read_csv("step3processed.csv")

In [199]:
rawdata = data[:]

##  Cosine Similarity

In [200]:
#Can set parameters for how Tfidf is calculated, update to have a token as 1+ alphanumeric factor and not >1
#this way single character words (like I, F for female etc. are included)

def getCos(df,passed_dict):
    notelist = list(df.TEXT)
    rowidlist = list((df.ROW_ID))
    if len(notelist) <=1:
        passed_dict[rowidlist[0]] = "Only 1 note"
        return(passed_dict)
    tfidf = TfidfVectorizer(analyzer ="word", token_pattern='(?u)\\b\\w+\\b')
    tfs = tfidf.fit_transform(notelist)
    feature_names = tfidf.get_feature_names()
    
#loop through each note, calculate cosine similarity against it against all others (including self)
#to get average, make sure to subtact comparison to itself (1) in numerator (cosine) and denominator (number of notes)
#since first using tfidf (produces normalized vectors) can  use linear_kernal (faster) instead of cosine_similarity
# did test below as well to make sure same results
    counter = 0
    for note in tfs:
        row_id = rowidlist[counter]
        cosine_similarities = linear_kernel(note, tfs).flatten()
        avgcos = (cosine_similarities.sum()-1)/(len(cosine_similarities)-1)
        passed_dict[row_id] = avgcos
        counter +=1
    return(passed_dict)

In [201]:
#loop through each patient one at a time
#loop through patient's category types
# caluclate cosine similarity for patient/category subset of notes and add to categ dict by the rowid
#then loop through all the descriptions within category and add to desc dict by rowid
cosine_desc_dict = {}
cosine_categ_dict = {}
count = 0

for subj in data.SUBJECT_ID.unique():
    if count%100000 == 0:
            print(count)
    pat_df = data[data.SUBJECT_ID == subj]
    for cat in pat_df.CATEGORY.unique():
        cat_df = pat_df[pat_df.CATEGORY == cat]
        getCos(cat_df,cosine_categ_dict)
            
        for desc in cat_df.DESCRIPTION.unique():
            desc_df = cat_df[cat_df.DESCRIPTION == desc]
            getCos(desc_df,cosine_desc_dict)
            count+=1


0


In [202]:
#append the description cosine similarity for each row
data["CosDesc"] = data.ROW_ID.map(cosine_desc_dict)
#[cosine_desc_dict[x] for x in data.ROW_ID]

In [203]:
#append the category cosine similarity for each row
data["CosCat"] = data.ROW_ID.map(cosine_categ_dict)
#[cosine_categ_dict[x] for x in data.ROW_ID]

In [204]:
#data.to_csv("/Users/klarow/Desktop/ComputationalLabs/Project/Data/cosine_clean.csv",index = False)

## Test Train and Merge with HADM-ID

In [205]:
addmissions_data = pd.read_csv("ADMISSIONS.csv")

In [206]:
addmissions_data.ADMISSION_LOCATION.unique()

array(['EMERGENCY ROOM ADMIT', 'PHYS REFERRAL/NORMAL DELI',
       'TRANSFER FROM HOSP/EXTRAM', 'CLINIC REFERRAL/PREMATURE',
       'TRANSFER FROM SKILLED NUR', 'HMO REFERRAL/SICK',
       'TRANSFER FROM OTHER HEALT', '** INFO NOT AVAILABLE **',
       'TRSF WITHIN THIS FACILITY'], dtype=object)

In [207]:
#subset on columns we want to include in analysis (ie admission type, adm location, insurannce, lang, ethnicity)
to_add_hadm = addmissions_data[[2,6,7,8,9,10,11,13]]

In [208]:
to_add_hadm.head()

Unnamed: 0,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,ETHNICITY
0,165315,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,WHITE
1,152223,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,WHITE
2,124321,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,WHITE
3,161859,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,WHITE
4,129635,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,WHITE


In [209]:
#Assume that missing values are english
to_add_hadm.LANGUAGE.fillna(value="ENGL",inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [210]:
#number of instances with no HADMID
len(data[data.HADM_ID.isnull()==True])

81

In [211]:
#remove these instances with no hadmid, since so many variables are dependent on matching on it
data = data[data.HADM_ID.isnull()==False]

In [212]:
dfwith_admin = pd.merge(data,to_add_hadm,on = "HADM_ID")

In [213]:
data = dfwith_admin

##  Partition Dates by Hours

In [214]:
#get times from date time
ctime1 = data.CHARTTIME.str.split(" ",expand = True)

In [215]:
#breakup hour, minutes, seconds (in military time)
ctime2 = ctime1[1].str.split(":",expand = True)

In [216]:
#take the hour as the category
ctime3 = ctime2[0]

In [217]:
#create new column with the chart time as category (24 hrs)
data["CHARTTIMECAT"] = ctime3

In [218]:
#repeat steps for store time
stime = data.STORETIME.str.split(" ",expand = True)

In [219]:
stime2 = stime[1].str.split(":", expand = True)

In [220]:
#stime2[0].unique()

In [221]:
stime3 = stime2[0]

In [222]:
data["STORETIMECAT"] = ctime3

In [223]:
data.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,CGID,ROW_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,ISERROR,...,CosCat,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,ETHNICITY,CHARTTIMECAT,STORETIMECAT
0,136468.0,28742,,4417,8/1/43,,,discharg summari,report,,...,0.481639,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,WHITE,,
1,136468.0,28742,,130306,7/27/43,,,ecg,report,,...,0.243769,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,WHITE,,
2,136468.0,28742,,130307,7/27/43,,,ecg,report,,...,0.132512,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,WHITE,,
3,136468.0,28742,,972636,7/27/43,7/27/43 18:38,,radiolog,chest portabl ap,,...,0.121509,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,WHITE,18.0,18.0
4,136468.0,28742,14411.0,1636259,7/29/43,7/29/43 2:56,7/29/43 3:08,nursingoth,report,,...,0.23247,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,WHITE,2.0,2.0


##  Partition  Test/Train  and set all blank iserror to 0

In [224]:
data.ISERROR.fillna(value = 0,inplace=True)

In [225]:
errortemp = data[data.ISERROR == 1]
errortemp["Train"] = np.random.normal(0,1,len(errortemp),) <= .80
#errortemp.Train
correcttemp = data[data.ISERROR != 1]
correcttemp["Train"] = np.random.normal(0,1,len(correcttemp),) <= .80
#correcttemp.Train
#mr (model ready) dataframe
data = pd.concat([correcttemp,errortemp], axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


##  Reduce # Language Categories, combine all those with < 100 notes with subject speaking it

In [226]:
data["OldLanguage"] = data["LANGUAGE"]

In [227]:
#data.head()

In [228]:
#get counts of # people who speak each language
language_Df = pd.DataFrame(data.LANGUAGE.value_counts()).reset_index()

In [229]:
#create dictionary to collapse languages with fewer than 100 people speaking it into an other category
lang_dict = {}
for each in language_Df[language_Df.LANGUAGE < 100]["index"]:
    lang_dict[each] = "Other"

In [230]:
#keep other languages as is
for each in language_Df[language_Df.LANGUAGE >= 100]["index"]:
    lang_dict[each] = each

In [231]:
#assign other to those languages with fewer
data["LANGUAGE"] = data.LANGUAGE.map(lang_dict)


In [232]:
#columns to drop, not needed for modelling
to_drp = ["CHARTDATE","CHARTTIME","STORETIME","TIMECHARTED","TIMESTORED"]

In [233]:
#drop
data_clean = data.drop(to_drp, axis=1, level=None, inplace=False, errors='raise')

In [234]:
#output modelread file
data_clean.to_csv("/Users/klarow/Desktop/ComputationalLabs/Project/Data/modelready.csv",index = False)

## Test Code

In [235]:
# cosine_similarities = linear_kernel(tfs[1:2], tfs).flatten()
# avgcos = (cosine_similarities.sum()-1)/(len(cosine_similarities)-1)
# cosine_similarities
# avgcos