In [138]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn import tree
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
import time
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from nltk.tokenize import RegexpTokenizer

from nltk.sentiment import SentimentIntensityAnalyzer


In [2]:
X = pd.read_csv('train/train_radiology.csv')

In [16]:
X.head()

Unnamed: 0,patient_id,charttime,note_type,note_seq,text
0,p100001,2174/5/27 7:21,RR,21,"AP CHEST, 7:27 A.M., ___. HISTORY: ___ man w..."
1,p100001,2174/5/26 18:18,RR,22,INDICATION: ___ year old man with pneumonia a...
2,p100002,2176/11/25 22:13,RR,14,"EXAMINATION: CTA CHEST WANDW/O CANDRECONS, NO..."
3,p100004,2154/4/24 13:53,RR,20,EXAMINATION: Left vertebral artery angiogram....
4,p100004,2154/4/24 22:00,RR,21,EXAMINATION: CTA HEAD AND CTA NECK Q16 CT IN...


In [89]:
def get_words(df, stemmer=None, use_stemmer=False):
    all_sentences = ' '.join(df)
    tokenizer = RegexpTokenizer(r'\w+')
    
    all_words = tokenizer.tokenize(all_sentences)
    uniq = np.unique(all_words)

    if use_stemmer:
        st = np.vectorize(stemmer.stem)
        stemmed = st(uniq)
        return np.unique(stemmed)

    return uniq

In [90]:
words = get_words(X['text'])

In [91]:
ps = PorterStemmer()
stemmed_words = get_words(X['text'], stemmer=ps, use_stemmer=True)

In [92]:
words.shape

(25363,)

In [93]:
stemmed_words.shape

(14890,)

In [94]:
# joins the notes based on patient id, new df has one row per patient with all their notes
aggregation_functions =  {'text': lambda x: ' '.join(x)}
grouped_words = X.groupby(X['patient_id']).aggregate(aggregation_functions)

In [95]:
grouped_words

Unnamed: 0_level_0,text
patient_id,Unnamed: 1_level_1
p100001,"AP CHEST, 7:27 A.M., ___. HISTORY: ___ man w..."
p100002,"EXAMINATION: CTA CHEST WANDW/O CANDRECONS, NO..."
p100004,EXAMINATION: Left vertebral artery angiogram....
p100006,"INDICATION: ___ man with a history of COPD, p..."
p100007,"PORTABLE CHEST, ___ COMPARISON: Study of ear..."
...,...
p117129,"AP CHEST, 6:15 P.M. ___ HISTORY: Acute panc..."
p117130,EXAMINATION: Portable chest radiograph INDIC...
p117131,EXAMINATION: CHEST (PORTABLE AP) INDICATION:...
p117132,EXAMINATION: CHEST (PORTABLE AP) INDICATION:...


In [122]:
def create_features(df, words, stemmer=None, use_stemmer=False):
    cp = df.copy()
    cp.index = np.arange(0, len(cp))
    cp = pd.concat([
        cp,
        pd.DataFrame(np.zeros((cp.shape[0], words.shape[0])), columns=words)
    ], axis=1)
    

    def count_row(row):
        all_words = nltk.word_tokenize(row['__REVIEW__'])
        if use_stemmer:
            st = np.vectorize(stemmer.stem)
            all_words = st(all_words)
        for word in all_words:
            if word in row:
                row[word] += 1
        return row

    cp = cp.apply(lambda row: count_row(row), axis=1)
    return cp

In [96]:
tokenizer = RegexpTokenizer(r'\w+')

In [159]:
res = grouped_words.apply(lambda row: tokenizer.tokenize(row['text']), axis=1)

In [135]:
vectors = pd.DataFrame(index=grouped_words.index, columns=words).fillna(0)

  vectors = pd.DataFrame(index=grouped_words.index, columns=words).fillna(0)


In [136]:
vectors

Unnamed: 0_level_0,0,00,000,0009,001,0010,0014,0016,0018,002,...,zonal,zone,zones,zoster,zosyn,zygoma,zygomatic,zygomatico,zygomaticofrontal,zygomaticomaxillary
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p100001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p100002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p100004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p100006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p100007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
p117129,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p117130,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p117131,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p117132,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [162]:
pd.DataFrame(res.explode()[:100]).reset_index().apply(lambda x: , axis=1)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


0     None
1     None
2     None
3     None
4     None
      ... 
95    None
96    None
97    None
98    None
99    None
Length: 100, dtype: object

0

In [68]:
res = pd.DataFrame(res).reset_index()

In [72]:
res['index'] = res['index'].str.lower()

In [77]:
res = res.set_index('index')

In [78]:
no_stop = res.drop(stopwords.words('english'), errors='ignore', axis=0)

In [79]:
no_stop

Unnamed: 0_level_0,count
index,Unnamed: 1_level_1
___,51721
right,47033
left,39141
impression:,22553
indication:,20001
...,...
"alimentation,",1
"pallidii,",1
03:01,1
vertebrobasilar,1


In [164]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.2-cp310-cp310-macosx_10_9_x86_64.whl (24.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting smart-open>=1.8.1
  Downloading smart_open-7.0.3-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting wrapt
  Downloading wrapt-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl (37 kB)
Installing collected packages: wrapt, smart-open, gensim
Successfully installed gensim-4.3.2 smart-open-7.0.3 wrapt-1.16.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [165]:
from gensim.models import Word2Vec
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

In [173]:
grouped_words.itter

Unnamed: 0_level_0,text
patient_id,Unnamed: 1_level_1
p100001,"AP CHEST, 7:27 A.M., ___. HISTORY: ___ man w..."
p100002,"EXAMINATION: CTA CHEST WANDW/O CANDRECONS, NO..."
p100004,EXAMINATION: Left vertebral artery angiogram....
p100006,"INDICATION: ___ man with a history of COPD, p..."
p100007,"PORTABLE CHEST, ___ COMPARISON: Study of ear..."
...,...
p117129,"AP CHEST, 6:15 P.M. ___ HISTORY: Acute panc..."
p117130,EXAMINATION: Portable chest radiograph INDIC...
p117131,EXAMINATION: CHEST (PORTABLE AP) INDICATION:...
p117132,EXAMINATION: CHEST (PORTABLE AP) INDICATION:...


In [169]:
# Create CBOW model
model = Word2Vec(sentences=list(grouped_words), vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

In [172]:
from gensim.models import KeyedVectors
# Store just the words + their trained embeddings.
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")

# Load back with memory-mapping = read-only, shared across processes.
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')

vector = wv['text']  # Get numpy vector of a word

KeyError: "Key 'text' not present"

In [176]:
import gensim.downloader as api
path = api.load("word2vec-google-news-300", return_path=True)
print(path)

# model = KeyedVectors.load_word2vec_format (vector_path, binary=True)
# model.wv.similar_by_word('mutation')

ValueError: unable to read local cache '/Users/ryderkemper/gensim-data/information.json' during fallback, connect to the Internet and retry