In [1]:
import os
import collections
from collections import Counter

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.metrics import r2_score

import scipy.sparse as sp

## DECOMPOSITION
from sklearn.decomposition import NMF
from scipy.linalg import svd

## NLP
import gensim
import re
import nltk
lemmatizer = nltk.stem.WordNetLemmatizer()
from nltk.corpus import stopwords
stopw = set(stopwords.words('english'))
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
def tknz(txt):
    '''
    Return the set of distinct word tokens found in txt.
    The text is lower-cased and the digits 0-9 are deleted before it is split with the
    module-level RegexpTokenizer; tokens present in the English stopword set are discarded.
    '''
    digit_free = re.sub("0|1|2|3|4|5|6|7|8|9|0", "", txt.lower())
    distinct_tokens = set(tokenizer.tokenize(digit_free))
    return distinct_tokens.difference(stopw)



    
# # MATRIX-FACTORIZATION: DIMENSIONALITY REDUCTION & ARCHETYPING

# ## CLUSTER FEATURES INTO OCCUPATION CATEGORIES
# ## Use non-negative matrix factorization (NMF) for clustering
# ## Use singular value decomposition first state for determining overall similarity


class Archetypes:
    '''
    Archetypes: fits an NMF of order n on X and keeps the factors as attributes.

    Parameters:
        X - pandas.DataFrame (its index labels the rows of w, its columns label the columns of h)
        n - order / number of archetypes (NMF components)

    Attributes:
        my_archetypes.n         - order / number of archetypes
        my_archetypes.X         - input matrix

        my_archetypes.model     - fitted sklearn NMF model
        my_archetypes.w         - NMF w-matrix (result of fit_transform)
        my_archetypes.h         - NMF h-matrix (model.components_)

        my_archetypes.o         - DataFrame of the w-matrix, indexed like X (rows of X x archetypes)
        my_archetypes.f         - DataFrame of the h-matrix, columns like X (archetypes x columns of X)

    NOTE: the normalized variants (on / occ / fn) are currently not computed; they relied on
    helpers (norm, onet_socp_name) that are not defined in the visible part of this notebook.
    '''
    def __init__(self,X,n):
        # Fixed random_state keeps the random initialisation reproducible between runs.
        nmf = NMF(n_components=n, init='random', random_state=0, max_iter=1000, tol=1e-7)
        w_matrix = nmf.fit_transform(X)

        self.n = n
        self.X = X
        self.model = nmf
        self.w = w_matrix
        self.h = nmf.components_

        # Labelled views of the two factors.
        self.o = pd.DataFrame(w_matrix, index=X.index)
        self.f = pd.DataFrame(self.h, columns=X.columns)

class Svd:
    '''
    Singular value decomposition-as-an-object
        my_svd = Svd(X) runs scipy.linalg.svd on X (a pandas.DataFrame) and stores
        my_svd.u/.s/.vt - U, s and VT as returned by scipy.linalg.svd
        my_svd.f        - pandas.DataFrame of VT: rows = svd components, columns = X.columns
        my_svd.o        - pandas.DataFrame of U: rows = X.index, columns = svd components
        my_svd.volume(keep_volume)
                        - collections.namedtuple ('dotted dictionary') with the leading
                          components of s, f and o (dimensionality reduction)
    '''
    # Result type of volume(); the type name 'dotted_dic' and its fields are unchanged.
    _Cut = collections.namedtuple('dotted_dic', 's f o')

    def __init__(self,X):
        self.u,self.s,self.vt = svd(np.array(X))
        self.f = pd.DataFrame(self.vt,columns=X.columns)
        # Rows of U correspond to the rows of X, so X.index labels the index (not the columns).
        self.o = pd.DataFrame(self.u,index=X.index)

    def volume(self,keep_volume):
        '''
        Dimensionality reduction: keep the leading components whose share of the total is
        closest to keep_volume. The share is the cumulative sum of the singular values
        divided by their total (the singular values themselves, not their squares).

        Parameters:
            keep_volume - target proportion, e.g. 0.9
        Returns:
            collections.namedtuple ('dotted dictionary') with fields
            s - np.array: the kept singular values
            f - dataframe: the kept rows (components) of my_svd.f
            o - dataframe: the kept columns (components) of my_svd.o
        Examples of usage:
            my_svd.volume(0.9).s
            my_svd.volume(0.8).f
            my_svd.volume(0.5).o
        '''
        cumulative = self.s.cumsum()
        share = cumulative/cumulative[-1]
        # argmin is the position of the last component to keep, so the count is one more;
        # at least one component is therefore always returned.
        n_keep = int(np.argmin(np.square(share - keep_volume))) + 1
        return self._Cut(s=self.s[:n_keep],
                         f=self.f.iloc[:n_keep],
                         o=self.o.iloc[:, :n_keep])
        

In [69]:
import nltk
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

# def drop_stopwords(wordvec,language='English'):
#     wv = np.array(wordvec)
#     stw = np.array(stopwords.words(language))
#     without_stopwords = wv[[not word in stw for word in wv]]
#     return without_stopwords

# def lemmatize(wordvec):
#     return [lemmatizer.lemmatize(word,'v') for word in wordvec ]

# def char_clean(string):
#     string = string.lower()
#     string = re.sub('\d',' ', string )
#     string = re.sub('\W+',' ', string )
#     string = re.sub(' [a-y] ',' ', string )
#     return re.sub(' [a-y]{2} ',' ', string )

# def nlp_prep(string):
#     wordvec = char_clean(string).split()
#     return np.array(lemmatize(drop_stopwords(wordvec)))

def nlp_prep(string):
    '''Tokenize string with gensim.utils.simple_preprocess and return the resulting tokens.'''
    tokens = gensim.utils.simple_preprocess(string)
    return tokens

def multiply_counters(b,a):
    '''
    Key-wise product of two counters: for every key of b return a[key] * b[key].
    Only the keys of b appear in the result, which is a new Counter.
    '''
    product = Counter()
    for key, count in b.items():
        product[key] = a[key] * count
    return product

def rev_dict(dic):
    '''Return a new dictionary mapping each value of dic to its key (last key wins on duplicate values).'''
    return {value: key for key, value in dic.items()}

def indexate(seq,mirror=True):
    '''
    Build a lookup-table from a sequence: position (starting at 0) -> element.
    With mirror=True the table also maps element -> position, making it bi-directional;
    this requires that no element of the sequence coincides with one of the positions.
    '''
    lookup = {position: item for position, item in enumerate(seq)}
    if not mirror:
        return lookup
    # The reverse mapping is built completely before it is merged into the table.
    reverse = {item: position for position, item in lookup.items()}
    lookup.update(reverse)
    return lookup

def keymap(f,dic):
    '''Return a new dictionary with f applied to every key of dic; values are kept as they are.'''
    mapped = {}
    for key, value in dic.items():
        mapped[f(key)] = value
    return mapped

def valuemap(f,dic):
    '''Return a new dictionary with f applied to every value of dic; keys are kept as they are.'''
    mapped = {}
    for key, value in dic.items():
        mapped[key] = f(value)
    return mapped

def keyvaluemap(f,g,dic):
    '''Return a new dictionary with f applied to every key and g applied to every value of dic.'''
    mapped = {}
    for key, value in dic.items():
        new_key = f(key)
        mapped[new_key] = g(value)
    return mapped

def identity(item):
    '''Return item unchanged (a no-op mapping function for keymap / valuemap / keyvaluemap).'''
    return item

In [70]:
# Create the word-vectors for the dictations

# The names of dictation files in the 'Documents'-directory.
# Listed with os instead of `!ls Documents/*txt` so the cell does not depend on a shell:
# a directory without matches now gives an empty list instead of a captured error line.
# Hidden files are skipped, as the shell glob did; the order is plain sorted() order.
doc_dir = 'Documents'
txt_files = sorted(
    os.path.join(doc_dir, name)
    for name in os.listdir(doc_dir)
    if name.endswith('txt')
    and not name.startswith('.')
    and os.path.isfile(os.path.join(doc_dir, name))
)

# raw_txt: document key -> full text, txt: document key -> Counter of its tokens
raw_txt = {}
txt = {}
for file in txt_files:
    ix = os.path.basename(file).replace('.txt','')
    # Context manager closes each file right after reading (the handle used to be leaked).
    with open(file) as handle:
        raw_txt[ix] = handle.read()
    txt[ix] = Counter(nlp_prep(raw_txt[ix]))

# Corpus-wide token counts, accumulated in place rather than copying a Counter per document.
word_counts = Counter()
for counts in txt.values():
    word_counts.update(counts)

# Each word is weighted by the inverse of its corpus-wide count.
word_weight = Counter({k: 1/v for k, v in word_counts.items()})

# txt_w: document key -> Counter of (count in document) * (word weight)
txt_w = {}
for key in txt.keys():
    txt_w[key] = multiply_counters(txt[key],word_weight)

# Create index of keys for txt and txt_w
tix = indexate(txt.keys())

# Create dict with index of words in corpus
wix = indexate(word_counts.keys())

In [71]:
# Same weights as txt_w, re-keyed through the lookup-tables:
# document keys are looked up in tix, word keys in wix.
txw = {
    tix.get(doc_key): {wix.get(word): weight for word, weight in weights.items()}
    for doc_key, weights in txt_w.items()
}

In [77]:
# Weight table built from txt_w: one column per document key, one row per word;
# words that do not occur in a document get 0.
aaa = pd.DataFrame(txt_w).fillna(0)

In [79]:
# Product of aaa with its own transpose: entry (i, j) is the dot product of rows i and j of aaa.
bbb = aaa @ aaa.T

In [86]:
# Pairwise covariance between the columns of bbb (displays the whole table).
bbb.cov()

Unnamed: 0,aaa,aanesthesia,aasm,ab,abdomen,abdomens,abdominal,abg,abilify,ability,...,zinc,zithromax,zocor,zofran,zoloft,zosyn,zyloprim,zyprexa,zyrtec,zyvox
aaa,0.000558,-2.434796e-05,-0.000050,-9.589120e-06,1.159480e-06,-3.371506e-06,0.000017,5.603671e-05,-0.000016,-5.848444e-06,...,-2.062887e-05,-1.243030e-05,-0.000010,4.578278e-05,-7.655956e-06,-1.505456e-05,-2.081601e-05,-1.140103e-05,-2.088494e-06,-2.021926e-05
aanesthesia,-0.000024,2.671589e-03,-0.000081,-1.969913e-05,-2.118016e-05,-1.235617e-05,-0.000033,-4.020161e-05,-0.000032,-1.285274e-05,...,-3.858176e-05,-1.943993e-05,-0.000023,-2.739198e-05,-2.269867e-05,-3.504176e-05,-4.318661e-05,-2.530320e-05,-1.142199e-05,-4.752094e-05
aasm,-0.000050,-8.132050e-05,0.011615,-7.801195e-05,-4.986029e-05,-7.155588e-06,-0.000071,-9.278388e-05,-0.000085,-3.217475e-05,...,-7.759656e-05,-5.184587e-05,-0.000064,-6.290988e-05,-5.504842e-05,-8.921624e-05,-9.115555e-05,-5.705262e-05,-3.487863e-05,-1.095743e-04
ab,-0.000010,-1.969913e-05,-0.000078,2.778070e-03,1.132671e-06,-2.897572e-06,-0.000014,-3.855647e-06,-0.000027,-1.083902e-05,...,-1.160662e-05,9.894999e-07,-0.000019,1.109211e-04,-1.917111e-05,1.460214e-04,-3.101879e-05,-1.816972e-05,-7.801218e-06,-2.251570e-05
abdomen,0.000001,-2.118016e-05,-0.000050,1.132671e-06,4.725803e-06,-2.021547e-06,0.000010,5.196995e-06,0.000002,6.904186e-07,...,2.173844e-06,9.527245e-07,0.000002,1.386731e-06,1.323161e-06,3.970017e-06,1.417660e-06,8.599251e-06,8.145850e-07,2.004941e-06
abdomens,-0.000003,-1.235617e-05,-0.000007,-2.897572e-06,-2.021547e-06,6.675193e-04,0.000004,-3.149785e-06,-0.000002,-3.635460e-06,...,-7.049092e-06,-4.678002e-06,-0.000004,-4.173178e-06,-3.949064e-06,-4.431781e-06,-4.732986e-06,4.983785e-06,-3.479991e-06,-1.007051e-05
abdominal,0.000017,-3.274132e-05,-0.000071,-1.387797e-05,9.758267e-06,4.417156e-06,0.000095,4.069432e-05,-0.000016,-8.923395e-06,...,1.660380e-06,-1.317276e-05,0.000019,-1.333069e-06,-8.071695e-06,1.888991e-05,2.545654e-05,-1.178589e-05,-9.292050e-06,-3.041382e-05
abg,0.000056,-4.020161e-05,-0.000093,-3.855647e-06,5.196995e-06,-3.149785e-06,0.000041,4.767488e-04,0.000333,-1.365063e-05,...,-2.434693e-05,-8.481195e-06,-0.000013,-1.356666e-05,-1.752547e-05,2.891277e-06,-3.267227e-05,-1.798485e-05,-1.087552e-05,-1.881133e-05
abilify,-0.000016,-3.217093e-05,-0.000085,-2.736033e-05,2.265513e-06,-1.910898e-06,-0.000016,3.333548e-04,0.003503,-1.471185e-05,...,-3.649587e-06,-1.617366e-05,-0.000004,-1.790689e-05,-1.763313e-05,-2.610659e-05,-2.414128e-05,-1.882077e-05,-9.634386e-06,-3.576590e-05
ability,-0.000006,-1.285274e-05,-0.000032,-1.083902e-05,6.904186e-07,-3.635460e-06,-0.000009,-1.365063e-05,-0.000015,3.685877e-04,...,-1.218923e-05,-5.857801e-06,-0.000010,-5.675211e-06,-4.012115e-06,-1.256255e-05,-1.218259e-05,-8.522539e-06,-7.892666e-07,-1.339101e-05


In [85]:
# The 'abdominal' column of bbb without its entries for 'abdominal' itself and 'abdomen'.
bbb['abdominal'].drop(['abdominal','abdomen'])

aaa              0.012821
aanesthesia      0.000000
aasm             0.000000
ab               0.000000
abdomens         0.015385
abg              0.010769
abilify          0.000000
ability          0.003846
ablation         0.000000
able             0.007407
abnormal         0.001183
abnormalities    0.015385
abnormality      0.005594
about            0.007484
above            0.005128
abraded          0.000000
abruptly         0.000000
abscess          0.004142
absence          0.015385
absent           0.003846
absolute         0.000000
absolutely       0.000000
absorbable       0.000000
abuse            0.003297
abuses           0.000000
ac               0.000000
academy          0.000000
accelerated      0.000000
acceleration     0.030769
accepting        0.000000
                   ...   
xr               0.000000
xx               0.000000
xxx              0.004858
xxxx             0.000000
xxxxx            0.007101
year             0.006215
years            0.008858
yeast       

In [82]:
# Entries of the 'abdomens' column of bbb that are strictly positive.
bbb['abdomens'][bbb['abdomens']>0]

abdomens        1.000000
abdominal       0.015385
adjust          0.100000
admitted        0.018349
again           0.026316
albuterol       0.032258
alcohol         0.019608
allergies       0.012346
allowing        0.034483
amlodipine      0.133333
and             0.003438
angina          0.100000
anicteric       0.029412
antiplatelet    0.500000
arterial        0.076923
artery          0.033898
as              0.009231
aspirin         0.024691
assessment      0.004348
at              0.002174
ativan          0.062500
atraumatic      0.016949
atrovent        0.052632
barlow          0.090909
be              0.003676
bed             0.027778
being           0.021739
benign          0.166667
bicarb          0.038462
bilaterally     0.014925
                  ...   
systems         0.003690
temperature     0.007812
thank           0.007246
the             0.003802
then            0.012048
therapy         0.006116
three           0.012739
to              0.003557
today           0.007194


In [359]:
# NOTE(review): `adf` is not defined in any cell visible here, and `aaa` is rebound to a
# different kind of object than in the earlier cells - confirm the intended execution order.
# Fit a 2-component Archetypes (NMF) model on the transpose of adf.
aaa = Archetypes(adf.T,2)
# Row 1 of the h-matrix table (second component) across the columns of adf.T, largest first.
aaa.f.T[1].sort_values(ascending = False)

16     0.427379
32     0.423443
21     0.419628
12     0.414794
29     0.410791
11     0.409553
22     0.407678
18     0.407597
33     0.405502
10     0.404485
24     0.404115
38     0.401489
40     0.398286
26     0.398179
31     0.396803
39     0.394816
28     0.391772
13     0.389734
17     0.384372
14     0.382879
20     0.382632
19     0.380671
23     0.378795
30     0.378239
34     0.375931
25     0.374667
37     0.373321
9      0.372016
15     0.359568
27     0.352732
         ...   
213    0.004919
170    0.004853
47     0.003349
61     0.003239
167    0.002667
210    0.002308
45     0.002269
54     0.002173
6      0.001504
60     0.000000
196    0.000000
214    0.000000
67     0.000000
221    0.000000
229    0.000000
64     0.000000
63     0.000000
62     0.000000
236    0.000000
49     0.000000
48     0.000000
237    0.000000
244    0.000000
57     0.000000
246    0.000000
4      0.000000
53     0.000000
52     0.000000
51     0.000000
232    0.000000
Name: 1, Length: 249, dt

In [337]:
# Singular value decomposition of adf (rebinds `aaa`; `adf` is not defined in the visible cells).
aaa = Svd(adf)

In [354]:
# Element-wise sum of rows 0 and 7 of aaa.f (the VT table), one value per column of adf, largest first.
(aaa.f.T[0]+aaa.f.T[7]).sort_values(ascending = False)

surgical           0.082297
admitted           0.066616
age                0.065073
notable            0.054833
nausea             0.054485
home               0.054474
cough              0.050766
social             0.050390
constipation       0.050373
terazosin          0.049403
allergies          0.049378
metoprolol         0.048406
diarrhea           0.047965
gu                 0.047051
presentation       0.046702
dysuria            0.046331
shortness          0.045822
bladder            0.044988
bethanechol        0.044666
vomiting           0.043999
catheterization    0.043403
cell               0.043233
jaundice           0.042953
lisinopril         0.041977
hospital           0.041506
cancer             0.040800
twice              0.040380
diagnostic         0.040252
known              0.039954
city               0.039802
                     ...   
heart             -0.113306
cyanosis          -0.113546
post              -0.113573
respiratory       -0.113716
assessment        -0

In [312]:
# NOTE(review): adjmat, txt_index and word_dic are not defined in the cells visible here -
# presumably earlier versions of the document/word lookup-tables; confirm before re-running.
# Densify adjmat into a DataFrame, relabel the rows through txt_index and the columns through word_dic.
df = pd.DataFrame(adjmat.todense())
df.index = df.index.map(txt_index)
df.columns = list(map(word_dic.get,df.columns))
# Fit a 10-component Archetypes (NMF) model on the relabelled table.
aaa = Archetypes(df,10)

In [331]:
# Dense product adjmat.T @ adjmat. Despite the name `corr`, no centering or scaling is applied here.
corr = (adjmat.T @ adjmat).todense()
df = pd.DataFrame(corr)
#df.index = df.index.map(txt_index)
# Label both axes through word_dic (not defined in the visible cells; presumably position -> word).
df.index = df.index.map(word_dic)
df.columns = df.index

In [297]:
# Number of entries in the txt item selected by txt_index[0].
# NOTE(review): txt_index is not defined in the visible cells - presumably the document lookup; confirm.
len(txt[txt_index[0]])

234

In [91]:
from striprtf.striprtf import rtf_to_text
import sys

# Usage guard that only prints a message when sys.argv does not have exactly two entries.
# NOTE(review): looks carried over from a command-line script - in this notebook sys.argv holds
# the kernel launcher's arguments (see the recorded output), not an .rtf filename.
if len(sys.argv) != 2:
    print('Convert an rtf file to plain text')
    print('Usage : %s filename.rtf' % sys.argv[0])

Convert an rtf file to plain text
Usage : /anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py filename.rtf


In [92]:
sys.argv[1]

'-f'

In [93]:
!pwd


shell-init: error retrieving current directory: getcwd: cannot access parent directories: No such file or directory
pwd: error retrieving current directory: getcwd: cannot access parent directories: No such file or directory
pwd: error retrieving current directory: getcwd: cannot access parent directories: No such file or directory


In [94]:
pwd

UsageError: CWD no longer exists - please use %cd to change directory.


In [None]:
!ls