# Import packages

In [None]:
import pandas as pd
import numpy as np
import sklearn
import re
import nltk
import matplotlib.pyplot as plt

from cms import Preprocess
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')

# Read file

In [None]:
# Directory (with trailing slash, because it is concatenated directly with the
# file name when loading) and name of the Excel workbook holding the reviews
path = './'
file = 'Hospital Review Data for NLP processing.xlsx'

In [None]:
# Load the review workbook into a DataFrame
df = pd.read_excel(io=path + file)

In [None]:
# Inspect the table as loaded from the workbook
df

# Preprocess data

In [None]:
def _clean_review(text):
    """Run one raw review comment through Preprocess and return its string form."""
    return Preprocess(text).process_as_string()


# Stopwords, lemmatization, and tokenizing are delegated to Preprocess;
# the processed text is stored next to the raw comment
df['text'] = df['Review Comment'].apply(_clean_review)

In [None]:
# Inspect the table again to check the processed 'text' column
df

# Create tfidf matrix

In [None]:
# Fit the vectorizer on the processed reviews and build the document-term
# matrix of TF-IDF weights in a single pass
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['text'])

In [None]:
# Dense view of the TF-IDF matrix: one row per review, one column per term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

In [None]:
# Dimensions of the TF-IDF matrix as (rows, columns)
tfidf_matrix.shape

In [None]:
# Vocabulary learned by the vectorizer, in column order of the TF-IDF matrix.
# get_feature_names() no longer exists in scikit-learn >= 1.2; the replacement
# returns an ndarray, so convert it back to a plain list as before.
tfidf.get_feature_names_out().tolist()

# Build Kmeans clustering

In [None]:
# Number of review groups to look for
n_clusters = 3
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4, which would otherwise change the cluster assignments
# (and the meaning of each cluster id) depending on the installed version.
km = KMeans(n_clusters=n_clusters, n_init=10, random_state=1)
km.fit(tfidf_matrix)
# One cluster id per review, in row order, as a plain Python list
clusters = km.labels_.tolist()

In [None]:
# Store each review's cluster id alongside its row
df['label'] = clusters

In [None]:
# Uncomment to see the entire dataframe without truncation
# (None means "no limit"; the -1 sentinel for max_colwidth is deprecated since pandas 1.0)
#pd.set_option("display.max_colwidth", None)
#pd.set_option("display.max_rows", None)
df

# Apply PCA for data visualization

In [None]:
# Calculate distance
# Pairwise cosine distance between reviews, taken as 1 - cosine similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:
# Reduce the distance matrix to two principal components for plotting
pca = PCA(n_components=2)
pos = pca.fit_transform(dist)
# Transposing puts one component per row, so the two coordinates unpack directly
xs, ys = pos.T

In [None]:
# Plot table: 2-D coordinates plus the cluster id of every review
xys = pd.DataFrame({'x': xs, 'y': ys, 'label': clusters})

In [None]:
# Inspect the plot coordinates and their cluster ids
xys

In [None]:
# Split the plot table by cluster id so each cluster can be drawn as its own series
groups = xys.groupby(by='label')

In [None]:
# Legend text per cluster id. These labels are hand-assigned for the
# 3-cluster fit above; NOTE(review): re-check them whenever the clustering
# parameters or the input data change.
names = {0: 'Good/Neutral', 1: 'Negative', 2: 'Good/Neutral'}
fig, ax = plt.subplots(figsize=(17, 9))
ax.margins(0.05)

# One marker-only series per cluster
for name, group in groups:
    # Fall back to a generic label so a different n_clusters does not raise KeyError
    legend_label = names.get(name, 'Cluster {}'.format(name))
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=legend_label)

# Annotate every point with its row index so it can be traced back to the review
for x, y, row_index in zip(xys.x, xys.y, xys.index):
    ax.text(x, y, row_index)

ax.legend(numpoints=1)

In [None]:
# Keep only the 'AT_PHYSN_NPI' identifier column and the assigned cluster id for export
output_columns = ['AT_PHYSN_NPI', 'label']
output = df[output_columns]

# Write out as csv

In [None]:
#output.to_csv('./output', index=False)