In [None]:
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append('../src')
import cb_utils

sns.set(style="darkgrid")
pd.options.display.max_columns = 500

%load_ext autoreload
%autoreload 2

In [None]:
# Fetch the three member-month views the rest of the notebook works from.
# `cache` is forwarded unchanged as `use_cache` to every get_table call.
# NOTE(review): presumably use_cache=True reads a local copy instead of
# re-querying the source — confirm in cb_utils.get_table.
cache = True
auths, claims, visits = (
    cb_utils.get_table(view_name, use_cache=cache)
    for view_name in ('vw_ds_auth_mm', 'vw_ds_claims_mm', 'vw_ds_visit_features_mm')
)

In [None]:
# Combine the claims, auths and visit-feature tables into the single frame `df`
# that every later cell reads from.
# NOTE(review): join keys and join type are decided inside
# cb_utils.merge_member_month_dfs — presumably member_id + month; confirm.
df = cb_utils.merge_member_month_dfs(claims, auths, visits)

In [None]:
# Preview the first five rows of the merged frame.
df.head(n=5)

In [None]:
# Preview the first five rows of the raw claims table (source of the '_dx' columns).
claims.head(n=5)

In [None]:
# 2-component PCA over the '_dx' columns, one point per row of `df`, coloured by `grp`.
from sklearn import decomposition

# Feature columns: every claims column whose name contains '_dx'.
# NOTE(review): presumably these are diagnosis features — confirm against the view.
dxs = [col for col in claims.columns if '_dx' in col]
x = df[dxs].fillna(0)                   # missing feature values are treated as 0
y = df['grp'].astype(int).to_numpy()    # integer group label per row

# fit() returns the estimator itself, so this is the same fit-then-transform sequence.
pca = decomposition.PCA(n_components=2).fit(x)
X = pca.transform(x)

# Projected coordinates ('a', 'b') plus the group label used for colouring.
pca_df = pd.DataFrame(X, columns=['a', 'b']).assign(grp=y)
sns.relplot(data=pca_df, x='a', y='b', hue='grp', height=10)

In [None]:
# Share of total variance captured by each component of whichever `pca` was
# fitted last (the 2-component fit above when the notebook is run top to bottom).
pca.explained_variance_ratio_

In [None]:
# 3-component PCA over the same '_dx' columns, shown as an interactive 3-D scatter.
import plotly.express as px

# Seed NumPy's global RNG before fitting.
np.random.seed(1)

x = df[dxs].fillna(0)
y = df['grp'].astype(int).to_numpy()

# fit() returns the estimator, so chaining keeps the original fit-then-transform order.
pca = decomposition.PCA(n_components=3).fit(x)
X = pca.transform(x)

pca_df = pd.DataFrame(X, columns=['a', 'b', 'c']).assign(grp=y)

# NOTE(review): opacity=0.1 is presumably there so overlapping points stay readable.
fig = px.scatter_3d(
    pca_df,
    x='a',
    y='b',
    z='c',
    color='grp',
    opacity=0.1,
    height=1000,
)
fig.show()

In [None]:
# Share of total variance captured by each component of whichever `pca` was
# fitted last (the 3-component fit above when the notebook is run top to bottom).
pca.explained_variance_ratio_

In [None]:
# PCA on per-member averages: one point per member_id, coloured and sized by the
# member's mean attd_pcs_visit_hrs.
from sklearn import decomposition

# Zero-fill first, then average every column within each member.
grped = (
    df[dxs + ['member_id', 'attd_pcs_visit_hrs']]
    .fillna(0)
    .groupby('member_id', as_index=False)
    .mean()
)
x = grped[dxs]

pca = decomposition.PCA(n_components=2).fit(x)
X = pca.transform(x)

# as_index=False leaves grped with a 0..n-1 index, matching the new frame's index.
pca_df = pd.DataFrame(X, columns=['a', 'b']).assign(
    attd_pcs_visit_hrs=grped['attd_pcs_visit_hrs']
)
sns.relplot(
    data=pca_df,
    x='a',
    y='b',
    hue='attd_pcs_visit_hrs',
    size='attd_pcs_visit_hrs',
    height=10,
    alpha=0.7,
)

In [None]:
# PCA on the raw member-month rows (no per-member aggregation), coloured and sized
# by each row's attd_pcs_visit_hrs.
from sklearn import decomposition

# NOTE: despite the name, `grped` is not grouped in this cell; it is the zero-filled
# row-level subset of `df` and therefore keeps df's index.
grped = df[dxs + ['member_id', 'attd_pcs_visit_hrs']].fillna(0)
x = grped[dxs]

pca = decomposition.PCA(n_components=2)

pca.fit(x)
X = pca.transform(x)

pca_df = pd.DataFrame(X, columns=['a', 'b'])
# pca_df has a fresh 0..n-1 index while grped carries df's index. Assigning the
# Series directly aligns on index labels, which misplaces values or yields NaN
# whenever df's index is not exactly 0..n-1. Assign positionally so each row
# stays paired with its own projection.
pca_df['attd_pcs_visit_hrs'] = grped['attd_pcs_visit_hrs'].to_numpy()
sns.relplot(x='a', y='b', hue='attd_pcs_visit_hrs', size='attd_pcs_visit_hrs', data=pca_df, height=10, alpha=0.7)

In [None]:
# Test LDA with a visit-hours bucket as the target.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Per-member means of the zero-filled '_dx' columns and visit hours.
grped = (
    df[dxs + ['member_id', 'attd_pcs_visit_hrs']]
    .fillna(0)
    .groupby('member_id', as_index=False)
    .mean()
)
x = grped[dxs]
# Target: mean visit hours cut into 5 equal-width bins labelled 'a'..'e'.
y = pd.cut(grped['attd_pcs_visit_hrs'], bins=5, labels=['a', 'b', 'c', 'd', 'e'])

# fit() returns the estimator, so chaining keeps the fit-then-transform order.
lda = LinearDiscriminantAnalysis(n_components=2).fit(x, y)
X = lda.transform(x)

# The plot uses the continuous hours value, not the bucket, for colour and size.
lda_df = pd.DataFrame(X, columns=['a', 'b']).assign(
    attd_pcs_visit_hrs=grped['attd_pcs_visit_hrs']
)
sns.relplot(
    data=lda_df,
    x='a',
    y='b',
    hue='attd_pcs_visit_hrs',
    size='attd_pcs_visit_hrs',
    height=10,
    alpha=0.7,
)

In [None]:
# TODO: unfinished cell. The statement here was `x =` with no right-hand side,
# which is a SyntaxError and stops Restart & Run All at this point. None of the
# cells that follow read an `x` defined here, so it is kept only as a note until
# the intended expression is filled in.

In [None]:
# Loadings of the second principal component (row 1 of components_) paired with
# their column names, sorted ascending by loading.
# `pca` is whichever estimator was fitted last in the kernel.
p = pca.components_[1]
coef = sorted(zip(p, dxs))
coef

In [None]:
# Cluster members on their per-member mean '_dx' values plus mean attended and
# missed visit hours.
from sklearn.cluster import MiniBatchKMeans

# Average within each member first, then zero-fill what is still missing.
# (Other cells zero-fill before averaging; the original order is kept here.)
grped = (
    df[dxs + ['member_id', 'attd_pcs_visit_hrs', 'attd_pcs_missed_hrs']]
    .groupby('member_id', as_index=False)
    .mean()
    .fillna(0)
)
# member_id is an identifier, not a feature. Leaving it in the matrix lets the
# id value take part in the distance computation, so it is dropped before
# fitting. `grped` keeps the column so labels can be joined back to members.
# grped is already zero-filled, so no second fillna is needed.
X = grped.drop(columns='member_id')
# NOTE(review): the hour columns and the '_dx' columns may be on different
# scales; consider standardising before clustering — confirm with the data.

kmeans = MiniBatchKMeans(n_clusters=7, random_state=0, batch_size=256)

# One cluster label per row of grped (i.e. per member), in the same order.
preds = kmeans.fit_predict(X)
preds

In [None]:
# Attach each member's cluster label, then plot missed vs. attended hours by cluster.
kmeans_df = grped.copy()
kmeans_df['cluster'] = preds
sns.relplot(
    data=kmeans_df,
    x='attd_pcs_missed_hrs',
    y='attd_pcs_visit_hrs',
    hue='cluster',
    height=10,
)

In [None]:
# Summary statistics of the assigned cluster labels.
# NOTE(review): the labels are arbitrary ids, so mean/std carry little meaning;
# value_counts() may be the more useful view — confirm intent.
kmeans_df['cluster'].describe()

In [None]:
# One facet per '_dx' column: the member-level value (x) against the cluster label (y).
# Averaging by member_id also averages `cluster`.
mem_avg = (
    kmeans_df[dxs + ['member_id', 'cluster']]
    .fillna(0)
    .groupby('member_id', as_index=False)
    .mean()
)
# Long format: one row per (member, dx column) pair.
melted = pd.melt(
    mem_avg,
    id_vars=['member_id', 'cluster'],
    var_name='dx',
    value_name='dx_val',
)
g = sns.relplot(data=melted, x='dx_val', y='cluster', col='dx', col_wrap=4, hue='dx')