## Setup

In [10]:
import pandas as pd
import numpy as np

import scipy.sparse

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD

%matplotlib inline
from matplotlib import pyplot as plt

import jellyfish

from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool, CategoricalColorMapper
from bokeh.palettes import Category20
from bokeh.io import output_notebook
output_notebook()

## Training Dataset

### Load training data

In [2]:
# Read the caret-delimited training file. It has no header row, so the two
# column names are supplied here.
df_full = pd.read_csv(
    '../data/real_data/copd_train.csv',
    sep='^',
    header=None,
    names=['Text', 'COPD'],
)

# Keep a lowercased copy of the free text next to the original column.
df_full['Text2'] = df_full['Text'].str.lower()

# Remove rows that are identical across every column, then renumber the rows.
# reset_index() (without drop=True) keeps the old row labels in an 'index' column.
# NOTE(review): de-duplication compares the original-case 'Text' too, so rows that
# differ only by letter case are both kept -- confirm this is intended.
df__no_dupes = (
    df_full
    .drop_duplicates()
    .copy()
    .reset_index()
)

### TFIDF

In [3]:
# Build a TF-IDF vectorizer over word n-grams of length one through four.
tfidf = TfidfVectorizer(ngram_range=(1, 4))

# Learn the vocabulary from the lowercased text and build the document-term matrix.
mat_0 = tfidf.fit_transform(df__no_dupes['Text2'])

# Invert the fitted vocabulary (term -> column index) so that a column index
# can be mapped back to its term.
voc_reverse = dict((col, term) for term, col in tfidf.vocabulary_.items())

### Distance features

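Engineer four features that capture string similarity (Jaro-Winkler and normalized Damerau-Levenshtein) between each record and the two proxy terms, "copd" and "chronic obstructive pulmonary disease".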

In [4]:
# Proxy terms each record is compared against, paired with the column-name
# suffix they produce.
_PROXY_TERMS = (
    ('copd', 'copd'),
    ('copd_full', 'chronic obstructive pulmonary disease'),
)

# Newer jellyfish releases expose Jaro-Winkler as `jaro_winkler_similarity`;
# older ones only provide `jaro_winkler`. Use whichever this install has.
_jaro_winkler = getattr(jellyfish, 'jaro_winkler_similarity', None) or jellyfish.jaro_winkler


def _dl_similarity(term, text):
    """Return a Damerau-Levenshtein similarity between ``term`` and ``text``.

    The edit distance is divided by the length of the longer string and
    subtracted from 1, so identical strings score 1.
    """
    distance = jellyfish.damerau_levenshtein_distance(term, text)
    return 1 - (distance / max(len(term), len(text)))


def _similarity_to(term, similarity):
    """Return a one-argument scorer that compares a text against ``term``."""
    def score(text):
        return similarity(term, text)
    return score


# Creates, in this order: jaro_copd, jaro_copd_full, dl_copd, dl_copd_full.
for _prefix, _similarity in (('jaro', _jaro_winkler), ('dl', _dl_similarity)):
    for _suffix, _term in _PROXY_TERMS:
        df__no_dupes[_prefix + '_' + _suffix] = df__no_dupes.Text2.map(
            _similarity_to(_term, _similarity)
        )

Combine the new distance features with the TF-IDF matrix created earlier.

In [5]:
# Engineered similarity columns, in the order they are appended to the matrix.
distance_cols = [
    'jaro_copd',
    'jaro_copd_full',
    'dl_copd',
    'dl_copd_full',
]

# Stack the dense similarity columns to the right of the sparse TF-IDF matrix.
mat = scipy.sparse.hstack(
    [
        mat_0,
        df__no_dupes[distance_cols].values,
    ]
)

### t-SNE Embedding

#### Apply dimensionality reduction to make t-SNE computationally tractable

In [8]:
# Reduce the combined feature matrix to 10 components before running t-SNE.
# TruncatedSVD's default solver is randomized, so pin the seed; otherwise the
# t-SNE input (and therefore the plot) changes from run to run even though
# t-SNE itself is seeded. 23 matches the seed used for t-SNE.
svd = TruncatedSVD(n_components=10, random_state=23)
mat_svd = svd.fit_transform(mat)

#### Create t-SNE embedding

In [9]:
# Two-dimensional t-SNE over the SVD-reduced features, with PCA initialisation
# and a fixed seed.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version before upgrading.
tsne_model = TSNE(
    n_components=2,
    perplexity=40,
    init='pca',
    n_iter=500,
    random_state=23,
)

# One (x, y) row per record in the SVD-reduced matrix.
new_values = tsne_model.fit_transform(mat_svd)

#### Plot embedding results

In [11]:
# Split the two-column embedding into coordinate lists. `.tolist()` yields
# plain Python floats rather than NumPy scalars, which Bokeh can serialize
# directly.
x = new_values[:, 0].tolist()
y = new_values[:, 1].tolist()

# Map the COPD label (as a string) to a colour: '0' -> black, '1' -> red.
c_map = CategoricalColorMapper(factors=['0', '1'], palette=['black', 'red'])

# Bokeh data source: one row per record, carrying its coordinates, its
# lowercased text and its label.
source = ColumnDataSource(
        data=dict(
            x=x,
            y=y,
            desc=df__no_dupes.Text2,
            # Labels are stringified so they match the colour mapper's factors.
            cluster=[str(label) for label in df__no_dupes.COPD]
        )
    )

# Hover tooltip showing the row index, the label and the text.
hover = HoverTool(
        tooltips=[
            ("index", "$index"),
            ("Cluster", "@cluster"),
            ("Text", "@desc"),
        ]
    )

# Create the figure with its toolbar, then attach the hover tool.
p = figure(plot_width=700, plot_height=700, tools="pan,wheel_zoom,box_zoom,reset,save",
           title="Mouse over the dots")
p.add_tools(hover)

# Scatter the embedding, coloured by label.
p.scatter('x', 'y', size=2, color=dict(field='cluster', transform=c_map), source=source)

# Render the plot.
show(p)

  elif np.issubdtype(type(obj), np.float):
