# K-Means

In [None]:
import cudf
import cuml

import cuxfilter as cxf

In [None]:
# Configure a cuML k-means estimator that partitions the data into 5 clusters.
km = cuml.KMeans(n_clusters=5)

In [None]:
# Fit k-means on gdf.
# NOTE(review): every column currently in gdf is used as a feature — confirm
# gdf holds only the columns that should drive the clustering at this point.
km.fit(gdf)
# Store each row's assigned cluster label as a new 'cluster' column.
gdf['cluster'] = km.labels_
# Last expression in the cell, so the notebook displays the fitted centroids.
km.cluster_centers_

# DBSCAN

In [None]:
# Configure a cuML DBSCAN estimator with a neighbourhood radius (eps) of 5000.
# NOTE(review): eps is expressed in the units of the input coordinates —
# presumably metres for northing/easting; confirm against the dataset.
dbscan = cuml.DBSCAN(eps=5000)

In [None]:
# Keep only the rows flagged as infected. reset_index() is called without
# drop=True, so the previous index is retained as a column of infected_df.
infected_df = gdf[gdf['infected'] == 1].reset_index()
# Cluster the infected rows by location and store the label assigned to each row.
infected_df['cluster'] = dbscan.fit_predict(infected_df[['northing', 'easting']])
# Last expression in the cell: number of distinct labels assigned.
# NOTE(review): if DBSCAN marked any points as noise, the noise label is
# counted here as one of the distinct values.
infected_df['cluster'].nunique()

# Logistic Regression

In [None]:
# Create a cuML logistic regression estimator with its default settings.
logreg = cuml.LogisticRegression()

In [None]:
# Fit on the full dataset: 'age' and 'sex' are the features, 'infected' is the target.
logreg.fit(gdf[['age', 'sex']], gdf['infected'])

In [None]:
# Pull the fitted parameters off the estimator.
logreg_coef = logreg.coef_
logreg_int = logreg.intercept_

# NOTE(review): indexing coef_ with [0] and [1] assumes it is indexed per
# feature in fit order (age, sex) — confirm for the installed cuML version.
print("Coefficients: [age, sex]")
print([logreg_coef[0], logreg_coef[1]])

print("Intercept:")
print(logreg_int[0])

In [None]:
# Predict class probabilities for the same rows the model was fitted on.
class_probs = logreg.predict_proba(gdf[['age', 'sex']])
# Last expression in the cell, so the notebook displays the probabilities.
class_probs

In [None]:
# Split the data so that 90% of the rows are used for training and the rest
# are held out for evaluation.
X_train, X_test, y_train, y_test = cuml.train_test_split(
    gdf[['age', 'sex']],
    gdf['infected'],
    train_size=0.9,
)

# Rebind logreg to a fresh estimator and fit it on the training portion only.
logreg = cuml.LogisticRegression()
logreg.fit(X_train, y_train)

# Keep the entry labelled 1 from the predict_proba output.
# NOTE(review): presumably this is the probability of the positive
# (infected == 1) class — confirm the output layout for the cuML version in use.
y_test_pred = logreg.predict_proba(X_test, convert_dtype=True)[1]
# Re-use the test rows' index so the predictions line up with X_test.
y_test_pred.index = X_test.index

# KNN

In [None]:
# Configure a cuML nearest-neighbours index that returns 3 neighbours by default.
knn = cuml.NearestNeighbors(n_neighbors=3)

In [None]:
# Select the road node coordinates in (east, north) column order and index them.
# Queries against knn must supply their columns in this same order.
road_locs = road_nodes[['east', 'north']]
knn.fit(road_locs)

In [None]:
# Look up the 3 nearest road nodes for every hospital. The query columns are
# given as (easting, northing) because their order has to match the order knn
# was fitted with (east, north).
distances, indices = knn.kneighbors(
    hospitals[['easting', 'northing']],
    n_neighbors=3,
)

# XGBoost

In [None]:
import cudf
import cuml
import cupy as cp

from cuml.preprocessing.model_selection import train_test_split

# visualization
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import graphviz

# model analysis
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

import xgboost as xgb

# xgboost 1.0 or later is required to convert cudf DataFrames directly into the
# xgboost DMatrix format, so print the installed version for reference.
print('XGBoost version: ', xgb.__version__)

In [None]:
# Split features (age, sex, northing, easting) and the 'infected' target into
# train and test portions using train_test_split's default proportions.
x_train, x_test, y_train, y_test = train_test_split(
    gdf[['age', 'sex', 'northing', 'easting']],
    gdf['infected'],
)

# Remove the notebook's reference to the full frame; only the split pieces are
# used from here on.
del gdf

In [None]:
# Training parameters handed to xgb.train.
params = {
    'max_depth':    8,                  # maximum depth of each tree
    'max_leaves':   2**8,               # maximum number of leaves per tree (256)
    'tree_method':  'gpu_hist',         # GPU histogram-based tree construction
    'objective':    'binary:logistic',  # binary classification objective
    'grow_policy':  'lossguide',        # controls how new nodes are added to a tree
    'eval_metric':  'logloss',          # metric reported during evaluation
    'subsample':    0.8,                # row-sampling ratio; a float like the other numeric values
}

In [None]:
# Wrap the training features and their labels in an xgboost DMatrix.
dtrain = xgb.DMatrix(x_train, y_train)

In [None]:
# Train for 100 boosting rounds; the %time magic reports how long training takes.
%time model = xgb.train(params, dtrain, num_boost_round=100)

In [None]:
# Feature-importance bar chart for the trained model, with the grid switched
# off and a custom title.
ax = xgb.plot_importance(model, height=0.8)
ax.grid(False)
ax.set_title('F score by feature')
plt.show()

# Render the first tree of the model (index 0) with a left-to-right layout.
xgb.plot_tree(model, num_trees=0, rankdir='LR')

# Fetch the current figure after plotting the tree so it can be enlarged.
fig = plt.gcf()
fig.set_size_inches(100, 100)

In [None]:
# Wrap the held-out features in a DMatrix; no labels are needed for prediction.
dtest = xgb.DMatrix(x_test)
# Score the test rows; the %time magic reports how long prediction takes.
%time y_pred = model.predict(dtest)
# roc_curve comes from scikit-learn, so convert the test labels to a host
# (NumPy) array via CuPy before passing them in.
y_test_cpu = cp.asnumpy(cp.array(y_test))
# Compute the ROC curve points from the true labels and the predicted scores.
false_pos_rate, true_pos_rate, thresholds = roc_curve(y_test_cpu, y_pred)