# Census classification demo

In [1]:
! pip install tqdm

Collecting tqdm
  Downloading tqdm-4.61.2-py2.py3-none-any.whl (76 kB)
     |████████████████████████████████| 76 kB 863 kB/s eta 0:00:01
Installing collected packages: tqdm
Successfully installed tqdm-4.61.2


In [1]:
import pandas as pd
import numpy as np

## 1. Read data sample

In [2]:
# Read only the first 300 rows of the census CSV, convert the '?' placeholders
# to NaN, and keep just the rows that have no missing values.
df = (
    pd.read_csv('../data/adult.csv', sep=',', nrows=300)
    .replace({'?': np.nan})
    .dropna()
)

In [3]:
# Display the loaded sample; rows that contained '?' placeholders were already dropped when `df` was built.
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,25,Private,236267,HS-grad,9,Never-married,Machine-op-inspct,Unmarried,White,Male,0,1590,40,United-States,<=50K
296,30,Private,236543,9th,5,Married-civ-spouse,Other-service,Husband,White,Male,0,0,32,El-Salvador,>50K
297,23,Private,318483,HS-grad,9,Never-married,Sales,Own-child,White,Male,0,0,40,United-States,<=50K
298,34,Self-emp-not-inc,163756,Assoc-voc,11,Never-married,Farming-fishing,Not-in-family,White,Male,27828,0,60,United-States,>50K


## Create anomalous data

In [4]:
# Start from an independent (deep) copy so edits to `outliers` never touch `df`.
outliers = df.copy(deep=True)

In [5]:
# Overwrite selected features of every row with constant values to build the
# anomalous data set.
outliers['educational-num'] = 16
# Derive the shifted age from the untouched `df` rather than from `outliers`
# itself, so re-running this cell does not add another 50 years each time.
# `outliers` is a copy of `df`, so the two share the same index and align row by row.
outliers['age'] = df['age'] + 50
outliers['native-country'] = 'Germany'
outliers['education'] = 'Doctorate'
outliers['race'] = 'Other'
outliers['hours-per-week'] = 60
outliers['capital-loss'] = 1590
outliers['capital-gain'] = 27828

## 2. Perform inference

We assume that you have already deployed the model as an application.

In [6]:
import grpc
from tqdm import tqdm
from hydrosdk import Cluster, Application

In [7]:
# SSL channel credentials for the gRPC connection (TLS certificates installed).
channel_credentials = grpc.ssl_channel_credentials()

# Cluster connection handle; both "<addr>" values are placeholders to be
# replaced with the real HTTP and gRPC endpoints.
cluster = Cluster(
    http_address="<addr>",
    grpc_address="<addr>",
    grpc_credentials=channel_credentials,
)

In [8]:
# Look up the deployed application by name; "<census_app_name>" is a placeholder to fill in.
app = Application.find(cluster, "<census_app_name>")
# NOTE(review): presumably blocks until the application has finished starting — confirm against the hydrosdk docs.
app.lock_while_starting()
# Client object whose predict() method is called once per record in the inference cells.
predictor = app.predictor()

### Infer on normal data

In [9]:
# Send up to 200 randomly chosen rows of the clean sample to the deployed
# model, one request per row, and collect the predicted 'income' values.
# - Capping the sample size at len(df) avoids the ValueError that sample()
#   raises when fewer than 200 rows survived dropna().
# - The fixed random_state makes the selected rows repeatable across runs.
results = []
for x in tqdm(df.sample(n=min(200, len(df)), random_state=42).to_dict('records')):
    # Each record is a dict mapping column name -> value for one row.
    result = predictor.predict(x)
    results.append(result['income'])

100%|███████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:36<00:00,  5.53it/s]


### Infer on anomalous data

In [10]:
# Send up to 200 randomly chosen rows of the anomalous data to the deployed
# model, one request per row, and collect the predicted 'income' values.
# - Capping the sample size at len(outliers) avoids the ValueError that
#   sample() raises when fewer than 200 rows are available.
# - The fixed random_state makes the selected rows repeatable across runs.
# NOTE: `results` is rebound here, so any values collected in it earlier are replaced.
results = []
for x in tqdm(outliers.sample(n=min(200, len(outliers)), random_state=42).to_dict('records')):
    # Each record is a dict mapping column name -> value for one row.
    result = predictor.predict(x)
    results.append(result['income'])

100%|███████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:36<00:00,  5.55it/s]
