In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from category_encoders import OneHotEncoder
from sklearn.preprocessing import Imputer
import json

%matplotlib inline

# Example Observations

# 1. Clean Observations

In [2]:
# Load the training data; the printed shape and the preview below are a quick sanity check.
df = pd.read_csv('train.csv')
print(df.shape)
df.head()

(8164, 14)


Unnamed: 0,id,birth date,job type,school level,domestic status,profession,domestic relationship type,ethnicity,gender,earned dividends,interest earned,monthly work,country of origin,target
0,15463,1983-12-26,private,secondary,single,mechanic,never married,afro american,Female,0,0,160,u.s.,1
1,2202,1962-12-31,private,college graduate,spouse passed,secretarial,never married,white and privileged,Female,0,0,160,u.s.,1
2,29145,1979-12-27,private,college graduate,married 2,C-level,has husband,white and privileged,Female,0,0,200,u.s.,0
3,11330,1965-12-30,private,entry level college,married 2,sales,has husband,white and privileged,Female,0,0,208,canada,0
4,20822,1967-12-30,self-emp-inc,entry level college,divorce pending,C-level,never married,white and privileged,Female,0,0,300,u.s.,1


- 10 clean observations from the actual training set.    



- Dirty observations:
- Missing Values (`monthly work` with a missing value (NaN), and `gender` with missing value (NaN))
- Categories never seen before (e.g., `Male` inside `gender`, an unknown level such as `geniOus` inside `school level`)
- Missing column (job type)
- `school level` as integer
- `monthly work` as str (non-numerical)
- `monthly work` as a numerically parseable str
- birth date with different format (dd-mm-yyyy)
- empty observation
- empty payload


In [3]:
# Reproducibly pick 10 rows (fixed random_state) and drop the `id` and `target`
# columns in a single chained expression.
examples = df.sample(n=10, random_state=1).drop(['id', 'target'], axis=1)
examples

Unnamed: 0,birth date,job type,school level,domestic status,profession,domestic relationship type,ethnicity,gender,earned dividends,interest earned,monthly work,country of origin
3875,1994-12-23,private,entry level college,single,C-level,not living with family,white and privileged,Female,0,0,200,u.s.
4265,1958-01-01,private,secondary-7 through 8,d,other,not living with family,white and privileged,Female,0,0,160,u.s.
4402,1953-01-02,private,secondary,spouse passed,secretarial,not living with family,white and privileged,Female,0,0,80,u.s.
5161,1984-12-25,private,secondary,single,other,living with extende family,american indian,Female,0,0,144,u.s.
4261,1953-01-02,private,entry level college,spouse passed,secretarial,not living with family,white and privileged,Female,0,0,160,u.s.
39,1957-01-01,private,entry level college,single,estate employee,not living with family,white and privileged,Female,0,0,160,u.s.
3811,1997-12-22,private,entry level college,single,sales,not living with family,white and privileged,Female,0,0,120,u.s.
2975,1997-12-22,private,entry level college,single,other,living with child,white and privileged,Female,0,0,80,u.s.
4761,1950-01-03,unknown,entry level college,single,unknown,never married,afro american,Female,0,0,160,u.s.
473,1973-12-28,unknown,entry level college,d,unknown,never married,white and privileged,Female,0,0,40,poland


In [4]:
# Iterating the transposed frame column by column visits one original row at a
# time; each row becomes a plain dict wrapped as {'id': <position>, 'observation': <row>}.
# NOTE: `examples` is rebound from a DataFrame to a list here, so this cell
# cannot be re-run without re-running the sampling cell first.
examples = [
    {'id': position, 'observation': row.to_dict()}
    for position, (_label, row) in enumerate(examples.T.items())
]
examples

[{'id': 0,
  'observation': {'birth date': '1994-12-23',
   'country of origin': 'u.s.',
   'domestic relationship type': 'not living with family',
   'domestic status': 'single',
   'earned dividends': 0,
   'ethnicity': 'white and privileged',
   'gender': 'Female',
   'interest earned': 0,
   'job type': 'private',
   'monthly work': 200,
   'profession': 'C-level',
   'school level': 'entry level college'}},
 {'id': 1,
  'observation': {'birth date': '1958-01-01',
   'country of origin': 'u.s.',
   'domestic relationship type': 'not living with family',
   'domestic status': 'd',
   'earned dividends': 0,
   'ethnicity': 'white and privileged',
   'gender': 'Female',
   'interest earned': 0,
   'job type': 'private',
   'monthly work': 160,
   'profession': 'other',
   'school level': 'secondary-7 through 8'}},
 {'id': 2,
  'observation': {'birth date': '1953-01-02',
   'country of origin': 'u.s.',
   'domestic relationship type': 'not living with family',
   'domestic status': 'sp

In [5]:
#[json.dumps(k) for k in examples]

In [6]:
# Persist the clean payloads as a single JSON array.
with open('clean_observations.json', mode='w') as clean_file:
    json.dump(obj=examples, fp=clean_file)

In [7]:
# Echo the file back to check what was serialised (shells out to `cat`).
!cat clean_observations.json

[{"id": 0, "observation": {"monthly work": 200, "birth date": "1994-12-23", "domestic status": "single", "domestic relationship type": "not living with family", "profession": "C-level", "earned dividends": 0, "job type": "private", "country of origin": "u.s.", "school level": "entry level college", "gender": "Female", "ethnicity": "white and privileged", "interest earned": 0}}, {"id": 1, "observation": {"monthly work": 160, "birth date": "1958-01-01", "domestic status": "d", "domestic relationship type": "not living with family", "profession": "other", "earned dividends": 0, "job type": "private", "country of origin": "u.s.", "school level": "secondary-7 through 8", "gender": "Female", "ethnicity": "white and privileged", "interest earned": 0}}, {"id": 2, "observation": {"monthly work": 80, "birth date": "1953-01-02", "domestic status": "spouse passed", "domestic relationship type": "not living with family", "profession": "secretarial", "earned dividends": 0, "job type": "private", "co

# Dirty ones
- Dirty observations:
- Missing Values (`monthly work` with a missing value (NaN), and `gender` with missing value (NaN))
- Categories never seen before (e.g., `Male` inside `gender`, an unknown level such as `geniOus` inside `school level`)
- Missing column (job type)
- `school level` as integer
- `monthly work` as str (non-numerical)
- `monthly work` as a numerically parseable str
- birth date with different format (dd-mm-yyyy)
- empty observation
- empty payload


In [8]:
import copy

# Deep copy so that editing the nested 'observation' dicts below leaves the
# clean `examples` untouched; keep the first nine, one per dirty scenario.
dirty_examples = copy.deepcopy(examples)[:9]

In [9]:
# Shift the copied ids by 100 so dirty payloads can be told apart from clean ones.
# NOTE: this mutates in place - re-running the cell shifts the ids again.
for payload in dirty_examples:
    payload['id'] += 100

In [10]:
# 1. Missing values: blank out `monthly work` and `gender` in one step.
#    None is written to the JSON file as null.
dirty_examples[0]['observation'].update({'monthly work': None, 'gender': None})
dirty_examples[0]

{'id': 100,
 'observation': {'birth date': '1994-12-23',
  'country of origin': 'u.s.',
  'domestic relationship type': 'not living with family',
  'domestic status': 'single',
  'earned dividends': 0,
  'ethnicity': 'white and privileged',
  'gender': None,
  'interest earned': 0,
  'job type': 'private',
  'monthly work': None,
  'profession': 'C-level',
  'school level': 'entry level college'}}

In [11]:
# 2. Unseen categories: `gender` and `school level` get values meant to be
#    unknown to whatever consumes the payload.
dirty_examples[1]['observation'].update({'gender': 'Male', 'school level': 'geniOus'})
dirty_examples[1]

{'id': 101,
 'observation': {'birth date': '1958-01-01',
  'country of origin': 'u.s.',
  'domestic relationship type': 'not living with family',
  'domestic status': 'd',
  'earned dividends': 0,
  'ethnicity': 'white and privileged',
  'gender': 'Male',
  'interest earned': 0,
  'job type': 'private',
  'monthly work': 160,
  'profession': 'other',
  'school level': 'geniOus'}}

In [12]:
# 3. Missing column: remove the `job type` key entirely.
#    Raises KeyError if the key is already gone, e.g. when this cell is re-run.
del dirty_examples[2]['observation']['job type']
dirty_examples[2]['observation']

{'birth date': '1953-01-02',
 'country of origin': 'u.s.',
 'domestic relationship type': 'not living with family',
 'domestic status': 'spouse passed',
 'earned dividends': 0,
 'ethnicity': 'white and privileged',
 'gender': 'Female',
 'interest earned': 0,
 'monthly work': 80,
 'profession': 'secretarial',
 'school level': 'secondary'}

In [13]:
# 4. Wrong type: `school level` sent as an integer instead of a string label.
dirty_examples[3]['observation']['school level'] = 111
dirty_examples[3]['observation']

{'birth date': '1984-12-25',
 'country of origin': 'u.s.',
 'domestic relationship type': 'living with extende family',
 'domestic status': 'single',
 'earned dividends': 0,
 'ethnicity': 'american indian',
 'gender': 'Female',
 'interest earned': 0,
 'job type': 'private',
 'monthly work': 144,
 'profession': 'other',
 'school level': 111}

In [14]:
# 5. Wrong type: `monthly work` as a string that cannot be parsed as a number.
dirty_examples[4]['observation']['monthly work'] = 'r32g'
dirty_examples[4]['observation']

{'birth date': '1953-01-02',
 'country of origin': 'u.s.',
 'domestic relationship type': 'not living with family',
 'domestic status': 'spouse passed',
 'earned dividends': 0,
 'ethnicity': 'white and privileged',
 'gender': 'Female',
 'interest earned': 0,
 'job type': 'private',
 'monthly work': 'r32g',
 'profession': 'secretarial',
 'school level': 'entry level college'}

In [15]:
# 6. `monthly work` as a string that CAN be parsed as a number ('40').
dirty_examples[5]['observation']['monthly work'] = '40'
dirty_examples[5]['observation']

{'birth date': '1957-01-01',
 'country of origin': 'u.s.',
 'domestic relationship type': 'not living with family',
 'domestic status': 'single',
 'earned dividends': 0,
 'ethnicity': 'white and privileged',
 'gender': 'Female',
 'interest earned': 0,
 'job type': 'private',
 'monthly work': '40',
 'profession': 'estate employee',
 'school level': 'entry level college'}

In [16]:
# 7. `birth date` in a different format (dd-mm-yyyy); the clean payloads use yyyy-mm-dd.
dirty_examples[6]['observation']['birth date'] = '01-01-1946'
dirty_examples[6]['observation']

{'birth date': '01-01-1946',
 'country of origin': 'u.s.',
 'domestic relationship type': 'not living with family',
 'domestic status': 'single',
 'earned dividends': 0,
 'ethnicity': 'white and privileged',
 'gender': 'Female',
 'interest earned': 0,
 'job type': 'private',
 'monthly work': 120,
 'profession': 'sales',
 'school level': 'entry level college'}

In [17]:
# 8. Empty observation: the payload keeps its id but carries no fields.
dirty_examples[7]['observation'] = {}
dirty_examples[7]['observation']

{}

In [18]:
# 9. Empty payload: the whole entry is replaced, so neither `id` nor `observation` is sent.
dirty_examples[8] = {}

In [19]:
# Persist the dirty payloads as a single JSON array.
with open('dirty_observations.json', mode='w') as dirty_file:
    json.dump(obj=dirty_examples, fp=dirty_file)

In [20]:
# Echo the file back to check what was serialised (shells out to `cat`).
!cat dirty_observations.json

[{"id": 100, "observation": {"monthly work": null, "country of origin": "u.s.", "domestic status": "single", "domestic relationship type": "not living with family", "profession": "C-level", "earned dividends": 0, "job type": "private", "birth date": "1994-12-23", "school level": "entry level college", "gender": null, "ethnicity": "white and privileged", "interest earned": 0}}, {"id": 101, "observation": {"monthly work": 160, "country of origin": "u.s.", "domestic status": "d", "domestic relationship type": "not living with family", "profession": "other", "earned dividends": 0, "job type": "private", "birth date": "1958-01-01", "school level": "geniOus", "gender": "Male", "ethnicity": "white and privileged", "interest earned": 0}}, {"id": 102, "observation": {"monthly work": 80, "country of origin": "u.s.", "domestic status": "spouse passed", "domestic relationship type": "not living with family", "profession": "secretarial", "earned dividends": 0, "birth date": "1953-01-02", "school le

In [24]:
import requests

list_app_names = []

def simulate(app_names, observations, timeout=60, post=None):
    """POST every observation to the ``/predict`` endpoint of every app.

    Parameters
    ----------
    app_names : iterable of str
        App names; each one is substituted into
        ``https://<app_name>.herokuapp.com/predict``.
    observations : iterable
        Payloads to send, one request per payload. Each payload is passed as
        the ``json=`` argument of the POST call. Any iterable is accepted,
        including a generator.
    timeout : float or None, optional
        Forwarded as the ``timeout=`` argument of every POST call so that an
        unresponsive app cannot block the notebook indefinitely. Pass ``None``
        to wait without limit (the previous behaviour).
    post : callable, optional
        Callable invoked as ``post(url, json=..., timeout=...)``. Defaults to
        ``requests.post``; supplying a stub allows the function to be
        exercised without network access.

    Returns
    -------
    dict
        Maps each app name to the list of objects returned by ``post``, in the
        same order as ``observations``.

    Notes
    -----
    Exceptions raised by ``post`` are not caught; the first failure aborts the
    whole simulation.
    """
    if post is None:
        # Resolved lazily so a caller-supplied stub never needs `requests`.
        post = requests.post

    # Materialise once: a generator would otherwise be exhausted after the
    # first app, leaving every later app with an empty response list.
    observations = list(observations)

    responses = {}
    for app_name in app_names:
        url = 'https://{}.herokuapp.com/predict'.format(app_name)
        responses[app_name] = [
            post(url, json=observation, timeout=timeout)
            for observation in observations
        ]
    return responses

In [None]:
# Placeholder: replace with the name(s) of the app(s) to test; each name becomes
# the subdomain of the herokuapp.com URL that `simulate` posts to.
app_names = ['<your app name>']

In [32]:
# Send the dirty payloads first, then the clean ones, to every app in `app_names`.
outputs = simulate(app_names, dirty_examples + examples)