In [None]:
# example of using in-built analytics 'load_input' function.
# old code
import os

import numpy as np
import pandas as pd
from demyst.analytics import Analytics

# Credentials are read from the environment so that no secret is stored in the
# notebook; a KeyError here means DEMYST_USERNAME / DEMYST_PASSWORD are not set.
# NOTE(review): the password that was previously committed in this cell should
# be treated as compromised and rotated.
analytics = Analytics(
    username=os.environ["DEMYST_USERNAME"],
    password=os.environ["DEMYST_PASSWORD"],
)

input_f = analytics.load_input("input_file.csv")
analytics.search(input_f)

df = pd.read_csv('input_file.csv', encoding="latin", header=0)
# .copy() makes df_copy an independent frame, so the column assignment below
# cannot trigger pandas' SettingWithCopyWarning.
df_copy = df.dropna().copy()
print(df_copy.shape)
df_copy['country'] = 'USA'
# index=str also converts the row labels to strings (kept from the original code).
df_copy.rename(index=str, columns={"postcode": "post_code"}, inplace=True)
results4 = analytics.enrich_and_download(["attom_expanded_profile_report"], df_copy, validate=True)
print(analytics.sample_enrich(df_copy).head(3))
print(results4.shape)

In [None]:
# Query the Demyst client's attribute search for the name "city".
# NOTE(review): no output was captured for this cell — re-run to see the result.
analytics.attribute_search(name="city")


# Beginning of Exploratory Data Analysis

In [13]:
import os
import warnings

# Silence every warning for the rest of the session (set before the remaining
# imports so that import-time warnings are hidden as well).
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from demyst.analytics import Analytics
from demyst.common import connectors

# Credentials are read from the environment so that no secret is stored in the
# notebook; a KeyError here means DEMYST_USERNAME / DEMYST_PASSWORD are not set.
# NOTE(review): the password that was previously committed in this cell should
# be treated as compromised and rotated.
analytics = Analytics(
    username=os.environ["DEMYST_USERNAME"],
    password=os.environ["DEMYST_PASSWORD"],
)

# Load the raw input and work on a copy so `df` stays untouched for later comparison.
df = pd.read_csv('input_file.csv', encoding="latin", header=0)
df_copy = df.copy()
print(df_copy.shape)

# Ask the Demyst client which columns/values it accepts as enrichment inputs.
analytics.validate(df_copy)


(11229, 5)


Column,Status,Description
street,All Valid,All values in this column are good to go.
state,Some Invalid,16.8% of the values of this column failed validation. One example of an invalid value is 'NJ '. Click here for documentation for this column.
city,All Valid,All values in this column are good to go.
postcode,Unrecognized Column Name,This column name is not supported. Click here for a list of all supported column names.
safety_flag,Unrecognized Column Name,This column name is not supported. Click here for a list of all supported column names.


The two columns 'postcode' and 'safety_flag' aren't recognised as valid column names.
We're able to infer the valid column names from https://demyst.com/docs/demyst-live/types/.

Based on the column names found in the link listed above, we need to change the column name 'postcode' to 'post_code'.
'safety_flag' is also not recognised as a valid name, but we can leave it in the dataframe for now because that is our 'target' variable.
We will also need to fix the 'state' variables as there are invalid data points present as shown in the output above.

In [14]:
# Rename the CSV's 'postcode' header to the name the validator recognises,
# then validate the frame again. index=str also turns the row labels into strings.
column_renames = {"postcode": "post_code"}
df_copy.rename(columns=column_renames, index=str, inplace=True)
analytics.validate(df_copy)


Column,Status,Description
street,All Valid,All values in this column are good to go.
state,Some Invalid,16.8% of the values of this column failed validation. One example of an invalid value is 'NJ '. Click here for documentation for this column.
city,All Valid,All values in this column are good to go.
post_code,Some Invalid,0.1% of the values of this column failed validation. One example of an invalid value is 'nan'. Click here for documentation for this column.
safety_flag,Unrecognized Column Name,This column name is not supported. Click here for a list of all supported column names.


<!-- The column that has name 'safety_flag' has been inferred to be of type 'Boolean'. This is useful to know - even though we can't use this column for a lookup search for data enrichment, the DemystData api is at least able to infer the *type* of this target variable. -->
Now that we've renamed 'postcode' to a valid column name ('post_code'), the validator recognises this column and shows that a small proportion (0.1%) of its values are invalid, including 'nan' values.
Likewise, the 'state' variable contains invalid data, so we need to fix those invalid data points additionally.

### Clean data that has invalid inputs

There are NaNs present in the 'state', 'city' and 'post_code' variables (see the counts below), so we will drop the affected rows from the dataframe

In [17]:
df_copy.isna().sum()

street         0
state          9
city           9
post_code      9
safety_flag    0
dtype: int64

In [18]:
# Remember the shape, drop every row that has a NaN in *any* column (in place),
# then show the shape before and after.
shape_before = df_copy.shape
df_copy.dropna(inplace=True)
print(shape_before)
print(df_copy.shape)

(11229, 5)
(11219, 5)


We managed to remove 10 records from the input data after executing the 'dropna' function, in an 'inplace' manner

In [19]:
print(df_copy['state'].unique())

['FL' 'IN' 'VA' 'OH' 'MA' 'MN' 'WI' 'NC' 'CO' 'CA' 'NY' 'NH' 'CT' 'MD'
 'NJ' 'KY' 'WA' 'IA' 'LA' 'GA' 'IL' 'TX' 'AZ' 'DE' 'MI' 'RI' 'OR' 'AR'
 'HI' 'PA' 'SC' 'AL' 'TN' 'WY' 'NV' 'NE' 'MT' 'MS' 'OK' 'KS' 'MO' 'ID'
 'DC' 'UT' 'NM' 'ND' 'ME' 'VT' 'AK' 'SD' 'NJ  ' 'NY  ' 'CA  ' 'OR  '
 'GA  ' 'MI  ' 'AZ  ' 'TX  ' 'PA  ']


There is whitespace present in several of the data points, so we should remove them

In [20]:
# Trim leading/trailing whitespace from every state code (e.g. 'NJ  ' becomes 'NJ').
state_trimmed = df_copy['state'].str.strip()
df_copy['state'] = state_trimmed

There is an inconsistent schema for the 'city' variable as seen below ('VACKAVILLE' vs 'Lakeland')

In [22]:
df_copy['city'].unique()

array(['Lakeland', 'Indianapolis', 'Troutville', ..., 'VACKAVILLE',
       'San Juan Capistrano', 'White Pains'], dtype=object)

In [23]:
df_copy['city'].value_counts()

New York            245
Brooklyn            158
San Francisco       155
Los Angeles         138
Chicago              99
San Antonio          83
San Diego            83
Houston              82
San Jose             74
Bronx                65
Dallas               54
Miami                50
Philadelphia         43
Atlanta              43
Sacramento           42
Oakland              41
Orlando              37
Austin               37
Charlotte            34
Seattle              33
Phoenix              32
Jacksonville         31
El Paso              31
Las Vegas            30
Washington           27
Fort Worth           27
Long Beach           26
Pittsburgh           25
Portland             25
Tampa                25
                   ... 
Marina Del Rey        1
Addis                 1
Chestnut Hill         1
Lexington Park        1
PITTSBURGH            1
ROSEVILLE             1
Rehoboth              1
Dale                  1
East Windsor          1
Solvay                1
Westford        

Because the majority of city names conform to the schema with a capitalised first letter for each part of the name,
we need to transform other data points (such as 'long beach' and 'VACKAVILLE') to a schema like:
'Long Beach', and 'Vackaville'

In [24]:
# Title-case every city name so differently-cased spellings of the same city collapse into one value.
# NOTE(review): str.title() also lower-cases interior capitals (e.g. 'McLean' -> 'Mclean') — confirm that is acceptable.
df_copy['city'] = df_copy['city'].str.title()


In [25]:
df_copy['city'].value_counts()

New York              261
Brooklyn              167
San Francisco         158
Los Angeles           142
Chicago               104
San Diego              88
San Antonio            85
Houston                84
San Jose               76
Bronx                  73
Dallas                 54
Miami                  54
Sacramento             45
Philadelphia           43
Oakland                43
Atlanta                43
Austin                 39
Orlando                37
Charlotte              36
Las Vegas              35
Seattle                35
Jacksonville           33
Phoenix                32
El Paso                32
Washington             30
Long Beach             29
Fort Worth             29
Pittsburgh             26
Portland               26
Staten Island          26
                     ... 
Saint Peter             1
Cookeville              1
Summerspoint            1
Aumsville               1
Morrilton               1
Cazenovia               1
Union Beach             1
Bainbridge I

As seen above, the 'city' variable now contains data points that all conform to the same schema

Based on the 'postcode' output below, there are variable types of coding schemas present in the dataset.
We need to ensure that each postcode is of fixed length. 

In [26]:
df_copy['post_code'].unique()

array(['33812-5237', '46227', '24175-6054', ..., '14626', '92192',
       '92782'], dtype=object)

The majority of postcodes are 5 digits long, so let's ensure that *every* postcode is of that length

In [27]:
df_copy['post_code'].value_counts().head(6)

94107    25
94103    21
10001    20
10016    16
10003    14
10012    13
Name: post_code, dtype: int64

In [28]:
# we need to ensure we have a consistent schema for the postcodes
def transform_postcodes(postcode, length=5):
    '''Normalise one postcode string to exactly ``length`` characters.

    The function is meant to be passed to ``Series.apply``, which calls it once
    per element, so ``postcode`` is a single ``str`` (not a pandas Series).

    Parameters
    ----------
    postcode : str
        Raw postcode, e.g. ``'1234'``, ``'46227'`` or ``'33812-5237'``.
    length : int, default 5
        Number of characters every returned postcode should have.

    Returns
    -------
    str
        The first ``length`` characters of the stripped postcode, left-padded
        with zeros when it is shorter. An empty (or whitespace-only) input is
        returned as an empty string rather than being padded to all zeros.
    '''
    # Surrounding whitespace would otherwise count towards the length.
    postcode = postcode.strip()
    if not postcode:
        return postcode
    # Truncation drops ZIP+4 suffixes ('33812-5237' -> '33812'); zfill restores
    # leading zeros lost on short codes of any length ('501' -> '00501').
    return postcode[:length].zfill(length)

#df_copy.loc[:, 'post-code'] = df_copy['post-code'].astype('str').apply(transform_postcodes)

In [31]:
# Dry run: apply the normaliser to a string-cast copy of the column and keep the
# result in `a`; df_copy itself is not modified by this cell.
a = df_copy['post_code'].astype('str').apply(transform_postcodes)
# Frequency of each normalised post code.
a.value_counts()

94107    26
94103    22
10001    22
10016    17
10003    15
10012    13
11201    13
10019    13
11222    13
94110    12
10022    12
10013    12
94025    11
10018    11
90069    11
10014    11
94111    10
07024    10
92101    10
10010    10
94610    10
94114    10
94109    10
92660    10
11249    10
12866    10
92130     9
10940     9
11215     9
77449     9
         ..
75251     1
53718     1
08512     1
31320     1
07827     1
62930     1
90210     1
96717     1
29154     1
73096     1
61615     1
86322     1
64089     1
84009     1
81507     1
85224     1
67207     1
89509     1
30054     1
27703     1
81401     1
41129     1
73127     1
37026     1
62821     1
88766     1
14020     1
70774     1
94703     1
56011     1
Name: post_code, Length: 6597, dtype: int64

Now we're ready to apply the 'transform_postcodes' function to the 'post_code' variable

In [37]:
# Overwrite the column with its normalised form: cast to str, then transform element-wise.
post_codes_as_text = df_copy['post_code'].astype('str')
df_copy['post_code'] = post_codes_as_text.apply(transform_postcodes)


In [38]:
analytics.validate(df_copy)


Column,Status,Description
street,All Valid,All values in this column are good to go.
state,All Valid,All values in this column are good to go.
city,All Valid,All values in this column are good to go.
post_code,All Valid,All values in this column are good to go.
safety_flag,Unrecognized Column Name,This column name is not supported. Click here for a list of all supported column names.


Everything appears 'good to go' for data enrichment *however*, we also need to ensure that there aren't any duplicated data points present.    


In [42]:
# Number of rows that exactly repeat an earlier row across all columns.
int(df_copy.duplicated().sum())

21

21 duplicates were found, so let's drop them

In [43]:
# Remove the duplicated rows in place, keeping the first occurrence of each
# (the pandas default); the index is not reset afterwards.
df_copy.drop_duplicates(inplace=True)


Now we're ready to search for any data sets that we could use for enrichment for our input data

In [44]:
analytics.search(df_copy)


Unnamed: 0,post_code,country,city,state,street
Option 1,☒,☐,☒,☒,☒


The above output shows that 'attom_expanded_profile_report' could be used to enrich our input dataframe.
However, we don't have the column/attribute 'country' present in our data, so we need to include that to get a proper match.
We can infer that the 'country' column can be set to 'USA', because the output of df['state'].unique() shows that every two-letter code matches the ANSI/USPS abbreviation of an American state (or DC).
So let's add that column to the sanitised dataframe.


In [45]:
# Add a constant 'country' column, the one input the previous search showed as missing.
df_copy['country'] = 'USA'


In [46]:
analytics.search(df_copy)


Unnamed: 0,post_code,country,city,state,street
Option 1,☒,☒,☒,☒,☒


Now all the data matches!

In [220]:
# Enrich the cleaned frame with the 'attom_expanded_profile_report' provider, validating inputs first.
# NOTE(review): the outputs captured for results4 (11219 rows, post codes such as
# '33812-5237') do not match df_copy after the post-code truncation and the
# de-duplication cells — this cell looks to have been executed out of order;
# confirm with Restart Kernel -> Run All.
results4 = analytics.enrich_and_download(["attom_expanded_profile_report"], df_copy, validate=True)


Verifying providers...
Starting enrichment...
Enrich Job ID: 13179


IntProgress(value=1, max=2)

HTML(value='Checking status...')

In [224]:
results4.shape

(11219, 10)

In [225]:
analytics.search(results4, strict=False)

In [189]:
results4.head(3)

Unnamed: 0,inputs.city,inputs.country,inputs.post_code,inputs.safety_flag,inputs.state,inputs.street,attom_expanded_profile_report.row_id,attom_expanded_profile_report.client_id,attom_expanded_profile_report.attom_id,attom_expanded_profile_report.error
0,Lakeland,USA,33812-5237,False,FL,3160 Otto Dr,0,,162278089,
1,Indianapolis,USA,46227,False,IN,940 Tulip Dr,1,,28461481,
2,Troutville,USA,24175-6054,False,VA,68 Rocky Top Rd,2,,214628965,


In [98]:
# Comparison run: enrich the raw, uncleaned `df` (still has the 'postcode' header and
# no 'country' column) with validation switched off. The preview captured for
# results3 shows an 'insufficient_input' error on each displayed row.
results3 = analytics.enrich_and_download(["attom_expanded_profile_report"], df, validate=False)


Verifying providers...
Starting enrichment...
Enrich Job ID: 13164


IntProgress(value=1, max=2)

HTML(value='Checking status...')

In [133]:
results3.head(3)

Unnamed: 0,inputs.city,inputs.postcode,inputs.safety_flag,inputs.state,inputs.street,attom_expanded_profile_report.row_id,attom_expanded_profile_report.client_id,attom_expanded_profile_report.error
0,Lakeland,33812-5237,False,FL,3160 Otto Dr,0,,"type: insufficient_input , message: Inputs are..."
1,Indianapolis,46227,False,IN,940 Tulip Dr,1,,"type: insufficient_input , message: Inputs are..."
2,Troutville,24175-6054,False,VA,68 Rocky Top Rd,2,,"type: insufficient_input , message: Inputs are..."


In [102]:
df.shape

(11229, 5)

In [103]:
results3.shape

(11229, 8)

In [239]:
# NOTE(review): `a` was last assigned the pandas Series of normalised post codes in the
# dry-run cell, so this call presumably targets the wrong object — it likely should
# be `analytics.product_outputs(...)`; confirm against the Demyst client API.
a.product_outputs(["state"])


In [126]:
# NOTE(review): the error captured for this cell is "__init__() missing 1 required
# positional argument: 'config'" — presumably raised by Connectors() being called
# without arguments; confirm the constructor signature.
s = connectors.Connectors()
# NOTE(review): `inputs` is not defined in any visible cell — confirm where it should come from.
s.fetch('attom_expanded_profile_report', inputs = [inputs], sample_mode=False, config={'mode': 'sample', 'return_flattened_data': True, 
                                                 'return_raw_data': True, 'return_meta_fields': True})

__init__() missing 1 required positional argument: 'config'
