### Table of Contents 
##### 1. Importing libraries and reading data
##### 2. Exploratory Data Analysis 
##### 3. Data Cleaning and checking of coverage rates

### Importing Libraries and Reading data

In [1]:
#importing modules and reading data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
test = pd.read_csv('/users/johnstonkirimo/Projects/Zesty/data/test.csv')
all_addrs = pd.read_csv('/users/johnstonkirimo/Projects/Zesty/data/all_addresses.csv')

### Exploratory Data Analysis

In [2]:
#overview of df1 dataset

all_addrs.head()

Unnamed: 0,address,city,state,zip
0,941 Thorpe St,Rock Springs,WY,82901
1,2809 HARRIS DR,Antioch,CA,94509
2,1441 Eastlake Ave,Los Angeles,CA,90033
3,7 Eucalyptus,Newport Beach,CA,92657
4,1400 Lachman Ln,Los Angeles,CA,90272


In [3]:
#overview of df2 dataset
test.head()

Unnamed: 0,uid,address,city,state,zip
0,1,941 Thorpe St,Rock Springs,WY,82901.0
1,2,2809 HARRIS DR,Antioch,CA,94509.0
2,3,1441 Eastlake Ave,Los Angeles,CA,90033.0
3,4,7 ucayptus,Newport Beach,CA,92657.0
4,5,1400 Lachman Ln,,CA,90272.0


In [4]:
#number of rows and columns
all_addrs.shape

(130000, 4)

In [5]:
#number of rows and columns 
test.shape

(99249, 5)

In [6]:
#info on dataset
all_addrs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130000 entries, 0 to 129999
Data columns (total 4 columns):
address    129705 non-null object
city       129984 non-null object
state      129984 non-null object
zip        129984 non-null object
dtypes: object(4)
memory usage: 4.0+ MB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99249 entries, 0 to 99248
Data columns (total 5 columns):
uid        99249 non-null int64
address    99249 non-null object
city       97967 non-null object
state      91816 non-null object
zip        91694 non-null float64
dtypes: float64(1), int64(1), object(3)
memory usage: 3.8+ MB


In [8]:
#examinining specific columns : address 
all_addrs.groupby('address').size()

address
#NAME?                  504
- -                     167
- 10th Ave                2
- 110th St W              2
- 115th St E              2
                       ... 
Yaqui Pass Rd             4
Yerington Ave             1
Zephyr Rd                 1
Zinfandel Dr              1
undefined Marines Dr      1
Length: 96279, dtype: int64

In [9]:
#examinining specific columns : address (df2)
test.groupby('address').size()

address
#NAM?              4
#NAME?           206
#NAME? #11606      1
#NAME? #13850      1
#NAME? #18941      1
                ... 
w commons          1
walnut rd          1
wildomar           1
willis ave         1
x st               1
Length: 97121, dtype: int64

In [11]:
#examinining specific columns: top 5 cities
test.city.value_counts(dropna=False).head()

Los Angeles    5594
San Diego      2833
San Jose       1889
Sacramento     1704
Bakersfield    1291
Name: city, dtype: int64

In [12]:
#examinining percent distribution: top 5 states
test.state.value_counts(normalize = True, dropna=False)[:5]

CA     0.789056
NaN    0.074892
ca     0.066721
TX     0.008433
IL     0.006086
Name: state, dtype: float64

In [13]:
#checking for missing values
test.isnull().sum()

uid           0
address       0
city       1282
state      7433
zip        7555
dtype: int64

In [14]:
#checking for missing values - all_addrs
all_addrs.isnull().sum()

address    295
city        16
state       16
zip         16
dtype: int64

In [15]:
#examine city using groupby

all_addrs.groupby('city').size()

city
-               524
ACTON             2
ADELANTO          1
AGOURA HILLS      2
AHWAHNEE          1
               ... 
Zebulon           1
Zenia             1
Zephyr Cove       1
Zion              1
Zionsville        2
Length: 3799, dtype: int64

In [16]:
#examine state using groupby

all_addrs.groupby('state').size()

state
-                1
AK               7
AL              82
AR              57
AZ             314
Auckland         1
CA          120208
CHIH             2
CHIS             1
CO             715
CT             329
CÓRDOBA          1
DC               2
DE              10
FL             292
GA             170
GAUTENG          2
HI               6
IA              75
ID              84
IL             892
IN             117
KS             111
KY              94
LA              65
MA             202
MD              42
ME              35
MI              86
MN             134
MO              90
MS             115
MT              56
NC             545
ND              40
NE              52
NH              29
NJ              98
NM              73
NV             146
NY             875
OH             469
OK              94
ON               1
OR             131
PA             425
RI              20
SC              52
SD              26
TN             140
TX            1251
UT              56
VA    

In [17]:
#examine state using groupby

test.groupby('city').size()

city
ACTON              2
ADELANTO           1
AGOURA HILLS       1
AHWAHNEE           1
AIRWAY HEIGHTS     1
                  ..
ypsilanti          1
yreka              4
yuba city         13
yucaipa            8
yucca valley       5
Length: 4768, dtype: int64

In [18]:
#examine state using groupby

test.groupby('state').size()

state
AK        5
AL       53
AR       35
AZ      200
CA    78313
      ...  
ut        4
va        5
wa       42
wi        6
wy        4
Length: 96, dtype: int64

In [19]:
#check the unique values in the state column
test['state'].unique()

array(['WY', 'CA', 'NY', 'TX', 'TN', 'PA', 'MI', 'FL', 'GA', 'MO', 'AR',
       'CO', 'WA', 'OH', 'AZ', 'MA', 'IL', 'CT', 'SC', 'KY', 'NV', 'MN',
       'AL', 'NC', 'OK', 'WV', 'VA', 'UT', 'VT', 'ID', 'NJ', 'IN', 'MD',
       'SD', 'OR', 'NH', 'NM', 'IA', 'KS', 'LA', 'WI', 'ME', 'NE', 'MS',
       'MT', 'DE', 'ND', 'ca', 'ma', 'ia', 'nd', 'oh', 'id', 'il', 'HI',
       'tx', 'ut', 'pa', 'sd', 'ny', 'RI', 'co', 'wa', 'va', 'mn', 'az',
       'nc', 'tn', nan, 'ne', 'ky', 'nj', 'la', 'ct', 'ks', 'mo', 'md',
       'al', 'wi', 'mi', 'wy', 'AK', 'fl', 'in', 'DC', 'ri', 'or', 'nh',
       'hi', 'nm', 'mt', 'ms', 'ga', 'nv', 'ar', 'ok', 'me'], dtype=object)

#### Some Observations on the address column: 

- There are 504 records of incorrect inputs with the value '#NAME?' in the all_address dataframe
- There are 167 records, with the value '- -' in the all_address dataframe
- Several actual addresses with wrong inputs, starting with '-'
- The test dataframe has similar issues of missing,wrong or incorrectly formatted inputs in the test dataframe

#### Key Observations on the city and state columns:

- all_addrs dataframe has 524 records with missing values, with input '-'
- Missing values/incorrect value, '-'
- Incorrect inputs with names, e.g. GUATENG, instead of abbreviations
- Wrong info/abbreviations used, e.g. ON 
- Total of 59 unique values for state, instead of 50
- Lowercase, uppercase and missing values 

In [20]:
# Examining the zip column
all_addrs.zip.sample(10)

119464    91331
97699     95695
21687     10473
37526     91773
33599     92316
50857     95206
97021     95822
36007     95973
13473     92336
50497     98229
Name: zip, dtype: object

In [21]:
# Examining the zip column
test.zip.sample(10)

17593    94085.0
95762    90720.0
12204    92833.0
53043    95954.0
2622     90805.0
70449    91311.0
18707    91331.0
37986    92592.0
91622    90502.0
27617    93306.0
Name: zip, dtype: float64

In [22]:
#top 5 most common zip codes
all_addrs.zip.value_counts().head()

92008    456
93535    428
92026    411
95969    400
93536    388
Name: zip, dtype: int64

In [23]:
#top 5 most common zip codes
test.zip.value_counts().head()

92008.0    323
92026.0    282
95969.0    281
93535.0    256
93536.0    225
Name: zip, dtype: int64

In [24]:
#examine using groupy
all_addrs.groupby('zip').size()

zip
-        77
10003     1
10009     4
1001      1
10025     2
         ..
99507     1
99508     1
99517     1
99709     2
M9B       1
Length: 5685, dtype: int64

In [25]:
#examine using groupy
test.groupby('zip').size()

zip
1001.0     1
1020.0     3
1027.0     1
1028.0     2
1035.0     1
          ..
99504.0    1
99507.0    1
99508.0    1
99517.0    1
99709.0    1
Length: 5419, dtype: int64

### Data Cleaning 

#### Coverage Rate - the percentage of properties in a file that has correct addresses and matches up with a standard address database

#### To measure the impact of the data cleaning methods applied, let's compare the coverage rate before and after conducting the data cleaning process. We'll begin by checking the current coverage rate:

#### There are several ways of checking the address match between the two dataframes.
#### 1. Match on row level for each column

In [26]:
#convert data type of zip column in test df to string
test.zip.astype(str).apply(lambda x: x.replace('.0',''))

0        82901
1        94509
2        90033
3        92657
4        90272
         ...  
99244    85120
99245    90025
99246    91387
99247    60638
99248    92544
Name: zip, Length: 99249, dtype: object

In [27]:
#For each row in test df, check if there is a match in all_addrs df
is_match =  test[(test.address.isin(all_addrs.address)) & (test.city.isin(all_addrs.city))
                                 & (test.state.isin(all_addrs.state)) & (test.zip.isin(all_addrs.zip))]

In [28]:
match_rate = (len(is_match) * 100) /len(all_addrs)
print("The test dataset has a coverage rate of: {} percent ".format(match_rate)) 

The test dataset has a coverage rate of: 5.811538461538461 percent 


#### 2. Create a single 'full_address' column for each dataframe and compare the two columns 

In [32]:
#create full_address column on each df

all_addrs['full_address'] = all_addrs.address  + ', ' + all_addrs.city + ', ' + all_addrs.state + ' ' + all_addrs.zip
test['full_address'] = test.address + ', '+ test.city + ', ' + test.state + ' ' + test.zip.astype(str)

In [33]:
#let's view the change in test df
test.head(3)

Unnamed: 0,uid,address,city,state,zip,full_address
0,1,941 Thorpe St,Rock Springs,WY,82901.0,"941 Thorpe St, Rock Springs, WY 82901.0"
1,2,2809 HARRIS DR,Antioch,CA,94509.0,"2809 HARRIS DR, Antioch, CA 94509.0"
2,3,1441 Eastlake Ave,Los Angeles,CA,90033.0,"1441 Eastlake Ave, Los Angeles, CA 90033.0"


In [34]:
#let's view the change in all_addrs
all_addrs.head(3)

Unnamed: 0,address,city,state,zip,full_address
0,941 Thorpe St,Rock Springs,WY,82901,"941 Thorpe St, Rock Springs, WY 82901"
1,2809 HARRIS DR,Antioch,CA,94509,"2809 HARRIS DR, Antioch, CA 94509"
2,1441 Eastlake Ave,Los Angeles,CA,90033,"1441 Eastlake Ave, Los Angeles, CA 90033"


In [35]:
#change data type for zip column in test df 

test['zip'] = test.zip.astype(str).apply(lambda x: x.replace('.0',''))

In [36]:
test.head(3)

Unnamed: 0,uid,address,city,state,zip,full_address
0,1,941 Thorpe St,Rock Springs,WY,82901,"941 Thorpe St, Rock Springs, WY 82901.0"
1,2,2809 HARRIS DR,Antioch,CA,94509,"2809 HARRIS DR, Antioch, CA 94509.0"
2,3,1441 Eastlake Ave,Los Angeles,CA,90033,"1441 Eastlake Ave, Los Angeles, CA 90033.0"


In [37]:
#apply change to the full_address column of test df
test['full_address'] = test.address + ', '+ test.city + ', ' + test.state + ' ' + test.zip.astype(str)

In [34]:
test.head(3)

Unnamed: 0,uid,address,city,state,zip,full_address,full_addres
0,1,941 Thorpe St,Rock Springs,WY,82901,"941 Thorpe St, Rock Springs, WY 82901.0","941 Thorpe St, Rock Springs, WY 82901"
1,2,2809 HARRIS DR,Antioch,CA,94509,"2809 HARRIS DR, Antioch, CA 94509.0","2809 HARRIS DR, Antioch, CA 94509"
2,3,1441 Eastlake Ave,Los Angeles,CA,90033,"1441 Eastlake Ave, Los Angeles, CA 90033.0","1441 Eastlake Ave, Los Angeles, CA 90033"


In [38]:
#Now let's see how many full_addresses in test df are also in all_addrs df
is_match2 =  [i for i in test.full_address if i in all_addrs.full_address]

In [39]:
match_rate2 = (len(is_match2) * 100) /len(all_addrs)
print("The test dataset has a new coverage rate of: {} percent ".format(match_rate2)) 

The test dataset has a new coverage rate of: 0.0 percent 


In [37]:
#examine and try new method

In [40]:
test.dtypes

uid              int64
address         object
city            object
state           object
zip             object
full_address    object
dtype: object

In [41]:
all_addrs.dtypes

address         object
city            object
state           object
zip             object
full_address    object
dtype: object

In [42]:
#function to get the common addresses 

def common_elements(list1, list2):
    result = []
    for element in list1:
        if element in list2:
            result.append(element)
    return result

In [43]:
#get the full_address columns as lists 

test_lst = test['full_address'].to_list()
all_addrs_lst = all_addrs['full_address'].to_list()

In [44]:
#check the number of addresses in test df which are also found in all_addrs df
is_match3 = common_elements(test_lst, all_addrs_lst )

In [45]:
match_rate3 = (len(is_match3) * 100) /len(all_addrs)
match_rate3
#print("The test dataset has a new match rate of: {} percent ".format(match_rate3)) 

49.635384615384616

##### It appears using a function to get the common elements between the two lists produces a better match. However, the data still includes all the wrong and incorrectly formatted addreses. So let's start cleaning both datasets.

### Data Cleaning 

In [46]:
#Examine addresses that start with # or -
test[test['address'].str.startswith(('#', '-'))].head()

Unnamed: 0,uid,address,city,state,zip,full_address
76,78,#NAM?,Victorville,CA,92395,"#NAM?, Victorville, CA 92395"
531,537,#NAME?,Lancaster,CA,93535,"#NAME?, Lancaster, CA 93535"
776,784,#NME?,Redlands,CA,92374,"#NME?, Redlands, CA 92374"
1231,1245,#NAME?,,CA,94544,
1485,1502,#NAME?,Escondido,CA,92027,"#NAME?, Escondido, CA 92027"


In [47]:
#Examine addresses that start with # or -
all_addrs[all_addrs['address'].astype(str).str.startswith(('#', '-'))].head()

Unnamed: 0,address,city,state,zip,full_address
32,#NAME?,-,CA,93551,"#NAME?, -, CA 93551"
77,#NAME?,Victorville,CA,92395,"#NAME?, Victorville, CA 92395"
106,#NAME?,Hayward,CA,-,"#NAME?, Hayward, CA -"
529,- E Ave R 10,-,CA,93543,"- E Ave R 10, -, CA 93543"
536,#NAME?,Lancaster,CA,93535,"#NAME?, Lancaster, CA 93535"



#### We can see both datasets have the same issue of wrong or incorrectly formatted values
#### Since the full_address column is made of the first three columns, the same errors are also found in the full_address column 

In [48]:
#count addresses starting with '-' or '#'
is_dirty_all_addrs = all_addrs[all_addrs['address'].astype(str).str.startswith(('#', '-'))]
is_dirty_all_addrs.head()

Unnamed: 0,address,city,state,zip,full_address
32,#NAME?,-,CA,93551,"#NAME?, -, CA 93551"
77,#NAME?,Victorville,CA,92395,"#NAME?, Victorville, CA 92395"
106,#NAME?,Hayward,CA,-,"#NAME?, Hayward, CA -"
529,- E Ave R 10,-,CA,93543,"- E Ave R 10, -, CA 93543"
536,#NAME?,Lancaster,CA,93535,"#NAME?, Lancaster, CA 93535"


In [49]:
len(is_dirty_all_addrs)

792

In [50]:
#Repeat the above for test df
is_dirty_test = test[test['address'].str.startswith(('#', '-'))]
is_dirty_test.head()

Unnamed: 0,uid,address,city,state,zip,full_address
76,78,#NAM?,Victorville,CA,92395,"#NAM?, Victorville, CA 92395"
531,537,#NAME?,Lancaster,CA,93535,"#NAME?, Lancaster, CA 93535"
776,784,#NME?,Redlands,CA,92374,"#NME?, Redlands, CA 92374"
1231,1245,#NAME?,,CA,94544,
1485,1502,#NAME?,Escondido,CA,92027,"#NAME?, Escondido, CA 92027"


In [51]:
len(is_dirty_test)

278

##### We can see that 792 rows in the all_addrs df has wrong/incorrectly formatted addresses while the test df has 278

##### However, there are more missing and incorrect/wrongly formatted values in the other columns as well. Some of the issues we previously found out about the city and state columns include:

- All_addrs dataframe has 524 records with missing values, with input '-'
- Missing values/incorrect value, '-'
- Incorrect inputs with names, e.g. GUATENG, instead of abbreviations
- Wrong info/abbreviations used, e.g. ON
- Total of 59 unique values for state, instead of 50
- Lowercase, uppercase and missing values

In [52]:
#examine where city is '-'
all_addrs[all_addrs['city'] == '-'] [['city','full_address']].head()

Unnamed: 0,city,full_address
32,-,"#NAME?, -, CA 93551"
355,-,"10734 Ivoryton Way, -, CA 95655"
529,-,"- E Ave R 10, -, CA 93543"
637,-,"17503 Dove Willow St, -, CA 91387"
810,-,"30056 Calle Cerritos, -, CA 91351"


In [53]:
#examine where city is '-'
test[test['city'] == '-'] [['city','full_address']].head()

Unnamed: 0,city,full_address


In [54]:
#any null valaues in city and state and zip columns?

test[['address','city','state','zip']].isna().sum()

address       0
city       1282
state      7433
zip           0
dtype: int64

In [55]:
all_addrs[['address','city','state','zip']].isna().sum()

address    295
city        16
state       16
zip         16
dtype: int64

#### Let's get the subset we don't want and remove it, i.e:
- where the address starts with '#', '-' or is null
- where the city or state is '-'
- where the city, state or zip value is null

In [56]:
#subset to be dropped from the all_addrs df
unwanted_all_addrs = all_addrs[all_addrs['address'].astype(str).str.startswith(('#', '-')) | (all_addrs['address'].isna())|
                              (all_addrs['city'].isna()) | (all_addrs['city'] == '-') | (all_addrs['state'].isna()) | (all_addrs['state'] == '-')|
                               (all_addrs['zip'].isna()) | (all_addrs['zip'] == '-')]
unwanted_all_addrs.head(3)

Unnamed: 0,address,city,state,zip,full_address
32,#NAME?,-,CA,93551,"#NAME?, -, CA 93551"
77,#NAME?,Victorville,CA,92395,"#NAME?, Victorville, CA 92395"
106,#NAME?,Hayward,CA,-,"#NAME?, Hayward, CA -"


In [57]:
#Now let's get the subset to be dropped from the test df

unwanted_test = test[test['address'].astype(str).str.startswith(('#', '-')) | (test['address'].isna())|
                              (test['city'].isna()) | (test['city'] == '-') | (test['state'].isna()) | (test['state'] == '-')|
                               (test['zip'].isna()) | (test['zip'] == '-')]
unwanted_test.head(3)

Unnamed: 0,uid,address,city,state,zip,full_address
4,5,1400 Lachman Ln,,CA,90272,
14,15,16404 Parthenia St,,CA,91343,
24,25,6313 Paso Los Cerritos,,CA,95120,


In [58]:
#Let's check the length of the 'unwanted' for each dataset
print("There are {} unwanted dirty rows in the all_addrs dataset".format(len(unwanted_all_addrs)))
print("There are {} unwanted dirty rows in the test dataset".format(len(unwanted_test)))

There are 1350 unwanted dirty rows in the all_addrs dataset
There are 8963 unwanted dirty rows in the test dataset


In [59]:
#create a new column 'clean' to identify distinguish 'dirty' vs 'clean'records in the all_addr dataset

all_addrs['clean'] = np.where(all_addrs['address'].astype(str).str.startswith(('#', '-')) | (all_addrs['address'].isna())|
                              (all_addrs['city'].isna()) | (all_addrs['city'] == '-') | (all_addrs['state'].isna()) | (all_addrs['state'] == '-')|
                               (all_addrs['zip'].isna()) | (all_addrs['zip'] == '-'),0,1) 

In [60]:
all_addrs.groupby('clean').size()

clean
0      1350
1    128650
dtype: int64

In [61]:
#distribution

all_addrs['clean'].value_counts(normalize=True)

1    0.989615
0    0.010385
Name: clean, dtype: float64

In [62]:
#create a new column 'clean' to identify & distinguish 'dirty' vs 'clean'records in the test dataset

test['clean'] = np.where(test['address'].astype(str).str.startswith(('#', '-')) | (test['address'].isna())|
                              (test['city'].isna()) | (test['city'] == '-') | (test['state'].isna()) | (test['state'] == '-')|
                               (test['zip'].isna()) | (test['zip'] == '-') | test['zip'].str.contains('nan'),0,1) 

In [63]:
test.groupby('clean').size()

clean
0    16496
1    82753
dtype: int64

In [64]:
#check distribution of 'clean' vs 'dirty' records 
test['clean'].value_counts(normalize=True)

1    0.833792
0    0.166208
Name: clean, dtype: float64

##### Looking at the distribution, it's clear that 16.6% of the test dataset are rows in the 'unwanted/dirty' criteria

In [65]:
#Now let's get the cleaned version of the test df 
is_clean_test = test.loc[test.clean == 1,:]
is_clean_test.head()

Unnamed: 0,uid,address,city,state,zip,full_address,clean
0,1,941 Thorpe St,Rock Springs,WY,82901,"941 Thorpe St, Rock Springs, WY 82901",1
1,2,2809 HARRIS DR,Antioch,CA,94509,"2809 HARRIS DR, Antioch, CA 94509",1
2,3,1441 Eastlake Ave,Los Angeles,CA,90033,"1441 Eastlake Ave, Los Angeles, CA 90033",1
3,4,7 ucayptus,Newport Beach,CA,92657,"7 ucayptus, Newport Beach, CA 92657",1
5,6,725 ounain View St,Altadena,CA,91001,"725 ounain View St, Altadena, CA 91001",1


In [66]:
#Now let's get the cleaned version of the all_addrs df 
is_clean_all_addrs = all_addrs.loc[all_addrs.clean == 1,:]
is_clean_all_addrs.head()

Unnamed: 0,address,city,state,zip,full_address,clean
0,941 Thorpe St,Rock Springs,WY,82901,"941 Thorpe St, Rock Springs, WY 82901",1
1,2809 HARRIS DR,Antioch,CA,94509,"2809 HARRIS DR, Antioch, CA 94509",1
2,1441 Eastlake Ave,Los Angeles,CA,90033,"1441 Eastlake Ave, Los Angeles, CA 90033",1
3,7 Eucalyptus,Newport Beach,CA,92657,"7 Eucalyptus, Newport Beach, CA 92657",1
4,1400 Lachman Ln,Los Angeles,CA,90272,"1400 Lachman Ln, Los Angeles, CA 90272",1


In [67]:
#Get the full_address columns as lists for each of the 'cleaned' datasets

is_clean_test_lst = is_clean_test['full_address'].to_list()
is_clean_all_addrs_lst = is_clean_all_addrs['full_address'].to_list()

In [68]:
#check the new number of addresses in is_clean_test df which are also found in is_clean_all_addrs df
is_match4 = common_elements(is_clean_test_lst, is_clean_all_addrs_lst)

In [69]:
match_rate4 = len(is_match4) * 100 /len(is_clean_all_addrs_lst)
match_rate4

43.251457442673924

In [70]:
#try another function using list comprehension

def same_elements(first_list, second_list):
    return [element for element in first_lst if element in second_lst]

In [72]:
first_lst = is_clean_test['full_address'].to_list()
second_lst = is_clean_all_addrs['full_address'].to_list()

In [73]:
is_same_clean  = same_elements(is_clean_test_lst,is_clean_all_addrs_lst)

coverage_rate = len(is_same_clean) * 100 /len(is_clean_all_addrs)

In [74]:
coverage_rate

43.251457442673924

#### After trying two different functions with the cleaned datasets, the coverage rate is now 43%. 
#### let's examine the datasets a bit more to check for opportunities of improvement

In [75]:
is_clean_test.head()

Unnamed: 0,uid,address,city,state,zip,full_address,clean
0,1,941 Thorpe St,Rock Springs,WY,82901,"941 Thorpe St, Rock Springs, WY 82901",1
1,2,2809 HARRIS DR,Antioch,CA,94509,"2809 HARRIS DR, Antioch, CA 94509",1
2,3,1441 Eastlake Ave,Los Angeles,CA,90033,"1441 Eastlake Ave, Los Angeles, CA 90033",1
3,4,7 ucayptus,Newport Beach,CA,92657,"7 ucayptus, Newport Beach, CA 92657",1
5,6,725 ounain View St,Altadena,CA,91001,"725 ounain View St, Altadena, CA 91001",1


In [76]:
is_clean_test.shape

(82753, 7)

In [77]:
#xamine unique states in cleaned all_addrs df 
is_clean_all_addrs.state.unique()

array(['WY', 'CA', 'NY', 'TX', 'TN', 'PA', 'MI', 'FL', 'GA', 'MO', 'AR',
       'CO', 'WA', 'OH', 'AZ', 'MA', 'IL', 'CT', 'SC', 'KY', 'NV', 'MN',
       'AL', 'NC', 'OK', 'WV', 'VA', 'UT', 'VT', 'ID', 'NJ', 'IN', 'MD',
       'SD', 'OR', 'NH', 'NM', 'IA', 'KS', 'LA', 'WI', 'ME', 'NE', 'MS',
       'MT', 'CHIS', 'DE', 'ND', 'CÓRDOBA', 'HI', 'RI', 'VER', 'AK', 'DC',
       'GAUTENG', 'CHIH', 'ON'], dtype=object)

In [78]:
#examine unique states in cleaned test df
is_clean_test.state.unique()

array(['WY', 'CA', 'NY', 'TX', 'TN', 'PA', 'MI', 'FL', 'GA', 'MO', 'AR',
       'CO', 'WA', 'OH', 'AZ', 'MA', 'IL', 'CT', 'SC', 'KY', 'NV', 'MN',
       'AL', 'NC', 'OK', 'WV', 'VA', 'UT', 'VT', 'ID', 'NJ', 'IN', 'MD',
       'SD', 'OR', 'NH', 'NM', 'IA', 'KS', 'LA', 'WI', 'ME', 'NE', 'MS',
       'MT', 'DE', 'ND', 'ca', 'ma', 'ia', 'nd', 'oh', 'id', 'il', 'HI',
       'tx', 'ut', 'pa', 'sd', 'ny', 'co', 'wa', 'va', 'mn', 'az', 'nc',
       'tn', 'ne', 'ky', 'nj', 'la', 'ct', 'ks', 'RI', 'mo', 'md', 'al',
       'wi', 'mi', 'wy', 'AK', 'fl', 'in', 'DC', 'ri', 'or', 'nh', 'hi',
       'nm', 'mt', 'ms', 'ga', 'nv', 'ar', 'ok', 'me'], dtype=object)

##### Here we observe that the is_clean_test dataset has uppercase and lowercase values in the state column - 
##### this is an opportunity to make some changes and increase the coverage rate

In [79]:
#remove records where given state name is not a correct US state

is_clean_all_addrs = is_clean_all_addrs[~is_clean_all_addrs.state.isin(['GAUTENG','CÓRDOBA','CHIH','CHIS','ON'])]
is_clean_all_addrs.shape

(128643, 6)

In [80]:
#make the full address column uppercase

is_clean_test['full_address'] = is_clean_test['full_address'].str.upper()
is_clean_all_addrs['full_address'] = is_clean_all_addrs['full_address'].str.upper()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [81]:
lst_all = is_clean_all_addrs['full_address'].to_list()
lst_tst = is_clean_test['full_address'].to_list()
new_covrg = [x for x in lst_tst if x in lst_all]
cov_rate = len(new_covrg) * 100 /len(lst_all)
print("The coverage rate is now {} percent".format(cov_rate))

The coverage rate is now 48.77062879441555 percent


#### New coverage rate is 48.8%

In [82]:
#Now let's remove duplicates from both lists by using a set()

all_addrs_set = set(lst_all)
test_set = set(lst_tst)

In [83]:
#find common elements in the two sets and print new coverage rate

set_covrg = [x for x in test_set if x in all_addrs_set]
set_covrg_rate = len(set_covrg) * 100 /len(all_addrs_set)

In [84]:
set_covrg_rate

63.8409493077964

#### Coverage rate now at 63.8%