In [1]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame

t0 = time.time()

# 1. Installation `pyspark`

In [2]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
[K     |████████████████████████████████| 204.2 MB 29 kB/s s eta 0:00:01
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 50.3 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612244 sha256=f67485ccbd26a80bf545f6a07384fa005145a9d24fb1086824687e68a883486e
  Stored in directory: /root/.cache/pip/wheels/5e/34/fa/b37b5cef503fc5148b478b2495043ba61b079120b7ff379f9b
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1
Note: you may need to restart the kernel to use updated packages.


#### Time-estimated

In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
print('Installation takes %s seconds'%(time.time() - t0))

Installation takes 44.35756754875183 seconds


#### Build-in a spark-session

In [4]:
spark = SparkSession.builder \
                    .master("local") \
                    .appName("Word Count") \
                    .config("spark.some.config.option", "some-value") \
                    .getOrCreate()
spark

In [5]:
sc = spark.sparkContext
sqlContext = SQLContext(sc)

#### Loading `csv.file` with `pandas`

In [6]:
path_file = r'../input/big-data-vers-1/visiting.csv'
vst_df = pd.read_csv(path_file)
vst_df.head()

Unnamed: 0,utc_timestamp,local_timestamp,ad_id,id_type,location_name,top_category,sub_category,street_address,city,state,naics_code,brands,zip_code,minimum_dwell,safegraph_place_id,geohash_5,census_block_group
0,1598441109,1598415909,354cec9d-eb94-4522-bc35-dc14e9d910b2,idfa,Sandrock Canyon Os Park,"Museums, Historical Sites, and Similar Institu...",Nature Parks and Other Similar Institutions,sandrock canyon os park,san diego,ca,712190.0,,92108.0,202.0,sg:2b7ce679bc5a4b8fb1d58a850bbacf76,9mudw,60730090000.0
1,1598400002,1598385602,20e1e1d5-0642-4fe1-9718-94decebe2b3f,aaid,home,,,,,ohio,,,44090.0,1328.0,home,dpmd6,390930900000.0
2,1598410516,1598392516,dc7dece9-4d17-4fa5-9290-6602830e0a0c,aaid,home,,,,palatine,illinois,,,60067.0,476.0,home,dp3rs,170318000000.0
3,1598467918,1598453518,a644d089-1227-4d8e-9127-35ed1b04ed1f,aaid,A Moment In Time Photo Booths,"Other Professional, Scientific, and Technical ...","Photography Studios, Portrait",360 mcclellan ave,hamilton,nj,541921.0,,8610.0,4.0,sg:ff78dda4533e4da992b257b94c04b4af,dr4ue,340210000000.0
4,1598465770,1598440570,4f7418d9-578c-4f03-b1fb-5e8f63bf2599,aaid,QFC (Quality Food Centers),Grocery Stores,Supermarkets and Other Grocery (except Conveni...,22803 44th ave w,mountlake terrace,wa,445110.0,QFC (Quality Food Centers),98043.0,18.0,sg:0096ae4959f849a186471eb96aef0cbf,c23pc,530610500000.0


#### Loading `csv.data` to `spark` & viewing by `pandas`

In [7]:
visiting_df = spark.read.format("csv").option("header", "true").load(path_file)
visiting_df.toPandas().head()

Unnamed: 0,utc_timestamp,local_timestamp,ad_id,id_type,location_name,top_category,sub_category,street_address,city,state,naics_code,brands,zip_code,minimum_dwell,safegraph_place_id,geohash_5,census_block_group
0,1598441109,1598415909,354cec9d-eb94-4522-bc35-dc14e9d910b2,idfa,Sandrock Canyon Os Park,"Museums, Historical Sites, and Similar Institu...",Nature Parks and Other Similar Institutions,sandrock canyon os park,san diego,ca,712190.0,,92108,202.0,sg:2b7ce679bc5a4b8fb1d58a850bbacf76,9mudw,60730093012
1,1598400002,1598385602,20e1e1d5-0642-4fe1-9718-94decebe2b3f,aaid,home,,,,,ohio,,,44090,1328.0,home,dpmd6,390930941004
2,1598410516,1598392516,dc7dece9-4d17-4fa5-9290-6602830e0a0c,aaid,home,,,,palatine,illinois,,,60067,476.0,home,dp3rs,170318036103
3,1598467918,1598453518,a644d089-1227-4d8e-9127-35ed1b04ed1f,aaid,A Moment In Time Photo Booths,"Other Professional, Scientific, and Technical ...","Photography Studios, Portrait",360 mcclellan ave,hamilton,nj,541921.0,,8610,4.0,sg:ff78dda4533e4da992b257b94c04b4af,dr4ue,340210025004
4,1598465770,1598440570,4f7418d9-578c-4f03-b1fb-5e8f63bf2599,aaid,QFC (Quality Food Centers),Grocery Stores,Supermarkets and Other Grocery (except Conveni...,22803 44th ave w,mountlake terrace,wa,445110.0,QFC (Quality Food Centers),98043,18.0,sg:0096ae4959f849a186471eb96aef0cbf,c23pc,530610513003


#### Register the `loaded-dataframe` as table in `SQL`

In [8]:
t0 = time.time()
SQLContext.registerDataFrameAsTable(sc, df = visiting_df, tableName = 'visiting')
print('Attach table "movement" takes %s seconds'%(time.time() - t0))

Attach table "movement" takes 0.24444007873535156 seconds


# 2. EDA.`table: visiting`
## 2.1 Descriptive-statistic information overview.

### 2.1.1. Column-types & counting values: `NA`, `unique`,

In [9]:
count = []
for col in vst_df.columns:
    count.append(len(vst_df[col].unique()))

basic_count = pd.DataFrame({'count_non_null': np.array(len(vst_df) - vst_df.isnull().sum().values), 
                              'NA_percentage(%)': np.round(100*vst_df.isnull().sum().values / len(vst_df), 2),
                             'dtypes': vst_df.dtypes,
                             'count_distinct': count},
                             index = vst_df.columns)
basic_count

Unnamed: 0,count_non_null,NA_percentage(%),dtypes,count_distinct
utc_timestamp,1000000,0.0,int64,133800
local_timestamp,1000000,0.0,int64,142424
ad_id,1000000,0.0,object,346495
id_type,1000000,0.0,object,2
location_name,1000000,0.0,object,164723
top_category,471806,52.82,object,161
sub_category,470750,52.92,object,308
street_address,472595,52.74,object,303198
city,816793,18.32,object,16772
state,957381,4.26,object,114


### 2.1.2. Statistically-descripted information
#### For the numeric data-types

In [10]:
vst_df.describe()

Unnamed: 0,utc_timestamp,local_timestamp,naics_code,zip_code,minimum_dwell,census_block_group
count,1000000.0,1000000.0,471813.0,957205.0,1000000.0,957381.0
mean,1598453000.0,1598435000.0,594339.698739,50779.89679,316.613161,284876500000.0
std,42178.52,42536.34,144305.168279,27241.714194,407.02461,160260800000.0
min,1598400000.0,1598363000.0,3231.0,0.0,0.0,10010200000.0
25%,1598407000.0,1598390000.0,447110.0,30067.0,20.0,130890200000.0
50%,1598457000.0,1598440000.0,711310.0,47396.0,101.0,281499500000.0
75%,1598477000.0,1598460000.0,722511.0,75791.0,552.0,420770100000.0
max,1598573000.0,1598608000.0,928120.0,99929.0,1440.0,780309600000.0


#### For the text-type; find the highest frequency

In [11]:
from collections import Counter 

## Filter the column_names which types be object
object_cols = vst_df.columns[vst_df.dtypes == 'object']

## Initialize list contain values & frequencies of the most_common_value at each column
val_most_common = []
count_most_common = []

## Loop over object-columns
for col in object_cols:
    test_list = vst_df[col]
    test_list = Counter(test_list)
    val_most_common.append(test_list.most_common(1)[0][0])  
    count_most_common.append(test_list.most_common(1)[0][1])

## Viewing
test = pd.DataFrame({'value_most_common': val_most_common,
             'freq_most_common': count_most_common},
            index = object_cols).T

test

Unnamed: 0,ad_id,id_type,location_name,top_category,sub_category,street_address,city,state,brands,safegraph_place_id,geohash_5
value_most_common,e3f2e5ea-0ab7-43e1-90af-7ffc67adedd2,idfa,home,,,,,tx,,home,
freq_most_common,46,517531,484784,528194.0,529250.0,527405.0,183207.0,51689,783743.0,484784,42619.0


or you can viewing by this command!

In [12]:
visiting_df.toPandas().describe()

Unnamed: 0,utc_timestamp,local_timestamp,ad_id,id_type,location_name,top_category,sub_category,street_address,city,state,naics_code,brands,zip_code,minimum_dwell,safegraph_place_id,geohash_5,census_block_group
count,1000000,1000000,1000000,1000000,1000000,1000000.0,1000000.0,1000000.0,1000000.0,1000000,1000000.0,216267,1000000.0,1000000.0,1000000,1000000.0,1000000.0
unique,133800,142424,346495,2,164723,161.0,308.0,303199.0,16772.0,114,314.0,4105,25882.0,1441.0,319077,70859.0,169570.0
top,1598400002,1598385602,e3f2e5ea-0ab7-43e1-90af-7ffc67adedd2,idfa,home,,,,,tx,,Walmart,,5.0,home,,
freq,498,245,46,517531,484784,528194.0,529250.0,527403.0,183207.0,51689,528187.0,15150,42795.0,38362.0,484784,42619.0,42619.0


#### Wraping all up-together

In [13]:
def wrap_data(dataframe):
    count = []
    for col in dataframe.columns:
        count.append(len(vst_df[col].unique()))
    left_df = pd.DataFrame({'count_non_null': np.array(len(dataframe) - vst_df.isnull().sum().values), 
                                  'NA_percentage(%)': np.round(100*dataframe.isnull().sum().values / len(dataframe), 2),
                                 'dtypes': vst_df.dtypes,
                                 'count_distinct': count},
                                 index = dataframe.columns)
    mid_DF = []
    for col_type in ['object', 'datetime64[ns]']:
        object_cols = dataframe.columns[dataframe.dtypes == col_type]
        val_most_common = []
        count_most_common = []

        for col in object_cols:
            test_list = dataframe[col]
            test_list = Counter(test_list)
            val_most_common.append(test_list.most_common(1)[0][0])  
            count_most_common.append(test_list.most_common(1)[0][1])

        test = pd.DataFrame({'value_most_common': val_most_common,
                             'freq_most_common': count_most_common},
                            index = object_cols).T
        mid_DF.append(test)
    mid_df = mid_DF[0].join(mid_DF[1], how = 'outer')
    df = left_df.join(pd.concat([dataframe.describe().drop('count'),
                                 mid_df.fillna('null')]).fillna('no_info').T,
                      how = 'outer').T
    return df

In [14]:
wrap_data(vst_df)

Unnamed: 0,ad_id,brands,census_block_group,city,geohash_5,id_type,local_timestamp,location_name,minimum_dwell,naics_code,safegraph_place_id,state,street_address,sub_category,top_category,utc_timestamp,zip_code
count_non_null,1000000,216257,957381,816793,957381,1000000,1000000,1000000,1000000,471813,1000000,957381,472595,470750,471806,1000000,957205
NA_percentage(%),0,78.37,4.26,18.32,4.26,0,0,0,0,52.82,0,4.26,52.74,52.92,52.82,0,4.28
dtypes,object,object,float64,object,object,object,int64,object,float64,float64,object,object,object,object,object,int64,float64
count_distinct,346495,4105,169570,16772,70859,2,142424,164723,1441,314,319077,114,303198,308,161,133800,25882
mean,no_info,no_info,2.84876e+11,no_info,no_info,no_info,1.59844e+09,no_info,316.613,594340,no_info,no_info,no_info,no_info,no_info,1.59845e+09,50779.9
std,no_info,no_info,1.60261e+11,no_info,no_info,no_info,42536.3,no_info,407.025,144305,no_info,no_info,no_info,no_info,no_info,42178.5,27241.7
min,no_info,no_info,1.00102e+10,no_info,no_info,no_info,1.59836e+09,no_info,0,3231,no_info,no_info,no_info,no_info,no_info,1.5984e+09,0
25%,no_info,no_info,1.3089e+11,no_info,no_info,no_info,1.59839e+09,no_info,20,447110,no_info,no_info,no_info,no_info,no_info,1.59841e+09,30067
50%,no_info,no_info,2.815e+11,no_info,no_info,no_info,1.59844e+09,no_info,101,711310,no_info,no_info,no_info,no_info,no_info,1.59846e+09,47396
75%,no_info,no_info,4.2077e+11,no_info,no_info,no_info,1.59846e+09,no_info,552,722511,no_info,no_info,no_info,no_info,no_info,1.59848e+09,75791


## 2.2. Cleaning data
### 2.1. Fix structure error

### 2.1.1. Convert `utc_timestamp` & `local_timestamp` to `datetime`

In [15]:
import datetime
vst_df['utc_timestamp'] = vst_df['utc_timestamp'].apply(lambda x: datetime.datetime.fromtimestamp(x))
vst_df['local_timestamp'] = vst_df['local_timestamp'].apply(lambda x: datetime.datetime.fromtimestamp(x))
vst_df.head(2)

Unnamed: 0,utc_timestamp,local_timestamp,ad_id,id_type,location_name,top_category,sub_category,street_address,city,state,naics_code,brands,zip_code,minimum_dwell,safegraph_place_id,geohash_5,census_block_group
0,2020-08-26 11:25:09,2020-08-26 04:25:09,354cec9d-eb94-4522-bc35-dc14e9d910b2,idfa,Sandrock Canyon Os Park,"Museums, Historical Sites, and Similar Institu...",Nature Parks and Other Similar Institutions,sandrock canyon os park,san diego,ca,712190.0,,92108.0,202.0,sg:2b7ce679bc5a4b8fb1d58a850bbacf76,9mudw,60730090000.0
1,2020-08-26 00:00:02,2020-08-25 20:00:02,20e1e1d5-0642-4fe1-9718-94decebe2b3f,aaid,home,,,,,ohio,,,44090.0,1328.0,home,dpmd6,390930900000.0


Verify

In [16]:
wrap_data(vst_df)

Unnamed: 0,ad_id,brands,census_block_group,city,geohash_5,id_type,local_timestamp,location_name,minimum_dwell,naics_code,safegraph_place_id,state,street_address,sub_category,top_category,utc_timestamp,zip_code
count_non_null,1000000,216257,957381,816793,957381,1000000,1000000,1000000,1000000,471813,1000000,957381,472595,470750,471806,1000000,957205
NA_percentage(%),0,78.37,4.26,18.32,4.26,0,0,0,0,52.82,0,4.26,52.74,52.92,52.82,0,4.28
dtypes,object,object,float64,object,object,object,datetime64[ns],object,float64,float64,object,object,object,object,object,datetime64[ns],float64
count_distinct,346495,4105,169570,16772,70859,2,142424,164723,1441,314,319077,114,303198,308,161,133800,25882
mean,no_info,no_info,2.84876e+11,no_info,no_info,no_info,no_info,no_info,316.613,594340,no_info,no_info,no_info,no_info,no_info,no_info,50779.9
std,no_info,no_info,1.60261e+11,no_info,no_info,no_info,no_info,no_info,407.025,144305,no_info,no_info,no_info,no_info,no_info,no_info,27241.7
min,no_info,no_info,1.00102e+10,no_info,no_info,no_info,no_info,no_info,0,3231,no_info,no_info,no_info,no_info,no_info,no_info,0
25%,no_info,no_info,1.3089e+11,no_info,no_info,no_info,no_info,no_info,20,447110,no_info,no_info,no_info,no_info,no_info,no_info,30067
50%,no_info,no_info,2.815e+11,no_info,no_info,no_info,no_info,no_info,101,711310,no_info,no_info,no_info,no_info,no_info,no_info,47396
75%,no_info,no_info,4.2077e+11,no_info,no_info,no_info,no_info,no_info,552,722511,no_info,no_info,no_info,no_info,no_info,no_info,75791


$\qquad \Rightarrow$ We have finished the converting-structure of `utc_timestamp` & `local_timestamp`, now we will look at the columns of `id`.

### 2.1.2. For `text-type`
#### (i). For `ad_id` and `id_type`

For all the `text-type-data` likes `ad_id, id_type, city, state, ...`, we must lowercase all of them.

In [17]:
obj_cols = vst_df.columns[vst_df.dtypes == 'object']
for col in obj_cols:
    vst_df[col] = vst_df[vst_df[col].notnull()][col].apply(lambda x: x.lower())

vst_df.head(2)

Unnamed: 0,utc_timestamp,local_timestamp,ad_id,id_type,location_name,top_category,sub_category,street_address,city,state,naics_code,brands,zip_code,minimum_dwell,safegraph_place_id,geohash_5,census_block_group
0,2020-08-26 11:25:09,2020-08-26 04:25:09,354cec9d-eb94-4522-bc35-dc14e9d910b2,idfa,sandrock canyon os park,"museums, historical sites, and similar institu...",nature parks and other similar institutions,sandrock canyon os park,san diego,ca,712190.0,,92108.0,202.0,sg:2b7ce679bc5a4b8fb1d58a850bbacf76,9mudw,60730090000.0
1,2020-08-26 00:00:02,2020-08-25 20:00:02,20e1e1d5-0642-4fe1-9718-94decebe2b3f,aaid,home,,,,,ohio,,,44090.0,1328.0,home,dpmd6,390930900000.0


Now, checking the consitency of the column `id-type`.

In [18]:
vst_df.groupby('id_type').count()[['ad_id']]

Unnamed: 0_level_0,ad_id
id_type,Unnamed: 1_level_1
aaid,482469
idfa,517531


$\qquad \Rightarrow$ So the structure of the column `id_type` is consistent of 2 `groups: "aaid"` and `"idfa"` and there is nothing to fix the structure at this column. 

Next, we will verify the `ad_id`, [Reference](https://en.wikipedia.org/wiki/Ad-ID)

In [19]:
ad_id_df = pd.DataFrame({'ad_id': vst_df['ad_id']})
ad_id_df['len(ad_id)'] = vst_df['ad_id'].apply(lambda x: len(x))
ad_id_df['all_split_-?'] = vst_df['ad_id'].apply(lambda x: vst_df['ad_id'][1].count('-') == 4)
ad_id_df['len(term1)'] = vst_df['ad_id'].apply(lambda x: len(x.split('-')[0]))
ad_id_df['len(term2)'] = vst_df['ad_id'].apply(lambda x: len(x.split('-')[1]))
ad_id_df['len(term3)'] = vst_df['ad_id'].apply(lambda x: len(x.split('-')[2]))
ad_id_df['len(term4)'] = vst_df['ad_id'].apply(lambda x: len(x.split('-')[3]))
ad_id_df['len(term5)'] = vst_df['ad_id'].apply(lambda x: len(x.split('-')[4]))
ad_id_df.head()

Unnamed: 0,ad_id,len(ad_id),all_split_-?,len(term1),len(term2),len(term3),len(term4),len(term5)
0,354cec9d-eb94-4522-bc35-dc14e9d910b2,36,True,8,4,4,4,12
1,20e1e1d5-0642-4fe1-9718-94decebe2b3f,36,True,8,4,4,4,12
2,dc7dece9-4d17-4fa5-9290-6602830e0a0c,36,True,8,4,4,4,12
3,a644d089-1227-4d8e-9127-35ed1b04ed1f,36,True,8,4,4,4,12
4,4f7418d9-578c-4f03-b1fb-5e8f63bf2599,36,True,8,4,4,4,12


Now, checking this result.

In [20]:
ad_id_df.groupby(['len(ad_id)', 'all_split_-?', 'len(term1)',
                 'len(term2)', 'len(term3)', 'len(term4)', 'len(term5)']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,ad_id
len(ad_id),all_split_-?,len(term1),len(term2),len(term3),len(term4),len(term5),Unnamed: 7_level_1
36,True,8,4,4,4,12,1000000


We only have one-level of `groupby-multi-groups`, hence all of `ad_id` has the same structure

$\qquad \Rightarrow$ So, there is nothing to fix the structure at this column.

#### (ii). For the `location_name` & `brands`

Remind that the `brands` has more than `75%` missing values, and we have seen many `non-null` values from `brands` is concided in `location_name`, so the next step for these columns is **checking the `identical-column` at `non-null` values**?

In [21]:
### Chọn ra 2 cột cần quan tâm để nhìn cho đỡ rối
loc_brand_df = vst_df[['location_name', 'brands']]

## Bỏ đi các dòng bị khuyết
loc_brand_df = loc_brand_df.dropna()

## Thêm cột kiểm tra 2 cột trước đó có trùng nhau theo dòng hay không?
loc_brand_df['is_concided'] = (loc_brand_df['brands'] == loc_brand_df['location_name'])

print('Number of observations after dropping the NA-values (at column: "brands") =', loc_brand_df.shape[0])
print('Number of concided-values at the first 2 columns: ', loc_brand_df['is_concided'].sum())
loc_brand_df.head()

Number of observations after dropping the NA-values (at column: "brands") = 216257
Number of concided-values at the first 2 columns:  211448


Unnamed: 0,location_name,brands,is_concided
4,qfc (quality food centers),qfc (quality food centers),True
5,hyatt regency,hyatt regency,True
7,hyatt regency,hyatt regency,True
27,domino's pizza,domino's pizza,True
30,safeway,safeway,True


So, there are `211448` values from `brands` is concided with in `location_name` at the `non-null` observations

In [22]:
print('Percentage of non-concided-values at the first 2 columns: %s (perc) with %s values'% 
          (np.round(100 - 100*loc_brand_df['is_concided'].sum()/loc_brand_df.shape[0], 2), 
           loc_brand_df.shape[0] - loc_brand_df['is_concided'].sum()) )
loc_brand_df[~loc_brand_df['is_concided']].head()

Percentage of non-concided-values at the first 2 columns: 2.22 (perc) with 4809 values


Unnamed: 0,location_name,brands,is_concided
272,hubler ford franklin,ford motor company,False
299,bmw northwest,bmw,False
593,fort myers mitsubishi,mitsubishi motors,False
624,garlyn shelton nissan,nissan,False
1368,bailey ford of plattsburgh,ford motor company,False


$\qquad \Rightarrow$ Hence, you can drop this `column (brands)` since there are more than `78%` missing-values and for the `non-null` there are `4809` values of `non-concided` is `0.4809%` over `1e6` observation at total.

In [23]:
vst_df = vst_df.drop(columns = ['brands'])
vst_df.head(2)

Unnamed: 0,utc_timestamp,local_timestamp,ad_id,id_type,location_name,top_category,sub_category,street_address,city,state,naics_code,zip_code,minimum_dwell,safegraph_place_id,geohash_5,census_block_group
0,2020-08-26 11:25:09,2020-08-26 04:25:09,354cec9d-eb94-4522-bc35-dc14e9d910b2,idfa,sandrock canyon os park,"museums, historical sites, and similar institu...",nature parks and other similar institutions,sandrock canyon os park,san diego,ca,712190.0,92108.0,202.0,sg:2b7ce679bc5a4b8fb1d58a850bbacf76,9mudw,60730090000.0
1,2020-08-26 00:00:02,2020-08-25 20:00:02,20e1e1d5-0642-4fe1-9718-94decebe2b3f,aaid,home,,,,,ohio,,44090.0,1328.0,home,dpmd6,390930900000.0


#### (iii). For the `top_category` & `sub_category`

| type | [description](https://docs.safegraph.com/docs/places-schema#:~:text=A%20SafeGraph%20brand%20is%20defined,same%20logo%20or%20store%20banner.) | Examples |
|:-:|:-|:-:|
| `top_category` |	The label associated with the first 4 digits of the `POI’s NAICS` category. | Automobile Dealers |
| `subcategory` | The label associated with all 6 digits of the `POI’s NAICS` category.	| New Car Dealers |

In these categories, we only correct the `typos` if needed necessary. And to do this, we firstly need to install the required package using the following command in our `python environment`.

In [24]:
pip install pyspellchecker 

Collecting pyspellchecker
  Downloading pyspellchecker-0.5.5-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 883 kB/s eta 0:00:01
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.5.5
Note: you may need to restart the kernel to use updated packages.


#### Example.

In [25]:
from spellchecker import SpellChecker
t0 = time.time()
spell = SpellChecker()

misspelled = ['let', 'flaf' , 'us', 'os', 'museums', 'wlak','on','the','groun',
             'dropl;', 'notehing', 'herpe']
correct_w = []
for word in misspelled:
    # Get the one `most likely` answer
    correct_w.append(spell.correction(word))
print('Time processing = ', time.time() - t0, '(seconds) with number of word-processed = ', len(misspelled))
print('words after correcting :\n', correct_w)

Time processing =  0.8137831687927246 (seconds) with number of word-processed =  12
words after correcting :
 ['let', 'flat', 'us', 'os', 'museums', 'walk', 'on', 'the', 'group', 'drop', 'nothing', 'here']


Ta thấy rằng hạn chế của `spell.correction` ở đây là vấn đề thời gian xử lý; chỉ mới `12` từ nhưng đã tốn đến `0.6-0.7` giây.

Như vậy với `dataset` này có `1 triệu quan trắc` và giả sử khi ta chỉ xem xét `3 cột ('location_name', 'top_category', 'sub_category')` thì phương pháp này là bất khả thi về thời gian, do đó ta sẽ tạo ra `dictionary` các từ sẽ thay thế dựa trên các giá trị duy nhất ở mỗi cột.

Sau đây, tôi sẽ thực hiện một vòng lặp để khảo sát 3 cột trên

In [26]:
col_list = ['location_name', 'top_category', 'sub_category']
for col in col_list:
    t0 = time.time()
    ## trả về Series các giá trị duy nhất và tần số của chúng
    ser = vst_df[col].value_counts()
    
    ## tổng số các mức giá trị của column
    N = len(ser)
    ## trả về các giá trị duy nhất trong cột 
    uniq = ser.index

    ## chỉ tập trung thay đổi những từ không rõ ràng về chính tả-ngữ pháp (lúc này nó đang có dạng dictionary)
    x = spell.unknown(uniq)
    
    ## chuyển về dạng list
    ser = ser[x].sort_values(ascending = False)
    x = list(ser.index)
    
    ## loop over top15 the unknown-words at each column
    n = min(15, len(x))  ## if your list is less than 10 words, only loop over itself.
    rep = ser.head(n).sum()
    
    ## Lưu ý rằng ta chỉ thay thế các giá trị cần sửa chính tả-ngữ pháp nên không quét hết toàn bộ giá trị trong cột
    total = ser.sum()
    for k in range(n):
        word = list(x)[k]
        vst_df.replace(word, spell.correction(word))
    
    print('Loop over the top %s words (over %s levels) at the column : "%s" then replaced %s over %s (non-null & incorrect) words'
          %(n, N, col, rep, total))
    print('\t Time processing = ', time.time() - t0, '(seconds)')

vst_df.head(3)

Loop over the top 15 words (over 164375 levels) at the column : "location_name" then replaced 61838 over 428801 (non-null & incorrect) words
	 Time processing =  41.3769896030426 (seconds)
Loop over the top 15 words (over 160 levels) at the column : "top_category" then replaced 265822 over 298146 (non-null & incorrect) words
	 Time processing =  148.3722686767578 (seconds)
Loop over the top 15 words (over 307 levels) at the column : "sub_category" then replaced 185283 over 251881 (non-null & incorrect) words
	 Time processing =  134.64703106880188 (seconds)


Unnamed: 0,utc_timestamp,local_timestamp,ad_id,id_type,location_name,top_category,sub_category,street_address,city,state,naics_code,zip_code,minimum_dwell,safegraph_place_id,geohash_5,census_block_group
0,2020-08-26 11:25:09,2020-08-26 04:25:09,354cec9d-eb94-4522-bc35-dc14e9d910b2,idfa,sandrock canyon os park,"museums, historical sites, and similar institu...",nature parks and other similar institutions,sandrock canyon os park,san diego,ca,712190.0,92108.0,202.0,sg:2b7ce679bc5a4b8fb1d58a850bbacf76,9mudw,60730090000.0
1,2020-08-26 00:00:02,2020-08-25 20:00:02,20e1e1d5-0642-4fe1-9718-94decebe2b3f,aaid,home,,,,,ohio,,44090.0,1328.0,home,dpmd6,390930900000.0
2,2020-08-26 02:55:16,2020-08-25 21:55:16,dc7dece9-4d17-4fa5-9290-6602830e0a0c,aaid,home,,,,palatine,illinois,,60067.0,476.0,home,dp3rs,170318000000.0


#### (iv). For `city` and `state`
In these columns, the structure of `state` must be in short, for example: `tx = texas, ca = california, ...`.

Remind that, the column `state` has `114` distinct values while there are `16772` distinct-values in the `city`. Hence,

$\qquad \Rightarrow$ At `column: "state"`, we will create a dictionary then replace all of them to the same type.

$\qquad \Rightarrow$ At `column: "city"`, we only focus on the `short-names (lenght = 2)` to obtain the `full_name_city`

In [27]:
import string

state_names = list(vst_df['state'].value_counts().index)
list_az = string.ascii_lowercase  ## create a list of words: a-z*

states_df = []
for k in range(26):
    states = list_az[k]
    for state in state_names:
        if state.startswith(list_az[k]):
            states = states + ';' + state
    states_df.append(states.split(';'))

print("Startswith \t cases")    
for k in range(26):
    print('"%s" \t %s'%(states_df[k][0], states_df[k][1:]))

Startswith 	 cases
"a" 	 ['alabama', 'al', 'arizona', 'az', 'arkansas', 'ar', 'alaska', 'ak', 'american samoa', 'as', 'ae']
"b" 	 []
"c" 	 ['california', 'ca', 'co', 'colorado', 'connecticut', 'ct']
"d" 	 ['delaware', 'de', 'dc', 'district of columbia']
"e" 	 []
"f" 	 ['florida', 'fl']
"g" 	 ['georgia', 'ga', 'guam', 'gu', 'grand traverse reservation']
"h" 	 ['hi', 'hawaii']
"i" 	 ['illinois', 'il', 'indiana', 'in', 'iowa', 'ia', 'id', 'idaho']
"j" 	 []
"k" 	 ['kentucky', 'ky', 'kansas', 'ks']
"l" 	 ['louisiana', 'la']
"m" 	 ['michigan', 'mi', 'missouri', 'mo', 'minnesota', 'mn', 'ma', 'maryland', 'md', 'mississippi', 'massachusetts', 'ms', 'me', 'mt', 'maine', 'montana']
"n" 	 ['ny', 'new york', 'north carolina', 'nc', 'new jersey', 'nj', 'nv', 'nebraska', 'ne', 'nevada', 'new mexico', 'nm', 'new hampshire', 'nh', 'nd', 'north dakota', 'northern mariana islands']
"o" 	 ['ohio', 'oh', 'ok', 'oklahoma', 'or', 'oregon']
"p" 	 ['pennsylvania', 'pa', 'puerto rico', 'pr']
"q" 	 []
"r" 	 ['r

Hence, the list will be:

            {
                'alabama' : 'al', 
                'arizona' : 'az', 
                'arkansas' :'ar', 
                'alaska': 'ak', 
                'american samoa': 'as', 
                'american samoa': 'ae' 
             }
for the `states` which started by letter `a`.

Lưu ý:
- 1) Hầu hết tên viết tắt của các bang có dạng 2 chữ đầu (ví dụ 'alabama': 'al' và 'arkanras' : 'ar') hoặc viết tắt chữ đầu ở 2 từ khác nhau (như 'american samoa': 'as')
- 2) Tuy nhiên trong một số trường hợp như `arizona` cũng có 2 chữ cái đầu tiên (là `ar`) như `arkanras` nên sẽ phải chuyển thành `az`.
- 3) Như vậy ta không thể vận dụng quy luật 2 chữ đầu ở các bang để định nghĩa một dictionary cho column: `state` này.

Tham khao [state-apendix](https://vi.wikipedia.org/wiki/Danh_s%C3%A1ch_ti%E1%BB%83u_bang_Hoa_K%E1%BB%B3_theo_c%C3%A1ch_vi%E1%BA%BFt_t%E1%BA%AFt)

In [28]:
state_dict = {'alabama' : 'al','arizona' : 'az', 'arkansas' :'ar', 'alaska': 'ak', 'american samoa': 'as', 'american samoa': 'ae', ## letter A
              'california': 'ca', 'colorado': 'co', 'connecticut': 'ct',       ## letter C
              'delaware': 'de', 'district of columbia' : 'dc',                 ## letter D
              'florida': 'fl',                                                 ##        F
              'georgia': 'ga', 'guam': 'gu', 'grand traverse reservation': 'gr', ##      G
              'hawaii': 'hi',
              'illinois': 'il', 'indiana': 'in', 'iowa': 'ia', 'idaho': 'id',
              'kentucky': 'ky', 'kansas': 'ks',
              'louisiana': 'la',
              'michigan': 'mi', 'missouri': 'mo', 'minnesota': 'mn', 'maine': 'me', 'montana': 'mt',    ## M
              'maryland': 'md', 'mississippi' : 'ms', 'massachusetts': 'ma',
              'new york': 'ny', 'north carolina': 'nc', 'new jersey': 'nj', 'nebraska': 'ne', 'nevada': 'nv',    ## N
              'new mexico': 'nm', 'new hampshire' : 'nh', 'north dakota': 'nd', 'northern mariana islands': 'cm',
              'ohio': 'oh', 'oklahoma': 'ok', 'oregon': 'or',
              'pennsylvania': 'pa', 'puerto rico': 'pr', 'palau': 'pw',
              'rhode island': 'ri',
              'south carolina': 'sc', 'south dakota': 'sd',
              'texas': 'tx', 'tennessee': 'tn',
              'utah': 'ut', 'united states virgin islands': 'vi',
              'wiscosin': 'wi', 'wisconsin': 'wi', 'washington': 'wa', 'west virginia': 'wv', 'wyoming': 'wy'
                }

#### Replace.

In [29]:
t0 = time.time()
vst_df = vst_df.replace(state_dict)
print(time.time() - t0)
vst_df.head()

106.30107808113098


Unnamed: 0,utc_timestamp,local_timestamp,ad_id,id_type,location_name,top_category,sub_category,street_address,city,state,naics_code,zip_code,minimum_dwell,safegraph_place_id,geohash_5,census_block_group
0,2020-08-26 11:25:09,2020-08-26 04:25:09,354cec9d-eb94-4522-bc35-dc14e9d910b2,idfa,sandrock canyon os park,"museums, historical sites, and similar institu...",nature parks and other similar institutions,sandrock canyon os park,san diego,ca,712190.0,92108.0,202.0,sg:2b7ce679bc5a4b8fb1d58a850bbacf76,9mudw,60730090000.0
1,2020-08-26 00:00:02,2020-08-25 20:00:02,20e1e1d5-0642-4fe1-9718-94decebe2b3f,aaid,home,,,,,oh,,44090.0,1328.0,home,dpmd6,390930900000.0
2,2020-08-26 02:55:16,2020-08-25 21:55:16,dc7dece9-4d17-4fa5-9290-6602830e0a0c,aaid,home,,,,palatine,il,,60067.0,476.0,home,dp3rs,170318000000.0
3,2020-08-26 18:51:58,2020-08-26 14:51:58,a644d089-1227-4d8e-9127-35ed1b04ed1f,aaid,a moment in time photo booths,"other professional, scientific, and technical ...","photography studios, portrait",360 mcclellan ave,hamilton,nj,541921.0,8610.0,4.0,sg:ff78dda4533e4da992b257b94c04b4af,dr4ue,340210000000.0
4,2020-08-26 18:16:10,2020-08-26 11:16:10,4f7418d9-578c-4f03-b1fb-5e8f63bf2599,aaid,qfc (quality food centers),grocery stores,supermarkets and other grocery (except conveni...,22803 44th ave w,mountlake terrace,wa,445110.0,98043.0,18.0,sg:0096ae4959f849a186471eb96aef0cbf,c23pc,530610500000.0


Next, the `city`, the `top3000` values takes more than `80%` over the whole. 

Lưu ý rằng các giá trị trong `city` nên được viết dưới tên đầy đủ của nó (không được viết tắt như trong `state`), ví dụ như `ny` phải được viết là `new york`

In [30]:
ser = vst_df['city'].value_counts()
ser.head()

ny             10680
houston         6902
chicago         5942
san antonio     4794
los angeles     4580
Name: city, dtype: int64

In [31]:
cities = list(ser.index)
## tìm các thành phố bị viết tắt (có độ dài nhỏ hơn hoặc bằng 2)
short_name_cities = [city for city in cities if len(city) <= 2]
print(short_name_cities)
len(short_name_cities)

['ny', 'wa', 'wy', 'de', 'or', 'in', 'ca', 'nv', 'la', 'ia', 'ks', 'me', 'fl', 'ga', 'al', 'ok']


16

Hence, among the `16772` distinct-values, there are only `16` cities stored as `2-characters`. But firstly, we will count the number of cities in the `short_name_cities` 

In [32]:
ser[short_name_cities]

ny    10680
wa     1683
wy      300
de      175
or      115
in       99
ca       94
nv       86
la       19
ia       16
ks       10
me        9
fl        4
ga        4
al        2
ok        2
Name: city, dtype: int64

Now, we will create a `dictionary` of `city`, named `city_dict` as follow,

In [33]:
vst_df[vst_df['city'] == 'ny'].groupby('state').count()[['city']]

Unnamed: 0_level_0,city
state,Unnamed: 1_level_1
ny,10680


Lưu ý rằng một số thành phố có tên viết tắt giống nhau nhưng đến ở các bang khác nhau, do đó ta chỉ thay đổi tên viết tắt ở các thành phố lớn như `new york`

In [34]:
t0 = time.time()
city_dict = {'ny': 'new york'}

print('city processing takes %s (seconds)'%(time.time() - t0))

city processing takes 8.440017700195312e-05 (seconds)


Như trường hợp dưới đây, `city = "wi"` nhưng lại có 22 bang có cùng tên viết tắt, sẽ rất tốn thời gian để xử lý những bang như vậy nên ta chỉ ưu tiên xử lý tên của các thành phố lớn ở các bang lớn.

In [35]:
vst_df[vst_df['city'] == 'wy'].groupby('state').count()[['city']]

Unnamed: 0_level_0,city
state,Unnamed: 1_level_1
de,7
ia,4
il,2
mi,224
mn,19
oh,20
pa,19
ri,5


#### 2.1.4. `Naics_code` & `zip_code`
- `Naics_code` or `North American Industry Classification System`, there are `6-digit NAICS` code describing the business, such as `441110`.

$\qquad \Rightarrow$ We must verify all the values in this column has the length of 6-digits, then

                    naics_code > 100000 (6 digits)               ===> keep
                               < 100000 (not enough 6 digits)    ===> assign -1 (invalid)
                               is NA                             ===> assign 0 (missing)
But firstly, we must count how many `invalid-naics_code`.                               
                              

In [36]:
def count_int_df(dataframe, col_name):
    query = pd.DataFrame({col_name: dataframe[col_name]})
    query[col_name] = dataframe[col_name].apply(lambda x: x.is_integer())
    return query[col_name].sum(), dataframe[col_name].notnull().sum()
count_int_df(vst_df, 'naics_code')

(471813, 471813)

Hence, there are `471813` non-null values in the column `naics_code` and all of them are `integers`.

Now, we will check whether they have `6 digits` enough?

In [37]:
sr = vst_df['naics_code']
srm = sr[sr < 100000].value_counts()
print('The number of invalid-naics_code = %s percent over the non-null values, and %s percent over whole dataset'
      %(np.round(100*srm.sum() / sr.count(), 3), np.round(100*srm.sum() / vst_df.shape[0], 3)))
srm

The number of invalid-naics_code = 0.224 percent over the non-null values, and 0.106 percent over whole dataset


3231.0    460
5416.0    353
7111.0    143
5418.0    100
Name: naics_code, dtype: int64

In case the number of `invalid-digits` took over `10%` over the `non-null` dataset, you can search the corresponding values from the `top_category` by using this `search-engine`: [naics-code search](https://www.naics.com/code-search/)

Now, we will define the function to detect the invalid naics-code and missing-values at this column

In [38]:
def naics_c(code):
    if code < 100000:
        res = -1
    else:
        res = code
    return res
vst_df['naics_code'] = vst_df['naics_code'].apply(lambda x : naics_c(x))
vst_df['naics_code'] = vst_df['naics_code'].fillna(0)
vst_df.head(2)

Unnamed: 0,utc_timestamp,local_timestamp,ad_id,id_type,location_name,top_category,sub_category,street_address,city,state,naics_code,zip_code,minimum_dwell,safegraph_place_id,geohash_5,census_block_group
0,2020-08-26 11:25:09,2020-08-26 04:25:09,354cec9d-eb94-4522-bc35-dc14e9d910b2,idfa,sandrock canyon os park,"museums, historical sites, and similar institu...",nature parks and other similar institutions,sandrock canyon os park,san diego,ca,712190.0,92108.0,202.0,sg:2b7ce679bc5a4b8fb1d58a850bbacf76,9mudw,60730090000.0
1,2020-08-26 00:00:02,2020-08-25 20:00:02,20e1e1d5-0642-4fe1-9718-94decebe2b3f,aaid,home,,,,,oh,0.0,44090.0,1328.0,home,dpmd6,390930900000.0


- `zip_code`. A `ZIP Code` is a postal code used by the United States Postal Service (USPS). Introduced in 1963, the basic format consisted of five digits. In 1983, an extended ZIP+4 code was introduced; it included the five digits of the ZIP Code, followed by a hyphen and four digits that designated a more specific location.

Remind that we have `957205` non-null values at this column, and they are stored as a `float`. So, the first step at processing this column is verifying all of them (the non-null values) is integers.

In [39]:
count_int_df(vst_df, 'zip_code')

(957205, 957205)

This meant there are `957205` values is non-null (by `vst_df['zip_code'].notnull().sum()`) and all of them are the integers.

Next, verify the type of length,

In [40]:
query = spark.sql("""SELECT length(zip_code) AS len, zip_code
                     FROM visiting
                  """)
query.toPandas().groupby('len').count()

Unnamed: 0_level_0,zip_code
len,Unnamed: 1_level_1
4,42795
5,957205


So, almost of `zip_code` has length of `5` (took about `96%` over whole dataset), and they are 2 main types of `zip_code`, so we don't processing anything more on this column.

#### 2.1.5. `geohash_5`, `safegraph_place_id` and `census_block_group`

In [41]:
query = spark.sql("""SELECT length(geohash_5) AS len, geohash_5
                     FROM visiting
                  """)
query.toPandas().groupby('len').count()

Unnamed: 0_level_0,geohash_5
len,Unnamed: 1_level_1
4,42619
5,957381


This column (`geohash_5`) is similar as in the `zip_code` with almost observations have the length of `5`, so we will not consider the processing at this column.

In [42]:
query = spark.sql("""SELECT length(safegraph_place_id) AS len, safegraph_place_id
                     FROM visiting
                  """)
query.toPandas().groupby('len').count()

Unnamed: 0_level_0,safegraph_place_id
len,Unnamed: 1_level_1
4,527403
35,472597


Noting that, when the `len = 4` that meant the `place_id = "home"`, so we have more than `52.74%` the `safegraph_place_id` is `"home"` and another has the same `type` of `safegraph_place_id`.

In [43]:
query = spark.sql("""SELECT length(census_block_group) AS len, census_block_group
                     FROM visiting
                  """)
query.toPandas().groupby('len').count()

Unnamed: 0_level_0,census_block_group
len,Unnamed: 1_level_1
4,42619
12,957381


To understand, which `'census_block_group'` lengths `4`, look at the following result.

In [44]:
query.toPandas().groupby('census_block_group').count().sort_values(by = 'len', ascending = False).head()

Unnamed: 0_level_0,len
census_block_group,Unnamed: 1_level_1
,42619
131210035001.0,675
360610031001.0,266
481576731011.0,206
320030067001.0,198


Hence, in the final column, we will fill the `null` by `0`

In [45]:
vst_df['census_block_group'] = vst_df['census_block_group'].fillna(0)
vst_df.head()

Unnamed: 0,utc_timestamp,local_timestamp,ad_id,id_type,location_name,top_category,sub_category,street_address,city,state,naics_code,zip_code,minimum_dwell,safegraph_place_id,geohash_5,census_block_group
0,2020-08-26 11:25:09,2020-08-26 04:25:09,354cec9d-eb94-4522-bc35-dc14e9d910b2,idfa,sandrock canyon os park,"museums, historical sites, and similar institu...",nature parks and other similar institutions,sandrock canyon os park,san diego,ca,712190.0,92108.0,202.0,sg:2b7ce679bc5a4b8fb1d58a850bbacf76,9mudw,60730090000.0
1,2020-08-26 00:00:02,2020-08-25 20:00:02,20e1e1d5-0642-4fe1-9718-94decebe2b3f,aaid,home,,,,,oh,0.0,44090.0,1328.0,home,dpmd6,390930900000.0
2,2020-08-26 02:55:16,2020-08-25 21:55:16,dc7dece9-4d17-4fa5-9290-6602830e0a0c,aaid,home,,,,palatine,il,0.0,60067.0,476.0,home,dp3rs,170318000000.0
3,2020-08-26 18:51:58,2020-08-26 14:51:58,a644d089-1227-4d8e-9127-35ed1b04ed1f,aaid,a moment in time photo booths,"other professional, scientific, and technical ...","photography studios, portrait",360 mcclellan ave,hamilton,nj,541921.0,8610.0,4.0,sg:ff78dda4533e4da992b257b94c04b4af,dr4ue,340210000000.0
4,2020-08-26 18:16:10,2020-08-26 11:16:10,4f7418d9-578c-4f03-b1fb-5e8f63bf2599,aaid,qfc (quality food centers),grocery stores,supermarkets and other grocery (except conveni...,22803 44th ave w,mountlake terrace,wa,445110.0,98043.0,18.0,sg:0096ae4959f849a186471eb96aef0cbf,c23pc,530610500000.0


### 2.2. Remove duplicate or irrelevant observation

In [46]:
vst_df = vst_df.drop_duplicates()
vst_df.shape

(1000000, 16)

### 2.3. Handling missing values
#### 2.3.1. Fill `unknown` to `NA values` of the `text-type columns` and `datetime`

In [47]:
def fill_object(dataframe, rep_words = 'unknown'):
    types = dataframe.dtypes
    cols = dataframe.columns
    ob_cols = cols[(types == 'object') | (types == 'datetime64[ns]')]
    for col in ob_cols:
        dataframe[col] = dataframe[col].fillna(rep_words)
    return dataframe
vst_df = fill_object(vst_df)
vst_df.head()

Unnamed: 0,utc_timestamp,local_timestamp,ad_id,id_type,location_name,top_category,sub_category,street_address,city,state,naics_code,zip_code,minimum_dwell,safegraph_place_id,geohash_5,census_block_group
0,2020-08-26 11:25:09,2020-08-26 04:25:09,354cec9d-eb94-4522-bc35-dc14e9d910b2,idfa,sandrock canyon os park,"museums, historical sites, and similar institu...",nature parks and other similar institutions,sandrock canyon os park,san diego,ca,712190.0,92108.0,202.0,sg:2b7ce679bc5a4b8fb1d58a850bbacf76,9mudw,60730090000.0
1,2020-08-26 00:00:02,2020-08-25 20:00:02,20e1e1d5-0642-4fe1-9718-94decebe2b3f,aaid,home,unknown,unknown,unknown,unknown,oh,0.0,44090.0,1328.0,home,dpmd6,390930900000.0
2,2020-08-26 02:55:16,2020-08-25 21:55:16,dc7dece9-4d17-4fa5-9290-6602830e0a0c,aaid,home,unknown,unknown,unknown,palatine,il,0.0,60067.0,476.0,home,dp3rs,170318000000.0
3,2020-08-26 18:51:58,2020-08-26 14:51:58,a644d089-1227-4d8e-9127-35ed1b04ed1f,aaid,a moment in time photo booths,"other professional, scientific, and technical ...","photography studios, portrait",360 mcclellan ave,hamilton,nj,541921.0,8610.0,4.0,sg:ff78dda4533e4da992b257b94c04b4af,dr4ue,340210000000.0
4,2020-08-26 18:16:10,2020-08-26 11:16:10,4f7418d9-578c-4f03-b1fb-5e8f63bf2599,aaid,qfc (quality food centers),grocery stores,supermarkets and other grocery (except conveni...,22803 44th ave w,mountlake terrace,wa,445110.0,98043.0,18.0,sg:0096ae4959f849a186471eb96aef0cbf,c23pc,530610500000.0


#### 2.3.2. Fill `0` to `NA values` at the `numeric columns`

In [48]:
def fill_numeric(dataframe, rep_words = 'unknown'):
    types = dataframe.dtypes
    cols = dataframe.columns
    nm_cols = cols[~((types == 'object') | (types == 'datetime64[ns]'))]
    for col in nm_cols:
        dataframe[col] = dataframe[col].fillna(rep_words)
    return dataframe
vst_df = fill_numeric(vst_df)
vst_df.head()

Unnamed: 0,utc_timestamp,local_timestamp,ad_id,id_type,location_name,top_category,sub_category,street_address,city,state,naics_code,zip_code,minimum_dwell,safegraph_place_id,geohash_5,census_block_group
0,2020-08-26 11:25:09,2020-08-26 04:25:09,354cec9d-eb94-4522-bc35-dc14e9d910b2,idfa,sandrock canyon os park,"museums, historical sites, and similar institu...",nature parks and other similar institutions,sandrock canyon os park,san diego,ca,712190.0,92108,202.0,sg:2b7ce679bc5a4b8fb1d58a850bbacf76,9mudw,60730090000.0
1,2020-08-26 00:00:02,2020-08-25 20:00:02,20e1e1d5-0642-4fe1-9718-94decebe2b3f,aaid,home,unknown,unknown,unknown,unknown,oh,0.0,44090,1328.0,home,dpmd6,390930900000.0
2,2020-08-26 02:55:16,2020-08-25 21:55:16,dc7dece9-4d17-4fa5-9290-6602830e0a0c,aaid,home,unknown,unknown,unknown,palatine,il,0.0,60067,476.0,home,dp3rs,170318000000.0
3,2020-08-26 18:51:58,2020-08-26 14:51:58,a644d089-1227-4d8e-9127-35ed1b04ed1f,aaid,a moment in time photo booths,"other professional, scientific, and technical ...","photography studios, portrait",360 mcclellan ave,hamilton,nj,541921.0,8610,4.0,sg:ff78dda4533e4da992b257b94c04b4af,dr4ue,340210000000.0
4,2020-08-26 18:16:10,2020-08-26 11:16:10,4f7418d9-578c-4f03-b1fb-5e8f63bf2599,aaid,qfc (quality food centers),grocery stores,supermarkets and other grocery (except conveni...,22803 44th ave w,mountlake terrace,wa,445110.0,98043,18.0,sg:0096ae4959f849a186471eb96aef0cbf,c23pc,530610500000.0


### 2.4. Save & stored result.

In [49]:
vst_df.to_csv('clean_visiting.csv')

## 3. Distribution analyzing.