In [4]:
# Load the raw data and take a peek at it.

import json

file_sensor = '../data/raw/sensorswww_data.txt'       # path of the raw data file

# Open the raw data file as UTF-8 text; consult: https://github.com/rkern/line_profiler/issues/37
# The context manager closes the file handle as soon as reading is done.
with open(file_sensor, encoding='utf-8', mode='r') as f:
    # Read all lines at once so later cells can iterate over `lines` repeatedly
    # (successive .readline() calls would each return a different line).
    lines = f.readlines()

print("Number of records in the raw data: %d. \n" % len(lines))

# Let's have a look at the first line.
print("The first record: \n")
print(lines[0])

Number of records in the raw data: 75092. 

The first record: 

{"distinct_id":"595466e9a8e733434ce08de16e927d985e0b5d48","lib":{"$lib":"js","$lib_method":"code","$lib_version":"1.6.20"},"properties":{"$os":"windows","$model":"pc","$os_version":"6.1","$screen_height":800,"$screen_width":1280,"$lib":"js","$lib_version":"1.6.20","$browser":"chrome","$browser_version":"56","$latest_referrer":"","$latest_referrer_host":"","$latest_utm_source":"baidu","$latest_utm_medium":"cpc","$latest_utm_campaign":"通用词","$latest_utm_content":"通用-用户画像","$latest_utm_term":"用户画像","_latest_ch":"demo","_session_referrer":"https://www.baidu.com/baidu.php","_session_referrer_host":"www.baidu.com","session_page_url":"https://www.sensorsdata.cn/?utm_source=baidu&utm_medium=cpc&utm_term=%E7%94%A8%E6%88%B7%E7%94%BB%E5%83%8F&utm_content=%E9%80%9A%E7%94%A8%2D%E7%94%A8%E6%88%B7%E7%94%BB%E5%83%8F&utm_campaign=%E9%80%9A%E7%94%A8%E8%AF%8D","pageUrl":"https://sensorsdata.cn/?ch=demo","pageStayTime":5.692,"pagePosition":2,

In [6]:
# First, let's see how many features there are in the raw data.

print("Main features:\n")

# Gather every top-level key that shows up in at least one record.
main_features = set()
for raw_record in lines:
    for key in json.loads(raw_record).keys():
        main_features.add(key)

print(main_features, "\n")
print("There are %d main features in the raw data." % len(main_features))

Main features:

{'type', '_nocache', 'event', 'properties', 'lib', 'jssdk_error', 'time', 'distinct_id'} 

There are 8 main features in the raw data.


In [32]:
# Next, let's see the type of each main feature. If any of them is a dictionary, we have to dig deeper.

# Each entry is a (feature name, Python type of its value) pair; a feature
# appears more than once if its value type differs between records.
main_features_type = set()
for raw_record in lines:
    record = json.loads(raw_record)
    main_features_type.update((key, type(value)) for key, value in record.items())

for feature_and_type in main_features_type:
    print(feature_and_type)

('lib', <class 'dict'>)
('event', <class 'str'>)
('properties', <class 'dict'>)
('time', <class 'int'>)
('distinct_id', <class 'str'>)
('type', <class 'str'>)
('_nocache', <class 'str'>)
('jssdk_error', <class 'str'>)


**So we have 8 main features in the raw data. They are:   
\# 1: 'dict'   lib          
\# 2: 'dict'   properties  
\# 3: 'str'    distinct_id  
\# 4: 'str'    type         
\# 5: 'str'    event        
\# 6: 'str'    _nocache     
\# 7: 'int'    time         
\# 8: 'str'    jssdk_error**

"lib" and "properties" are dictionaries, so we have to look one level deeper into each of them.

In [35]:
# Sub-features of "lib": collect each key and the Python type of its value.

lib_sub_features = set()
lib_sub_features_type = set()

for raw_record in lines:
    lib_fields = json.loads(raw_record)["lib"]
    for key, value in lib_fields.items():
        lib_sub_features.add(key)
        lib_sub_features_type.add((key, type(value)))

for feature_and_type in lib_sub_features_type:
    print(feature_and_type)

print()
print("There are %d sub-features in lib." % len(lib_sub_features))

('$lib', <class 'str'>)
('$lib_method', <class 'str'>)
('$lib_version', <class 'str'>)

There are 3 sub-features in lib.


In [36]:
# Sub-features of "properties": collect each key and the Python type of its value.

properties_sub_features = set()
properties_sub_features_type = set()

for raw_record in lines:
    props = json.loads(raw_record)["properties"]
    properties_sub_features.update(props.keys())
    properties_sub_features_type.update(
        (key, type(value)) for key, value in props.items()
    )

for feature_and_type in properties_sub_features_type:
    print(feature_and_type)

print()
print("There are %d sub-features in properties." % len(properties_sub_features))

('$screen_height', <class 'int'>)
('$first_browser_language', <class 'str'>)
('page', <class 'str'>)
('$first_referrer', <class 'str'>)
('siteUrl', <class 'str'>)
('requestBtn', <class 'str'>)
('$lib_version', <class 'str'>)
('$latest_referrer', <class 'str'>)
('pagePosition', <class 'int'>)
('$referrer', <class 'str'>)
('$model', <class 'str'>)
('$utm_term', <class 'str'>)
('$latest_utm_medium', <class 'str'>)
('_latest_ch', <class 'str'>)
('pageStayTime', <class 'float'>)
('isMsg', <class 'bool'>)
('contact', <class 'str'>)
('url_path', <class 'str'>)
('$latest_utm_term', <class 'str'>)
('$latest_referrer_host', <class 'str'>)
('project_name', <class 'str'>)
('$lib', <class 'str'>)
('$title', <class 'str'>)
('company', <class 'str'>)
('name', <class 'str'>)
('$utm_medium', <class 'str'>)
('pageUrl', <class 'str'>)
('$utm_content', <class 'str'>)
('$utm_campaign', <class 'str'>)
('result', <class 'str'>)
('$screen_width', <class 'int'>)
('info', <class 'str'>)
('isSuccess', <class 'bo

**So we have 61 sub-features in "properties". Their types are:   
'str', 'int', 'float', 'bool'.**

## <font color='blue'> Create a list of dictionaries and then convert to dataframe </font>

In [38]:
# pandas 1.0 exposed json_normalize at the top level and deprecated the
# pandas.io.json import path; fall back to the old path on older releases.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Parse every raw line into a dictionary, keeping the records in file order.
dict_sensor_raw = [json.loads(line) for line in lines]

# Flatten the nested "lib" and "properties" dictionaries into dotted column
# names (e.g. "lib.$lib"), giving one row per record.
df_sensor_raw = json_normalize(dict_sensor_raw)

In [39]:
# Summarize the flattened frame: column names, non-null counts and dtypes.
df_sensor_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75092 entries, 0 to 75091
Data columns (total 70 columns):
_nocache                               75092 non-null object
distinct_id                            75092 non-null object
event                                  65673 non-null object
jssdk_error                            58 non-null object
lib.$lib                               75092 non-null object
lib.$lib_method                        75092 non-null object
lib.$lib_version                       75092 non-null object
properties.$browser                    65673 non-null object
properties.$browser_version            65673 non-null object
properties.$first_browser_language     9148 non-null object
properties.$first_referrer             9369 non-null object
properties.$first_referrer_host        9369 non-null object
properties.$first_visit_time           9419 non-null object
properties.$ip                         65673 non-null object
properties.$is_first_day               65673

In [45]:
# Preview the first 10 rows of the flattened frame.
df_sensor_raw.head(10)

Unnamed: 0,_nocache,distinct_id,event,jssdk_error,lib.$lib,lib.$lib_method,lib.$lib_version,properties.$browser,properties.$browser_version,properties.$first_browser_language,...,properties.referrerUrl,properties.requestBtn,properties.result,properties.session_page_url,properties.siteUrl,properties.site_url,properties.url_path,properties.verification_code,time,type
0,654392402996,595466e9a8e733434ce08de16e927d985e0b5d48,index_leave,,js,code,1.6.20,chrome,56.0,,...,,,,https://www.sensorsdata.cn/?utm_source=baidu&u...,,,,,1488791047953,track
1,3040562711955,9939d3e087bca29c42334d96dccd25ca0e06652a,,,js,code,1.6.20,,,zh-CN,...,,,,,,,,,1490958296645,profile_set_once
2,9587552771961,9939d3e087bca29c42334d96dccd25ca0e06652a,$pageview,,js,code,1.6.20,chrome,56.0,,...,,,,https://sensorsdata.cn/?ch=demo,,,,,1488791050856,track
3,652937076129,9939d3e087bca29c42334d96dccd25ca0e06652a,btnClick,,js,code,1.6.20,chrome,56.0,,...,,2.0,,https://sensorsdata.cn/?ch=demo,,,,,1488791051772,track
4,8207407748558,9939d3e087bca29c42334d96dccd25ca0e06652a,btnClick,,js,code,1.6.20,chrome,56.0,,...,,2.0,,https://sensorsdata.cn/?ch=demo,,,,,1488791056032,track
5,4967393021929,595466e9a8e733434ce08de16e927d985e0b5d48,demo_leave,,js,code,1.6.20,chrome,56.0,,...,,,,https://www.sensorsdata.cn/?utm_source=baidu&u...,,,,,1488791057880,track
6,382763295792,c98f9661c89faeedb1109065d146f5be806f2d50,btnClick,,js,code,1.6.20,chrome,54.0,,...,,,,https://www.sensorsdata.cn/?utm_source=baidu&u...,,,,,1488791058601,track
7,6250648113207,c98f9661c89faeedb1109065d146f5be806f2d50,$pageview,,js,code,1.6.20,chrome,54.0,,...,,,,https://www.sensorsdata.cn/?utm_source=baidu&u...,,,,,1488791059334,track
8,6907472183771,9939d3e087bca29c42334d96dccd25ca0e06652a,btnClick,,js,code,1.6.20,chrome,56.0,,...,,2.0,,https://sensorsdata.cn/?ch=demo,,,,,1488791063942,track
9,1770400447899,978ab1876c3063608cd564a1ab90bfd6eaaf8e44,btnClick,,js,code,1.6.20,firefox,51.0,,...,,,,https://www.sensorsdata.cn/?utm_source=baidu&u...,,,,,1488791082530,track


## <font color='blue'> Save the dataframe to a csv file for later use.</font>

In [55]:
# Write the flattened frame to CSV (UTF-8, without the row index) for later use.
df_sensor_raw.to_csv(
    '../data/sensors.csv',
    index=False,
    encoding='utf-8',
)