In [1]:
import json
from collections import defaultdict

def get_counts(sequence):
    """Count how often each item occurs in ``sequence``.

    Args:
        sequence: Any iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to the number of times
        it was seen, with keys in first-seen order.
    """
    tally = {}
    for item in sequence:
        # dict.get supplies 0 the first time an item is encountered.
        tally[item] = tally.get(item, 0) + 1
    return tally

def get_counts2(sequence):
    """Count how often each item occurs, using ``collections.defaultdict``.

    Args:
        sequence: Any iterable of hashable items.

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its number of
        occurrences. Because it is a defaultdict, looking up an item that was
        never seen yields 0 (and inserts that key) instead of raising
        ``KeyError``.
    """
    counts = defaultdict(int)
    for x in sequence:
        # A missing key starts at int() == 0, so no membership test is needed.
        counts[x] += 1
    return counts

def top_counts(count_dict, n=10):
    """Return the ``n`` entries of ``count_dict`` with the highest counts.

    Args:
        count_dict: Mapping of key -> count (for example the result of
            ``get_counts``).
        n: Maximum number of entries to return. Defaults to 10.

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        most frequent entry comes last. Entries with equal counts are ordered
        by key. The list is empty when ``n`` is zero or negative.
    """
    if n <= 0:
        # ``pairs[-0:]`` is the same as ``pairs[0:]`` and would return every
        # entry, so "no entries" has to be handled explicitly.
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]

path = 'ch01/usagov_bitly_data2012-03-16-1331923249.txt'

# Read the file line by line and parse each line with json.loads, collecting
# the results in a list. The with-block closes the file as soon as the last
# line has been read, and the explicit encoding keeps decoding independent of
# the platform default.
with open(path, encoding='utf-8') as source:
    records = [json.loads(line) for line in source]
records[0]

records[0]['tz']

# Only records that actually have a 'tz' key contribute a time zone.
time_zone = [rec['tz'] for rec in records
             if 'tz' in rec]

time_zone[:10]

counts = get_counts(time_zone)

counts['America/New_York']

len(time_zone)

top_counts(counts)



[(33, 'America/Sao_Paulo'),
 (35, 'Europe/Madrid'),
 (36, 'Pacific/Honolulu'),
 (37, 'Asia/Tokyo'),
 (74, 'Europe/London'),
 (191, 'America/Denver'),
 (382, 'America/Los_Angeles'),
 (400, 'America/Chicago'),
 (521, ''),
 (1251, 'America/New_York')]

In [2]:
from collections import Counter
# Let the standard library do the counting: Counter tallies the time zones
# and most_common lists the ten most frequent ones, highest count first.
counts = Counter(time_zone)

counts.most_common(10)

[('America/New_York', 1251),
 ('', 521),
 ('America/Chicago', 400),
 ('America/Los_Angeles', 382),
 ('America/Denver', 191),
 ('Europe/London', 74),
 ('Asia/Tokyo', 37),
 ('Pacific/Honolulu', 36),
 ('Europe/Madrid', 35),
 ('America/Sao_Paulo', 33)]

In [3]:
from pandas import DataFrame,Series
import pandas as pd;import numpy as np

# Count the time zones with pandas.
frame = DataFrame(records)

# Displaying a large DataFrame shows a summary view. The Series returned by
# frame['tz'] has a value_counts method that tallies each distinct value.
frame['tz'].head(10)

tz_counts = frame['tz'].value_counts()

tz_counts.head(10)

America/New_York       1251
                        521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
America/Sao_Paulo        33
Name: tz, dtype: int64

In [4]:
# Label absent values as 'Missing' and empty strings as 'Unknown' so that both
# show up as named categories in the counts.
clean_tz = frame['tz'].fillna('Missing')
clean_tz = clean_tz.mask(clean_tz == '', 'Unknown')

tz_counts = clean_tz.value_counts()

tz_counts.head(10)

America/New_York       1251
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
Name: tz, dtype: int64

In [5]:
from matplotlib.pyplot import *

# The plot method of tz_counts draws the ten leading counts as a horizontal
# bar chart.
tz_counts.head(10).plot(kind='barh', rot=0)


In [25]:
frame.loc[1, 'a']
# Much more can be done with this data. For example, field 'a' holds
# information about the browser, device or application that performed the
# URL shortening.
frame.loc[50, 'a']
frame.loc[51, 'a']

'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'

In [26]:
# Keep only the first whitespace-separated token of every non-null agent
# string, then count how often each token occurs.
results = Series([agent.split()[0] for agent in frame['a'].dropna()])
results.head(5)
results.value_counts().head(8)

Mozilla/5.0                 2594
Mozilla/4.0                  601
GoogleMaps/RochesterNY       121
Opera/9.80                    34
TEST_INTERNET_AGENT           24
GoogleProducer                21
Mozilla/6.0                    5
BlackBerry8520/5.0.0.681       4
dtype: int64

In [27]:
# Some agent values are missing, so drop those rows first.
cframe = frame.loc[frame['a'].notna()]
# Label every remaining row by whether its agent string contains 'Windows'.
operating_system = np.where(
    cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows'
)
operating_system[:5]

array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'],
      dtype='<U11')

In [28]:
# Group the rows by time zone and by the operating_system labels.
by_tz_os = cframe.groupby(['tz', operating_system])
# size() counts the rows in each group; unstack() turns the operating-system
# level into columns, and combinations that never occur are filled with 0.
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts.head(10)

Unnamed: 0_level_0,Not Windows,Windows
tz,Unnamed: 1_level_1,Unnamed: 2_level_1
,245.0,276.0
Africa/Cairo,0.0,3.0
Africa/Casablanca,0.0,1.0
Africa/Ceuta,0.0,2.0
Africa/Johannesburg,0.0,1.0
Africa/Lusaka,0.0,1.0
America/Anchorage,4.0,1.0
America/Argentina/Buenos_Aires,1.0,0.0
America/Argentina/Cordoba,0.0,1.0
America/Argentina/Mendoza,0.0,1.0


In [29]:
# Pick out the most frequent time zones: sum the counts across each row and
# take the positions that would sort those totals in ascending order.
indexer = agg_counts.sum(axis=1).argsort()

indexer[:10]

tz
                                  24
Africa/Cairo                      20
Africa/Casablanca                 21
Africa/Ceuta                      92
Africa/Johannesburg               87
Africa/Lusaka                     53
America/Anchorage                 54
America/Argentina/Buenos_Aires    57
America/Argentina/Cordoba         26
America/Argentina/Mendoza         55
dtype: int64

In [30]:
# count_subset is derived here so that this cell runs on a fresh kernel; it
# must exist before it can be plotted. It holds the last ten rows of
# agg_counts after reordering by indexer.
count_subset = agg_counts.take(indexer)[-10:]
# stacked=True draws the columns of each row as one stacked horizontal bar.
count_subset.plot(kind='barh', stacked=True)
show()

In [31]:
# Reorder the rows of agg_counts by indexer and keep the final ten.
count_subset = agg_counts.take(indexer).tail(10)
count_subset

Unnamed: 0_level_0,Not Windows,Windows
tz,Unnamed: 1_level_1,Unnamed: 2_level_1
America/Sao_Paulo,13.0,20.0
Europe/Madrid,16.0,19.0
Pacific/Honolulu,0.0,36.0
Asia/Tokyo,2.0,35.0
Europe/London,43.0,31.0
America/Denver,132.0,59.0
America/Los_Angeles,130.0,252.0
America/Chicago,115.0,285.0
,245.0,276.0
America/New_York,339.0,912.0
