-
Notifications
You must be signed in to change notification settings - Fork 0
/
Crawl.py
117 lines (100 loc) · 4.81 KB
/
Crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/python
#-*- coding: utf-8 -*-
import urllib2
import re
import logging
import csv
from collections import OrderedDict
logging.basicConfig(level=logging.INFO)

# Crawl a fixed range of AcFun video pages.  For every page that can be
# fetched, the title, channel, uploader and view count are extracted with
# regular expressions; one record line per page goes to output.txt and the
# per-channel page totals go to count.csv.

# Video id range, with range() semantics (END_ID itself is not fetched).
FIRST_ID = 1010000
END_ID = 1012680

# Page patterns, compiled once instead of on every loop iteration.
TITLE_RE = re.compile(r'<title>(.*?) - ')
CHANNEL_RE = re.compile(r'<a id="channel-article-title".*>(.*?)</a>')
UPLOADER_RE = re.compile(r'<a class.*?title="Up.*?">(.*?)</a>')
VIEWS_RE = re.compile(r'system.views = .*\'(\d+)\'.*?;')

# Column labels of count.csv, in output order.  'count_error' counts pages
# that could not be fetched; 'count_other' counts fetched pages whose channel
# is not listed in CHANNEL_COLUMNS.
STAT_COLUMNS = [
    'count_variant', 'count_error', 'count_song', 'count_TVseries',
    'count_game', 'count_pet', 'count_life', 'count_soccer', 'count_pop',
    'count_other', 'count_food', 'count_anime', 'count_movie', 'count_mugen',
    'count_flash', 'count_twitch', 'count_varshow', 'count_lol',
    'count_documentary', 'count_marvelsport', 'count_ACGmusic',
    'count_basketball', 'count_miku',
]

# Channel name exactly as captured from the page (UTF-8 encoded byte strings)
# mapped to the count.csv column it is tallied under.  The trailing comments
# are approximate English glosses of the Chinese names.
CHANNEL_COLUMNS = {
    '\xe7\xbb\xbc\xe5\x90\x88': 'count_variant',  # general
    '\xe6\xbc\x94\xe5\x94\xb1\xc2\xb7\xe4\xb9\x90\xe5\x99\xa8': 'count_song',  # singing / instruments
    '\xe5\x89\xa7\xe9\x9b\x86': 'count_TVseries',  # TV series
    '\xe6\xb8\xb8\xe6\x88\x8f\xe9\x9b\x86\xe9\x94\xa6': 'count_game',  # game highlights
    '\xe8\x90\x8c\xe5\xae\xa0': 'count_pet',  # pets
    '\xe7\x94\x9f\xe6\xb4\xbb\xe5\xa8\xb1\xe4\xb9\x90': 'count_life',  # life & entertainment
    '\xe8\xb6\xb3\xe7\x90\x83': 'count_soccer',  # soccer
    '\xe6\xb5\x81\xe8\xa1\x8c\xe9\x9f\xb3\xe4\xb9\x90': 'count_pop',  # pop music
    '\xe7\x94\xb5\xe5\xbd\xb1': 'count_movie',  # movies
    'Mugen': 'count_mugen',
    '\xe7\xbe\x8e\xe9\xa3\x9f': 'count_food',  # food
    '\xe5\x8a\xa8\xe7\x94\xbb\xe7\x9f\xad\xe7\x89\x87': 'count_anime',  # animated shorts
    'Flash\xe6\xb8\xb8\xe6\x88\x8f': 'count_flash',  # Flash games
    '\xe5\xae\x9e\xe5\x86\xb5\xe8\xa7\xa3\xe8\xaf\xb4': 'count_twitch',  # live commentary
    '\xe7\xbb\xbc\xe8\x89\xba': 'count_varshow',  # variety shows
    '\xe8\x8b\xb1\xe9\x9b\x84\xe8\x81\x94\xe7\x9b\x9f': 'count_lol',  # League of Legends
    '\xe7\xba\xaa\xe5\xbd\x95\xe7\x89\x87': 'count_documentary',  # documentaries
    'Vocaloid': 'count_miku',
    'ACG\xe9\x9f\xb3\xe4\xb9\x90': 'count_ACGmusic',  # ACG music
    '\xe7\xaf\xae\xe7\x90\x83': 'count_basketball',  # basketball
    '\xe6\x83\x8a\xe5\xa5\x87\xe4\xbd\x93\xe8\x82\xb2': 'count_marvelsport',  # amazing sports
}

# One tally per count.csv column, all starting at zero.
counts = dict.fromkeys(STAT_COLUMNS, 0)

# The with-block guarantees output.txt is flushed and closed even if the
# crawl is interrupted by an unexpected error.
with open("output.txt", "wb") as output:
    for beginid in range(FIRST_ID, END_ID):
        userMainUrl = "http://www.acfun.tv/v/ac%d" % beginid
        try:
            resp = urllib2.urlopen(urllib2.Request(userMainUrl))
            try:
                respHtml = resp.read()
            finally:
                # Release the connection even if read() raises.
                resp.close()
        except Exception:
            # Best effort: any failure to download a page (including a
            # failed read) is tallied and the crawl moves on to the next id.
            logging.info('failed to get id %d', beginid)
            counts['count_error'] += 1
            continue

        titles = TITLE_RE.findall(respHtml)
        channels = CHANNEL_RE.findall(respHtml)
        uploaders = UPLOADER_RE.findall(respHtml)
        view_counts = VIEWS_RE.findall(respHtml)

        # A page is credited to a channel only when exactly one channel link
        # was captured and its text is a known channel; every other fetched
        # page (no match, several matches, unknown name) counts as "other".
        column = 'count_other'
        if len(channels) == 1:
            column = CHANNEL_COLUMNS.get(channels[0], 'count_other')
        counts[column] += 1

        # Echo every captured value to stdout: titles first, then channels,
        # uploaders and view counts.
        for captured in (titles, channels, uploaders, view_counts):
            for text in captured:
                print(text)

        # NOTE(review): the record uses the last match of each field and is
        # written once per page -- confirm this matches the intended nesting
        # of the original loops.  Pages missing any field are skipped so a
        # record is never built from undefined or previous-page values.
        if not (titles and channels and uploaders and view_counts):
            logging.info('incomplete page for id %d, record skipped', beginid)
            continue
        output.write("<%s><%s><%s><views: %s>\r\n"
                     % (channels[-1], uploaders[-1], titles[-1], view_counts[-1]))

# count.csv: a header row with the column labels followed by a single row
# holding the totals in the same order.
with open("count.csv", "wb") as fou:
    stat = csv.writer(fou)
    stat.writerow(STAT_COLUMNS)
    stat.writerow([counts[name] for name in STAT_COLUMNS])