Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100755 310 lines (296 sloc) 9.063 kb
91c2fb08 »
2010-12-06 first re-commit
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
a05aa04c »
2011-05-24 Not used anymore. Superseded by sinaweibo.oauth.py
4 # Not used anymore. Superseded by sinaweibo.oauth.py
5
91c2fb08 »
2010-12-06 first re-commit
6 # sinastorage.py stores data retrieved using sinagetter.sh into a database (see sinaweibo.sql)
7
8 import sys
9 import pg
10 import simplejson
11 import time
12 import datetime
13 import string
14 import types
15
2985f624 »
2010-12-21 added sleeptime in case sina api times out
16 import mypass
91c2fb08 »
2010-12-06 first re-commit
17
# Synopsis printed whenever the command-line arguments cannot be parsed.
usage = "sinastorage.py [option::1=user_timeline,2=users,3=friends,4=followers] [name of file to insert in DB] [user_id for friends/followers (source_id)]"

# Base table name; the non-status modes append a suffix to it before storing.
table_name = "sinaweibo"

# Database connection, obtained from the project-local `mypass` module.
pgconn = mypass.getConn()

# Behaviour switches.  All start disabled; the command-line flags parsed
# further down turn them on (breakafterupdate is set while storing statuses).
tobeginning = insert_user = doupdate = doupdate_user = False
justretweets = verbose = breakafterupdate = False

# Number of duplicate rows tolerated before the comment-related modes decide
# the database is already up to date.
tobeginning_grace = 190

# Last record handled; printed when the "UP-TO-DATE" message is emitted.
last_one = ""

# Seconds to sleep before exiting after the API returned an error.
sleeptime = 150
2985f624 »
2010-12-21 added sleeptime in case sina api times out
33
91c2fb08 »
2010-12-06 first re-commit
# --- Command-line parsing -------------------------------------------------
# argv[1] is the numeric mode, argv[2] the JSON file to load.  Anything that
# cannot be parsed prints the usage string and exits.
if len(sys.argv) <= 2:
    print(usage)
    sys.exit()
try:
    opt = int(sys.argv[1])
    fname = str(sys.argv[2])
except ValueError:
    print(usage)
    sys.exit()

# Everything after the two positional arguments.
extra_args = sys.argv[3:]

if opt >= 3 and opt <= 4:
    # Friends / followers need the id of the user the list belongs to.
    try:
        user_id = int(sys.argv[3])
    except (ValueError, IndexError):
        # IndexError: the id was omitted entirely; show usage instead of a
        # traceback, exactly as for a non-numeric id.
        print(usage)
        sys.exit()
elif opt <= 2:
    fromstatuses = False
    # Flags are applied in order, so a later -q overrides an earlier -v.
    for arg in extra_args:
        if arg in ("-b", "--to-beginning"):
            tobeginning = True
        if arg in ("-v", "--verbose"):
            verbose = True
        if arg in ("-q", "--quiet"):
            verbose = False
        if arg in ("-u", "--update"):
            doupdate = True
        if arg in ("-rt", "--retweets"):
            justretweets = True
        if arg in ("-s", "--from-statuses"):
            fromstatuses = True
        if arg in ("-U", "--user"):
            insert_user = True
        if arg in ("-Uu", "--update-user"):
            doupdate_user = True
elif opt == 7 or opt == 8:
    for arg in extra_args:
        if arg in ("-b", "--to-beginning"):
            tobeginning = True
        if arg in ("-nd", "--no-duplicates"):
            tobeginning = False
        if arg in ("-U", "--user"):
            insert_user = True
91c2fb08 »
2010-12-06 first re-commit
78
# --- Load the input file --------------------------------------------------
f = open(fname, "r")
try:
    content = f.read()
finally:
    # Close right away: several sys.exit() paths follow and would otherwise
    # leave the handle open.  Closing an already-closed file later is a no-op.
    f.close()

try:
    js = simplejson.loads(content)
except ValueError:
    print("JSON ERROR: " + fname + "\t" + content)
    sys.exit()
r = dict()

# in case of error, exit
# The error object is looked for either at the top level (a JSON object with
# an "error" key) or as the first element of a JSON array.  The first element
# is inspected by index: a membership test such as `0 in js` would look for
# the *value* 0 and never match.
error_source = None
if isinstance(js, dict):
    if "error" in js:
        error_source = js
elif isinstance(js, list) and len(js) > 0:
    if isinstance(js[0], dict) and "error" in js[0]:
        error_source = js[0]

if error_source is not None:
    errorJs = error_source["error"]
    if errorJs is not None:
        # NOTE(review): both prints are assumed to be verbose-only; the
        # nesting was reconstructed from a copy without indentation.
        if verbose:
            print("FAILURE")
            print(errorJs)
        # Errors starting with these codes exit immediately; any other error
        # sleeps first (presumably to throttle the calling script -- confirm).
        if not errorJs.startswith(("40302", "40023", "40031")):
            time.sleep(sleeptime)
    sys.exit()
108
19b0e3c1 »
2011-04-11 monthly update
# --- Store the parsed JSON according to the selected mode -----------------
# NOTE(review): the nesting below was reconstructed from a copy of the script
# whose indentation had been lost.  In particular, the "skip this duplicate
# unless updating" step is read as belonging to the within-grace case of the
# duplicate counter; confirm against the original file.

# Text fields that are UTF-8 encoded before a record is handed to the DB.
_STATUS_TEXT_FIELDS = ["text", "source", "location", "thumbnail_pic", "bmiddle_pic", "original_pic", "screen_name", "in_reply_to_screen_name"]
_USER_TEXT_FIELDS = ["name", "screen_name", "location", "description", "profile_image_url", "url"]


def _encode_utf8_fields(record, field_names):
    """Encode, in place, every listed field of *record* that is present and not None."""
    for field_name in field_names:
        if field_name in record and record[field_name] is not None:
            record[field_name] = record[field_name].encode("utf8")


if opt == 1 or opt == 7:
    # Modes 1 and 7: store status records into `table_name`.
    if not isinstance(js, list):
        js = [js]
    if len(js) <= 0:
        print(js)
        sys.exit()
    # Counts failed inserts (treated as duplicates) across the whole file.
    # It must live outside the loop: resetting it per failure would keep it
    # from ever exceeding tobeginning_grace.
    tobeginning_grace_count = 0
    for j, status in enumerate(js):
        last_tweet = status
        row = None
        _encode_utf8_fields(status, _STATUS_TEXT_FIELDS)
        if "retweeted_status" in status and status["retweeted_status"] is not None:
            # Only the id of the retweeted status is stored.
            status["retweeted_status"] = status["retweeted_status"]["id"]
        elif justretweets:
            continue

        # Derive screen_name / user_id from the embedded user object.  A
        # missing or null user simply yields screen_name = None.
        user = status.get("user")
        screen_name = None
        if user is not None:
            for key in ("screen_name", "name", "domain"):
                if user.get(key) is not None:
                    screen_name = user[key].encode("utf8")
                    break
            status["user_id"] = user["id"]
        status["screen_name"] = screen_name

        try:
            row = pgconn.insert(table_name, status)
        except (pg.ProgrammingError, pg.InternalError):
            # The tuple is required: "except A, B" in Python 2 catches only A
            # and rebinds B to the exception instance.
            # A failed insert is treated as "row already in the database".
            if not tobeginning and opt == 7:
                tobeginning_grace_count += 1  # have a bit of loose in case the next comments page contains some of the previous in sinacomments.sh iteration
                print("tobeginning_grace_count: " + str(tobeginning_grace_count))
                if tobeginning_grace_count > tobeginning_grace:
                    print(last_tweet)
                    print("UP-TO-DATE: comments up to date (duplicate found in DB)")
                    break
                if not doupdate:
                    continue
            if not tobeginning and not doupdate:
                # First duplicate means we have caught up: finish this
                # iteration (to stamp posts_updated) and then stop.
                breakafterupdate = True
            if doupdate:
                try:
                    row = pgconn.update(table_name, status)
                    print("updating...")
                    print(row)
                except Exception:
                    print("can't insert or update")
                    print(last_tweet)

        geo = status.get("geo")
        if row is not None and geo is not None:
            coordinates = geo.get("coordinates")
            if geo.get("type") == "Point" and coordinates is not None and len(coordinates) == 2:
                try:
                    # Coerce to float so nothing but numbers reaches the SQL text.
                    lat = float(coordinates[0])
                    lng = float(coordinates[1])
                except (TypeError, ValueError):
                    print("geo error: " + repr(coordinates))
                else:
                    # NOTE(review): WKT points are "x y" (longitude first);
                    # this keeps the existing "lat lng" order -- confirm
                    # before changing, stored rows use the same order.
                    wkt_point = "POINT(" + str(lat) + " " + str(lng) + ")"
                    sql = "UPDATE %(table_name)s SET geo = ST_GeomFromText('%(wkt_point)s', 4326) WHERE id = %(id)d " % {"table_name": table_name, "wkt_point": wkt_point, "id": row["id"]}
                    try:
                        pgconn.query(sql)
                    except Exception:
                        print(sql)
                        print("geo error: " + wkt_point)

        # try to add the user (every record in mode 7, only the first in mode 1)
        if (insert_user or doupdate_user) and user is not None and (opt == 7 or (opt == 1 and j == 0)):
            u = user
            u["retrieved"] = "NOW()"
            _encode_utf8_fields(u, _USER_TEXT_FIELDS)
            try:
                pgconn.insert("sinaweibo_users", u)
            except (pg.ProgrammingError, pg.InternalError):
                try:
                    if doupdate_user:
                        if verbose:
                            print("user: trying to update instead")
                        pgconn.update("sinaweibo_users", u)
                except Exception:
                    print("user: an error has occurred (row cannot be updated)")
                    if verbose:
                        print(u)

        # Stamp the user's posts_updated on the last record, or right before
        # stopping early.
        if (j == len(js) - 1 or breakafterupdate) and user is not None:
            sql = "UPDATE sinaweibo_users SET posts_updated = NOW() WHERE id = %(id)d " % {"id": user["id"]}
            pgconn.query(sql)
        if breakafterupdate:
            if verbose:
                print("up to date. breaking...")
            break
elif opt == 2:
    # Mode 2: store user records.
    table_name += "_users"
    if not isinstance(js, list):
        js = [js]
    if len(js) <= 0:
        print(js)
        sys.exit()
    for entry in js:
        # With --from-statuses the user is embedded in a status object.
        if fromstatuses and "user" in entry:
            record = entry["user"]
        else:
            record = entry
        # Stamp the record that is actually stored (not the enclosing status).
        record["retrieved"] = "NOW()"
        _encode_utf8_fields(record, _USER_TEXT_FIELDS)
        try:
            pgconn.insert(table_name, record)
            if verbose:
                print("SUCCESS," + str(record["statuses_count"]))
        except (pg.ProgrammingError, pg.InternalError):
            print("user duplicate found in DB...")
            try:
                if doupdate:
                    print("trying to update instead")
                    pgconn.update(table_name, record)
                else:
                    print(record)
            except Exception:
                print("an error has occurred (row cannot be updated)")
                print(record)
elif opt == 3 or opt == 4:
    # Modes 3 / 4: store one (source_id, target_id) row per listed id.
    if opt == 3:
        table_name += "_friends"
    else:
        table_name += "_followers"
    for target_id in js["ids"]:
        r = {"source_id": user_id, "target_id": target_id}
        r["retrieved"] = "NOW()"
        try:
            pgconn.insert(table_name, r)
        except (pg.ProgrammingError, pg.InternalError):
            # Insert failed (treated as a duplicate): refresh the row instead.
            try:
                pgconn.update(table_name, r)
            except Exception:
                print(r)
                print("duplicate and cannot update")
elif opt == 8:
    # Mode 8: store comment records.
    table_name += "_comments"
    tobeginning_grace_count = 0
    for comment in js:
        last_one = comment
        _encode_utf8_fields(comment, ["text"])
        if "user" in comment:
            comment["user_id"] = comment["user"]["id"]
        if "status" in comment:
            comment["status_id"] = comment["status"]["id"]
        try:
            pgconn.insert(table_name, comment)
        except (pg.ProgrammingError, pg.InternalError):
            if not tobeginning:
                tobeginning_grace_count += 1  # have a bit of loose in case the next comments page contains some of the previous in sinacomments.sh iteration
                print("tobeginning_grace_count: " + str(tobeginning_grace_count))
                if tobeginning_grace_count > tobeginning_grace:
                    print(last_one)
                    print("UP-TO-DATE: comments up to date (duplicate found in DB)")
                    break
                if not doupdate:
                    continue
            if doupdate:
                try:
                    print("trying to update instead")
                    pgconn.update(table_name, comment)
                except Exception:
                    print("an error has occurred (row cannot be updated)")
                    print(comment)
        # try to add the user
        if insert_user and "user" in comment:
            u = comment["user"]
            u["retrieved"] = "NOW()"
            _encode_utf8_fields(u, _USER_TEXT_FIELDS)
            try:
                pgconn.insert("sinaweibo_users", u)
            except (pg.ProgrammingError, pg.InternalError):
                try:
                    if doupdate:
                        print("user: trying to update instead")
                        pgconn.update("sinaweibo_users", u)
                    else:
                        print(u)
                except Exception:
                    print("user: an error has occurred (row cannot be updated)")
                    print(u)

f.close()
Something went wrong with that request. Please try again.