In [1]:
import requests
import datetime
import json
import csv
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import traceback
import logging
import seaborn as sns
from pandas.io.json import json_normalize
%matplotlib inline

import sys

# Keep handles to the notebook's stream objects so they can be restored
# after sys is reloaded below.
default_stdout = sys.stdout
default_stderr = sys.stderr

# Python 2 only: force the process-wide default string encoding to UTF-8.
# `reload` is not a builtin on Python 3 and `sys.setdefaultencoding` does not
# exist there (the default encoding is already UTF-8), so the hack is skipped.
if sys.version_info[0] < 3:
    # reload(sys) re-exposes setdefaultencoding, which site.py removes at startup.
    reload(sys)  # noqa: F821
    # Put back the streams that were saved before the reload.
    sys.stdout = default_stdout
    sys.stderr = default_stderr
    sys.setdefaultencoding('utf-8')



In [2]:
def check_connect_output_json(url_address, limit_retry_number=20, retry_delay=5):
    """
    Fetch ``url_address`` and return the decoded JSON body, retrying on failure.

    A request counts as failed when it raises, when the status code is not
    200, or when the body cannot be decoded as JSON. Failed attempts are
    reported on stdout and followed by a pause before the next try.

    Parameters
    ----------
    url_address : str
        URL to request with ``requests.get``.
    limit_retry_number : int, optional
        Maximum number of attempts (default 20).
    retry_delay : int or float, optional
        Seconds to sleep between attempts (default 5).

    Returns
    -------
    The decoded JSON of the first successful response, or the sentinel
    ``{"this url": {"is bad"}}`` when every attempt failed.

    Raises
    ------
    KeyboardInterrupt, SystemExit
        Re-raised immediately so the user can still abort the loop.
    """
    for attempt in range(limit_retry_number):
        try:
            r = requests.get(url_address)
            if r.status_code == 200:
                r.encoding = "utf-8"
                return r.json()
            reason = "HTTP %s" % r.status_code
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            # Best effort: any other failure (network error, invalid JSON, ...)
            # is reported and retried.
            reason = repr(e)
        print("retry %d times! %ssec between and retry only %d times. (%s)"
              % (attempt, retry_delay, limit_retry_number, reason))
        # No point in sleeping after the final attempt.
        if attempt + 1 < limit_retry_number:
            time.sleep(retry_delay)
    # All attempts failed. The previous `if i == 20` check inside the loop could
    # never be true (range(20) stops at 19), so the function fell through to an
    # unbound local instead of returning this sentinel.
    return {"this url": {"is bad"}}

def get_page_posts_to_df(url,page_num):
    """
    Download a page's posts, following the paging links, into one DataFrame.

    Parameters
    ----------
    url : str
        URL whose JSON response has a top-level ``'posts'`` object (as built
        by ``fb_pages_info``).
    page_num : int
        Stop after this many result pages have been collected. A value that
        is never reached (e.g. 0) means "follow the links until they run out".

    Returns
    -------
    pandas.DataFrame
        The flattened ``'data'`` records of every collected page, with a fresh
        0..n-1 index. Empty when no page carried a ``'paging'`` entry.

    Raises
    ------
    KeyError
        If the first response has no ``'posts'`` key (for example when the
        initial request failed).
    """
    jsondata = check_connect_output_json(url)['posts']
    frames = []
    pages_read = 0
    # Keep going as long as the response carries paging information.
    while 'paging' in jsondata:
        pages_read += 1
        frames.append(json_normalize(jsondata.get('data', [])))
        print("%d %s" % (pages_read, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        # Stop once the requested number of pages is reached, before spending
        # a request on a page that would be thrown away.
        if pages_read == page_num:
            break
        # A paging block without a 'next' link is treated as the last page
        # instead of raising KeyError.
        next_url = jsondata['paging'].get('next')
        if not next_url:
            break
        # Follow-up pages go through the same retrying fetch as the first one.
        jsondata = check_connect_output_json(next_url)
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing the frame page by page.
    return pd.concat(frames, ignore_index=True)
def clean_page_post(datadf, tz_offset_hours=8):
    """
    Tidy the raw posts frame after it has been downloaded.

    Cleaning is kept separate from downloading so that a mistake here does
    not force the posts to be fetched again.

    Steps: blank out cells holding ``[]`` or ``None``, drop columns that are
    then entirely empty, rename the nine remaining columns, reorder them, and
    parse ``create_time`` and shift it by ``tz_offset_hours``.

    Parameters
    ----------
    datadf : pandas.DataFrame
        Raw frame as returned by ``get_page_posts_to_df``. It is not modified.
    tz_offset_hours : int or float, optional
        Hours added to the parsed timestamps (default 8).

    Returns
    -------
    pandas.DataFrame
        Columns ``id, create_time, name, message, link, type, comments,
        reactions, shares``.

    Raises
    ------
    ValueError
        If the number of non-empty columns is not the expected nine.
    """
    new_names = ['comments','create_time','id','link','message','name','reactions','shares','type']
    od=['id','create_time','name','message','link','type','comments','reactions','shares']

    # True where a cell is an empty list or None; those cells become NaN.
    is_empty = datadf.apply(lambda col: col.map(lambda x: x == [] or x is None))
    cleaned = datadf.where(~is_empty).dropna(axis=1, how="all")

    # The new names are assigned by position and are listed alphabetically, so
    # the raw columns must be in alphabetical order too. Sort explicitly rather
    # than relying on the order the columns happened to arrive in.
    cleaned = cleaned[sorted(cleaned.columns)]
    if len(cleaned.columns) != len(new_names):
        raise ValueError(
            "expected %d non-empty columns, got %d: %s"
            % (len(new_names), len(cleaned.columns), list(cleaned.columns)))
    cleaned.columns = new_names

    # Copy so the assignment below writes to an independent frame.
    result = cleaned[od].copy()
    parsed = pd.to_datetime(result['create_time'], format="%Y-%m-%dT%H:%M:%S+0000")
    result['create_time'] = parsed + datetime.timedelta(hours=tz_offset_hours)
    return result

def post_reactions(clean_post,access_token):
    """
    Fetch the per-type reaction counts of every post in ``clean_post``.

    One request is made per post, so this is by far the slowest step;
    running the requests in threads would be the obvious speed-up.

    Parameters
    ----------
    clean_post : pandas.DataFrame
        Frame with an ``id`` column of post ids.
    access_token : str
        Query-string fragment passed through to ``reactions_number``.

    Returns
    -------
    pandas.DataFrame
        One row per post with columns ``id, angry, haha, like, love, sad,
        wow`` and a fresh 0..n-1 index. Empty (with those columns) when
        ``clean_post`` has no rows.

    Raises
    ------
    ValueError
        If the number of non-empty columns in the responses is not seven.
    """
    new_names = ['angry','haha','id','like','love','sad','wow']
    od=['id','angry','haha','like','love','sad','wow']
    id_list = clean_post.id.tolist()
    print("%s start reactions" % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    # Collect one small frame per post and concatenate once at the end.
    frames = []
    for post_id in id_list:
        react_url = reactions_number(post_id, access_token)
        frames.append(json_normalize(check_connect_output_json(react_url)))

    if not frames:
        # Nothing to fetch: return an empty frame that still merges on "id".
        reactions_df = pd.DataFrame(columns=od)
    else:
        raw = pd.concat(frames, ignore_index=True)
        # Blank out cells holding [] or None, then drop all-empty columns.
        is_empty = raw.apply(lambda col: col.map(lambda x: x == [] or x is None))
        cleaned = raw.where(~is_empty).dropna(axis=1, how="all")
        # Names are assigned by position and listed alphabetically, so sort the
        # raw columns first rather than trusting the response's key order.
        cleaned = cleaned[sorted(cleaned.columns)]
        if len(cleaned.columns) != len(new_names):
            raise ValueError(
                "expected %d non-empty columns, got %d: %s"
                % (len(new_names), len(cleaned.columns), list(cleaned.columns)))
        cleaned.columns = new_names
        reactions_df = cleaned[od]

    print("%s finished reactions" % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    return reactions_df



In [70]:
def reactions_number(status_id,access_token):
    """
    Build the Graph API v2.8 URL that asks for a post's reaction totals.

    The query requests a ``total_count`` summary for each reaction type
    (LIKE, LOVE, WOW, HAHA, SAD, ANGRY), aliased to the lower-case type name.
    Note from the original author: posts without any reactions were expected
    to come back empty, but the counts are simply reported as 0.

    Parameters
    ----------
    status_id : post id, interpolated into the URL path.
    access_token : str
        Ready-made ``access_token=...`` query fragment, appended after ``&``.

    Returns
    -------
    str
        The complete request URL.
    """
    reaction_kinds = ("LIKE", "LOVE", "WOW", "HAHA", "SAD", "ANGRY")
    field_list = ",".join(
        "reactions.type(%s).limit(0).summary(total_count).as(%s)" % (kind, kind.lower())
        for kind in reaction_kinds
    )
    endpoint = "https://graph.facebook.com/v2.8/%s" % status_id
    return endpoint + "/?fields=" + field_list + "&" + access_token
def fb_pages_info(page_id,access_token):
    """
    Build the Graph API v2.8 URL that lists a fan page's posts.

    Parameters
    ----------
    page_id : str
        Fan page id or name, placed directly after the API root.
    access_token : str
        Ready-made ``access_token=...`` query fragment, appended after ``&``.

    Returns
    -------
    str
        The complete request URL.
    """
    # API root; the version is pinned to 2.8.
    api_root = "https://graph.facebook.com/v2.8/"
    # Post fields to request, including comment and reaction totals.
    post_fields = (
        "/?fields=posts{id,name,link,message,shares,created_time,type,"
        "comments.limit(0).summary(total_count),"
        "reactions.limit(0).summary(total_count)}"
    )
    # Stitch the pieces together in order: root, page, fields, token.
    return "".join([api_root, page_id, post_fields, "&", access_token])

In [77]:
import os

# Facebook app credentials are read from the environment so that they are
# never saved inside the notebook. Set FB_APP_ID and FB_APP_SECRET to the
# values of your own Facebook app before running this cell.
app_id = os.environ.get("FB_APP_ID", "")
app_secret = os.environ.get("FB_APP_SECRET", "")
if not (app_id and app_secret):
    # Fail fast rather than retrying an unauthenticated URL for minutes.
    raise ValueError("Set the FB_APP_ID and FB_APP_SECRET environment variables first.")
# App access token in the "<app id>|<app secret>" form, as a query fragment.
access_token = "access_token="+app_id + "|" + app_secret

# Purely a record of which fan pages have been tried so far.
page_id_list = ['YaoTurningTaipei','DoctorKoWJ','appledaily.tw',"gamer.com.tw","crazyck101","WoTBlitzTW"]
page_id = "WoTBlitzTW"

url = fb_pages_info(page_id,access_token) # URL for the selected fan page's posts
page_post = get_page_posts_to_df(url,50) # download the posts; 50 is the maximum number of result pages
clean_post=clean_page_post(page_post) # tidy the posts frame
reactions=post_reactions(clean_post,access_token) # fetch and tidy each post's reaction counts
merge_reactions = pd.merge(reactions,clean_post,on="id") # join reactions with posts: the final frame

# merge_reactions.to_csv(page_id+'.csv',index=False)
# merge_reactions.plot(merge_reactions.create_time,figsize=(12,8))

1 2017-04-17 11:01:12
2 2017-04-17 11:01:13
3 2017-04-17 11:01:13
4 2017-04-17 11:01:14
5 2017-04-17 11:01:14
6 2017-04-17 11:01:15
7 2017-04-17 11:01:16
8 2017-04-17 11:01:16
9 2017-04-17 11:01:17
10 2017-04-17 11:01:17
11 2017-04-17 11:01:18
12 2017-04-17 11:01:19
13 2017-04-17 11:01:20
14 2017-04-17 11:01:20
15 2017-04-17 11:01:21
16 2017-04-17 11:01:22
17 2017-04-17 11:01:27
18 2017-04-17 11:01:29
19 2017-04-17 11:01:29
20 2017-04-17 11:01:30
21 2017-04-17 11:01:31
22 2017-04-17 11:01:31
23 2017-04-17 11:01:32
24 2017-04-17 11:01:32
25 2017-04-17 11:01:33
26 2017-04-17 11:01:34
27 2017-04-17 11:01:34
28 2017-04-17 11:01:35
29 2017-04-17 11:01:36
30 2017-04-17 11:01:36
31 2017-04-17 11:01:37
32 2017-04-17 11:01:38
33 2017-04-17 11:01:38
34 2017-04-17 11:01:39
35 2017-04-17 11:01:40
36 2017-04-17 11:01:40
37 2017-04-17 11:01:41
38 2017-04-17 11:01:41
done
2017-04-17 11:01:41
2017-04-17 11:01:41
2017-04-17 11:01:41
2017-04-17 11:06:26
