In [1]:
import mwclient
import time

site = mwclient.Site('en.wikipedia.org')
page = site.pages['Bitcoin']

In [2]:
revs = list(page.revisions())#创建revisions有序数列

In [6]:
revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [7]:
revs = sorted(revs, key=lambda rev: rev["timestamp"]) #反转排列revs数列

In [8]:
revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [9]:
from transformers import pipeline#导入transformers使用深度学习模型
sentiment_pipeline = pipeline("sentiment-analysis")#sentiment情感分析

def find_sentiment(text):
    sent = sentiment_pipeline([text[:250]])[0]#选取250项尝试,定为第一项
    score = sent["score"]#score预测情感强度范围0到1
    if sent["label"] == "NEGATIVE":
        score *= -1
    return score

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading: 100%|█████████████████████████████████████████████████████████████████████| 629/629 [00:00<00:00, 631kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████████████████████████████████████████████████████████████| 268M/268M [00:33<00:00, 8.07MB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████| 48.0/48.0 [00:00<00:00, 47.5kB/s]
Downloading: 100%|████████████████████████████████

In [10]:
#edits字典里的key是date，每个data里有编辑次数和评论情感分析分数
edits = {}

for rev in revs:        
    date = time.strftime("%Y-%m-%d", rev["timestamp"])
    #时间转换为字符串
    
    if date not in edits:
        edits[date] = dict(sentiments=list(), edit_count=0)
        #如果日期不在edits里，初始化日期并将情绪分数列表和编辑次数写入字典
    
    edits[date]["edit_count"] += 1
    #wiki一天可编辑多次
    comment = rev.get("comment", "")
    #逐次提取评论
    edits[date]["sentiments"].append(find_sentiment(comment))
    #对评价进行情感分析

In [11]:
from statistics import mean
#循环遍历编辑字典
for key in edits:
    if len(edits[key]["sentiments"]) > 0:
        edits[key]["sentiment"] = mean(edits[key]["sentiments"])
        #当天每个评论情感值取平均作为当天情感值
        edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0]) / len(edits[key]["sentiments"])
        #得到当天消极评论占比
    else:
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0
    
    del edits[key]["sentiments"]
    #删除原数据,避免数据框里有之前列表

In [12]:
import pandas as pd
#把edits变成pandas数据框以便和市场数据合并
edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [20]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.750000
2009-03-09,0,0.000000,0.000000
2009-03-10,0,0.000000,0.000000
2009-03-11,0,0.000000,0.000000
2009-03-12,0,0.000000,0.000000
...,...,...,...
2022-12-07,3,0.249740,0.333333
2022-12-08,0,0.000000,0.000000
2022-12-09,0,0.000000,0.000000
2022-12-10,0,0.000000,0.000000


In [21]:
edits_df.index = pd.to_datetime(edits_df.index)

In [22]:
from datetime import datetime

dates = pd.date_range(start="2009-03-08",end=datetime.today())

In [23]:
edits_df = edits_df.reindex(dates, fill_value=0)
#fill_value填满空栏

# edits_df

In [24]:
rolling_edits = edits_df.rolling(30, min_periods=30).mean()

In [25]:
rolling_edits = rolling_edits.dropna()
#去掉没有值的行

In [26]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-04-06,0.133333,-0.018351,0.025000
2009-04-07,0.000000,0.000000,0.000000
2009-04-08,0.000000,0.000000,0.000000
2009-04-09,0.000000,0.000000,0.000000
2009-04-10,0.000000,0.000000,0.000000
...,...,...,...
2022-12-07,0.200000,-0.019026,0.077778
2022-12-08,0.200000,-0.019026,0.077778
2022-12-09,0.200000,-0.019026,0.077778
2022-12-10,0.200000,-0.019026,0.077778


In [27]:
rolling_edits.to_csv("wikipedia_edits.csv")