In [1]:
import mwclient
import time

In [3]:
site=mwclient.Site('en.wikipedia.org')
pages=site.pages['Bitcoin']

In [5]:
revs=list(pages.revisions())

In [6]:
revs[0]

OrderedDict([('revid', 1252526753),
             ('parentid', 1250769296),
             ('user', 'PiggyGull'),
             ('timestamp',
              time.struct_time(tm_year=2024, tm_mon=10, tm_mday=21, tm_hour=19, tm_min=2, tm_sec=22, tm_wday=0, tm_yday=295, tm_isdst=-1)),
             ('comment', '28.0 release date')])

In [8]:
revs=sorted(revs,key=lambda k:k['timestamp'])

In [9]:
revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [13]:
from transformers import pipeline

In [17]:
pipe=pipeline("text-classification")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [18]:
def text_analysis(text):
    sent=pipe([text[0:250]])[0]
    score=sent['score']
    if sent['label']=='NEGATIVE':
        score*=-1
    return score

In [19]:
text_analysis("I love lekha")

0.9997971653938293

In [36]:
edits={}

for rev in revs:
    date=time.strftime("%y-%m-%d",rev['timestamp'])

    if date not in edits:
        edits[date]=dict(sentiments=list(),edit_count=0)
    
    edits[date]['edit_count']+=1

    try:
        comment=rev["comment"]
        edits[date]['sentiments'].append(text_analysis(comment))
    except:
        edits[date]['sentiments'].append(0)



In [45]:
del edits['09-03-08']

In [46]:
edits

{'09-08-05': {'sentiments': [0.7481208443641663], 'edit_count': 1},
 '09-08-06': {'sentiments': [0.995745837688446, 0.995745837688446],
  'edit_count': 2},
 '09-08-14': {'sentiments': [0.9300215244293213], 'edit_count': 1},
 '09-10-13': {'sentiments': [0.5404365062713623, -0.9954361319541931],
  'edit_count': 2},
 '09-11-18': {'sentiments': [0.8839513063430786], 'edit_count': 1},
 '09-12-08': {'sentiments': [-0.9869275689125061], 'edit_count': 1},
 '09-12-17': {'sentiments': [-0.9975171089172363], 'edit_count': 1},
 '10-02-23': {'sentiments': [-0.9994946718215942], 'edit_count': 1},
 '10-03-18': {'sentiments': [0.8758779168128967], 'edit_count': 1},
 '10-04-13': {'sentiments': [0.9300215244293213,
   0.815800666809082,
   0.815800666809082,
   0.815800666809082],
  'edit_count': 4},
 '10-04-15': {'sentiments': [0.9300215244293213,
   0.7481208443641663,
   0.7481208443641663,
   0.7481208443641663,
   0.815800666809082,
   0.815800666809082,
   -0.9969743490219116,
   0.815800666809082

In [53]:
from statistics import mean
for date in edits:
    if len(edits[date]['sentiments'])>0:
        edits[date]['sentiment-mean']=mean(edits[date]['sentiments'])
        edits[date]['neg-sentiment']=len([s for s in edits[date]['sentiments'] if s<0])/len(edits[date]['sentiments'])
    else:
        edits[date]['sentiment-mean']=0
        edits[date]['neg-sentiment']=0
    
    del edits[date]['sentiments']

In [54]:
edits

{'09-08-05': {'edit_count': 1,
  'sentiment-mean': 0.7481208443641663,
  'neg-sentiment': 0.0},
 '09-08-06': {'edit_count': 2,
  'sentiment-mean': 0.995745837688446,
  'neg-sentiment': 0.0},
 '09-08-14': {'edit_count': 1,
  'sentiment-mean': 0.9300215244293213,
  'neg-sentiment': 0.0},
 '09-10-13': {'edit_count': 2,
  'sentiment-mean': -0.2274998128414154,
  'neg-sentiment': 0.5},
 '09-11-18': {'edit_count': 1,
  'sentiment-mean': 0.8839513063430786,
  'neg-sentiment': 0.0},
 '09-12-08': {'edit_count': 1,
  'sentiment-mean': -0.9869275689125061,
  'neg-sentiment': 1.0},
 '09-12-17': {'edit_count': 1,
  'sentiment-mean': -0.9975171089172363,
  'neg-sentiment': 1.0},
 '10-02-23': {'edit_count': 1,
  'sentiment-mean': -0.9994946718215942,
  'neg-sentiment': 1.0},
 '10-03-18': {'edit_count': 1,
  'sentiment-mean': 0.8758779168128967,
  'neg-sentiment': 0.0},
 '10-04-13': {'edit_count': 4,
  'sentiment-mean': 0.8443558812141418,
  'neg-sentiment': 0.0},
 '10-04-15': {'edit_count': 8,
  'sen

In [68]:
import pandas as pd

edits_df=pd.DataFrame.from_dict(edits,orient='index')
edits_df=edits_df.sort_index()

In [69]:
edits_df.head(10)

Unnamed: 0,edit_count,sentiment-mean,neg-sentiment
09-08-05,1,0.748121,0.0
09-08-06,2,0.995746,0.0
09-08-14,1,0.930022,0.0
09-10-13,2,-0.2275,0.5
09-11-18,1,0.883951,0.0
09-12-08,1,-0.986928,1.0
09-12-17,1,-0.997517,1.0
10-02-23,1,-0.999495,1.0
10-03-18,1,0.875878,0.0
10-04-13,4,0.844356,0.0


In [70]:
edits_df.index=pd.to_datetime(edits_df.index)

  edits_df.index=pd.to_datetime(edits_df.index)


In [73]:
from datetime import datetime
dates=pd.date_range(start='2009-08-05',end=datetime.today())

In [74]:
dates

DatetimeIndex(['2009-08-05', '2009-08-06', '2009-08-07', '2009-08-08',
               '2009-08-09', '2009-08-10', '2009-08-11', '2009-08-12',
               '2009-08-13', '2009-08-14',
               ...
               '2024-10-17', '2024-10-18', '2024-10-19', '2024-10-20',
               '2024-10-21', '2024-10-22', '2024-10-23', '2024-10-24',
               '2024-10-25', '2024-10-26'],
              dtype='datetime64[ns]', length=5562, freq='D')

In [75]:
edits_df=edits_df.reindex(dates,fill_value=0)

In [77]:
edits_df.shape

(5562, 3)

In [78]:
rolling_edits=edits_df.rolling(30).mean()

In [79]:
rolling_edits

Unnamed: 0,edit_count,sentiment-mean,neg-sentiment
2009-08-05,,,
2009-08-06,,,
2009-08-07,,,
2009-08-08,,,
2009-08-09,,,
...,...,...,...
2024-10-22,1.066667,-0.190901,0.244872
2024-10-23,1.133333,-0.219141,0.278205
2024-10-24,1.133333,-0.219141,0.278205
2024-10-25,1.133333,-0.219141,0.278205


In [81]:
rolling_edits=rolling_edits.dropna()

In [82]:
rolling_edits

Unnamed: 0,edit_count,sentiment-mean,neg-sentiment
2009-09-03,0.500000,-0.043812,0.086508
2009-09-04,0.500000,-0.043812,0.086508
2009-09-05,0.500000,-0.043812,0.086508
2009-09-06,0.500000,-0.043812,0.086508
2009-09-07,0.500000,-0.043812,0.086508
...,...,...,...
2024-10-22,1.066667,-0.190901,0.244872
2024-10-23,1.133333,-0.219141,0.278205
2024-10-24,1.133333,-0.219141,0.278205
2024-10-25,1.133333,-0.219141,0.278205


In [83]:
rolling_edits.to_csv('wikipedia.csv')