In [1]:
from tqdm import tqdm 

import mwclient         # it is used for getting the data from the wikipedia 
import time 

website = mwclient.Site("en.wikipedia.org")            # it is to say that i am using the english wiki
page    = website.pages["Bitcoin"]                     # this is what i am opening in wiki

We are talking out the edits of the bitcoin page in the wikipedia as it will help in understanding the interest of the people in bitcoin and it will also help in understanding some of the sentiments.

In [2]:
revs = list(tqdm(page.revisions()))

17831it [03:20, 88.81it/s] 


In [3]:
revs[0]        # here we got the latest edit in the bitcoin page 

OrderedDict([('revid', 1212219544),
             ('parentid', 1210641921),
             ('user', 'FCBWanderer'),
             ('timestamp',
              time.struct_time(tm_year=2024, tm_mon=3, tm_mday=6, tm_hour=18, tm_min=40, tm_sec=1, tm_wday=2, tm_yday=66, tm_isdst=-1)),
             ('comment', '/* Units and divisibility */')])

In [4]:
revs = sorted(revs, key=lambda rev: rev["timestamp"])            # this is to reverse the process to get the first edit 

In [5]:
revs[0]         # here we got the first edit of the bitcoin page

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [6]:
from transformers import pipeline        # inorder to find the sentiments, we are going to use the library called as transformers
sentiment_pipeline = pipeline("sentiment-analysis") 

# we are going to use a function to use this sentiment pipeline to find the sentiment 
def find_sentiment(text):
    sent = sentiment_pipeline([text[:250]])[0]
    score = sent["score"]
    if sent["label"] == "NEGATIVE":
        score *= -1
    return score

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [7]:
find_sentiment('i love you')

0.9998656511306763

In [8]:
find_sentiment('i hate you')

-0.9991129040718079

In [9]:
edits = {}

for rev in tqdm(revs):
    date = time.strftime("%Y-%m-%d", rev["timestamp"])
# we are doing this because above we can see that the timestamp above is in time.struct_time format 
# which can get a little tricky so thats why we are converting it to time.strftime format which will be easy for us 
# strftime stands for string format time 
    
    if date not in edits:
        edits[date] = dict(sentiments = list(), edit_count = 0)
        
    edits[date]["edit_count"] += 1
    
    comment = rev.get("comment", "")
    edits[date]["sentiments"].append(find_sentiment(comment))
     
# so this will give us a dictionary called edits, where each key is a date and then we have the number of times that the bitcoin page was edited on that date and along with the sentiments of the comments of that page

100%|█████████████████████████████████████| 17831/17831 [13:01<00:00, 22.83it/s]


In [10]:
edits

{'2009-03-08': {'sentiments': [-0.9905920624732971,
   0.7481210231781006,
   -0.9907428622245789,
   -0.9688861966133118],
  'edit_count': 4},
 '2009-08-05': {'sentiments': [0.7481210231781006], 'edit_count': 1},
 '2009-08-06': {'sentiments': [0.995745837688446, 0.995745837688446],
  'edit_count': 2},
 '2009-08-14': {'sentiments': [0.9300212860107422], 'edit_count': 1},
 '2009-10-13': {'sentiments': [0.5404373407363892, -0.9954361319541931],
  'edit_count': 2},
 '2009-11-18': {'sentiments': [0.8839510679244995], 'edit_count': 1},
 '2009-12-08': {'sentiments': [-0.9869275689125061], 'edit_count': 1},
 '2009-12-17': {'sentiments': [-0.9975171089172363], 'edit_count': 1},
 '2010-02-23': {'sentiments': [-0.9994946718215942], 'edit_count': 1},
 '2010-03-18': {'sentiments': [0.8758776783943176], 'edit_count': 1},
 '2010-04-13': {'sentiments': [0.9300212860107422,
   0.8158007860183716,
   0.8158007860183716,
   0.8158007860183716],
  'edit_count': 4},
 '2010-04-15': {'sentiments': [0.930021

In [11]:
from statistics import mean 

for key in edits:
    if len(edits[key]["sentiments"]) > 0:
        edits[key]["sentiment"] = mean(edits[key]["sentiments"])
        edits[key]["neg_sentiment"] = len([s for s in edits[key]['sentiments'] if s < 0]) / len(edits[key]["sentiments"])     # it will give the percentage of negative comments in the sentiments
    else:
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0
        
    del edits[key]["sentiments"]

In [12]:
edits

{'2009-03-08': {'edit_count': 4,
  'sentiment': -0.5505250245332718,
  'neg_sentiment': 0.75},
 '2009-08-05': {'edit_count': 1,
  'sentiment': 0.7481210231781006,
  'neg_sentiment': 0.0},
 '2009-08-06': {'edit_count': 2,
  'sentiment': 0.995745837688446,
  'neg_sentiment': 0.0},
 '2009-08-14': {'edit_count': 1,
  'sentiment': 0.9300212860107422,
  'neg_sentiment': 0.0},
 '2009-10-13': {'edit_count': 2,
  'sentiment': -0.22749939560890198,
  'neg_sentiment': 0.5},
 '2009-11-18': {'edit_count': 1,
  'sentiment': 0.8839510679244995,
  'neg_sentiment': 0.0},
 '2009-12-08': {'edit_count': 1,
  'sentiment': -0.9869275689125061,
  'neg_sentiment': 1.0},
 '2009-12-17': {'edit_count': 1,
  'sentiment': -0.9975171089172363,
  'neg_sentiment': 1.0},
 '2010-02-23': {'edit_count': 1,
  'sentiment': -0.9994946718215942,
  'neg_sentiment': 1.0},
 '2010-03-18': {'edit_count': 1,
  'sentiment': 0.8758776783943176,
  'neg_sentiment': 0.0},
 '2010-04-13': {'edit_count': 4,
  'sentiment': 0.84435591101646

In [13]:
import pandas as pd

edits_df = pd.DataFrame.from_dict(edits, orient = "index")

In [14]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.750000
2009-08-05,1,0.748121,0.000000
2009-08-06,2,0.995746,0.000000
2009-08-14,1,0.930021,0.000000
2009-10-13,2,-0.227499,0.500000
...,...,...,...
2024-02-23,7,-0.456403,0.714286
2024-02-25,2,-0.005358,0.500000
2024-02-26,1,-0.996016,1.000000
2024-02-27,3,-0.321697,0.666667


In [15]:
edits_df.index = pd.to_datetime(edits_df.index)

In [16]:
from datetime import datetime

dates = pd.date_range(start = "2009-03-08", end = datetime.today())

In [17]:
dates

DatetimeIndex(['2009-03-08', '2009-03-09', '2009-03-10', '2009-03-11',
               '2009-03-12', '2009-03-13', '2009-03-14', '2009-03-15',
               '2009-03-16', '2009-03-17',
               ...
               '2024-03-06', '2024-03-07', '2024-03-08', '2024-03-09',
               '2024-03-10', '2024-03-11', '2024-03-12', '2024-03-13',
               '2024-03-14', '2024-03-15'],
              dtype='datetime64[ns]', length=5487, freq='D')

In [18]:
edits_df = edits_df.reindex(dates, fill_value = 0)

In [19]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-03-09,0,0.000000,0.00
2009-03-10,0,0.000000,0.00
2009-03-11,0,0.000000,0.00
2009-03-12,0,0.000000,0.00
...,...,...,...
2024-03-11,0,0.000000,0.00
2024-03-12,0,0.000000,0.00
2024-03-13,0,0.000000,0.00
2024-03-14,0,0.000000,0.00


In [20]:
rolling_edits = edits_df.rolling(30).mean()

In [21]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,,,
2009-03-09,,,
2009-03-10,,,
2009-03-11,,,
2009-03-12,,,
...,...,...,...
2024-03-11,1.333333,-0.180856,0.284921
2024-03-12,1.333333,-0.180856,0.284921
2024-03-13,1.333333,-0.180856,0.284921
2024-03-14,1.200000,-0.164037,0.259921


In [22]:
rolling_edits = rolling_edits.dropna()

In [23]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-04-06,0.133333,-0.018351,0.025000
2009-04-07,0.000000,0.000000,0.000000
2009-04-08,0.000000,0.000000,0.000000
2009-04-09,0.000000,0.000000,0.000000
2009-04-10,0.000000,0.000000,0.000000
...,...,...,...
2024-03-11,1.333333,-0.180856,0.284921
2024-03-12,1.333333,-0.180856,0.284921
2024-03-13,1.333333,-0.180856,0.284921
2024-03-14,1.200000,-0.164037,0.259921


In [24]:
rolling_edits.to_csv("wikipedia_edits.csv")