<a href="https://colab.research.google.com/github/KizMan-23/sentiment_analysis/blob/main/bitcoin_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (relies on the google.colab package
# imported above).
# NOTE(review): none of the cells visible in this notebook read from or write
# to /content/drive — confirm the mount is still needed.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning, module="ipykernel.ipkernel")

In [None]:

!pip install mwclient



In [None]:
import mwclient as mw
import time

In [None]:
# Point the mwclient client at English Wikipedia and look up the "Bitcoin"
# page; `page` is the object whose revision history is read below.
site = mw.Site('en.wikipedia.org')
page = site.pages['Bitcoin']

In [None]:
# Pull every revision record of the page into a list so it can be sorted and
# iterated more than once in later cells.
revs = list(page.revisions())

In [None]:
# Inspect the first record as returned (in the saved output this is the most
# recent revision, i.e. the list arrives newest-first).
revs[0]

OrderedDict([('revid', 1254525564),
             ('parentid', 1254525362),
             ('user', 'A455bcd9'),
             ('timestamp',
              time.struct_time(tm_year=2024, tm_mon=10, tm_mday=31, tm_hour=12, tm_min=27, tm_sec=56, tm_wday=3, tm_yday=305, tm_isdst=-1)),
             ('comment',
              '/* 2008–2009: Creation */ Per the source: "Moreover, Nakamoto didn\'t care for academic peer review and didn\'t fully connect it to its history. As a result, academics essentially ignored bitcoin for several years. Many academic communities informally argued that Bitcoin couldn\'t work, based on theoretical models or experiences with past systems, despite the fact that it was working in practice."')])

In [None]:
# Order revisions oldest-first. The timestamps are time.struct_time values,
# which compare field by field (year, month, day, ...), so this ordering is
# chronological.
revs = sorted(revs, key=lambda revision: revision["timestamp"])

In [None]:
# After sorting, the first record is the oldest revision (the article's
# creation stub in the saved output).
revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [None]:
# Total number of revision records fetched.
len(revs)

17924

In [None]:
from transformers import pipeline

In [None]:
# Build the sentiment classifier with an explicit checkpoint and revision.
# These are the same model/revision the task default resolved to, but pinning
# them keeps scores reproducible if the library's default ever changes and
# avoids the "no model was supplied" warning.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
def find_sentiment(text):
    """Return a signed sentiment score for ``text``.

    Only the first 250 characters of ``text`` are handed to the module-level
    ``sentiment_pipeline``. The result is the pipeline's ``score`` for its
    predicted label, negated when that label is ``"NEGATIVE"`` so that
    negative sentiment maps to values below zero.
    """
    snippet = text[:250]
    prediction = sentiment_pipeline([snippet])[0]
    if prediction["label"] == "NEGATIVE":
        return -prediction["score"]
    return prediction["score"]

In [None]:
# Sanity check on a clearly negative sentence; a score below zero is expected.
# (The probe previously misspelled "appalled", so the model was scoring a
# non-word rather than the intended sentiment.)
find_sentiment("I am very appalled")

0.9981369972229004

In [None]:
# Aggregate revisions per calendar day: the number of edits made that day and
# the sentiment score of each edit comment.
edits = {}

for rev in revs:
    # Day key in YYYY-MM-DD form, derived from the revision's struct_time.
    date = time.strftime("%Y-%m-%d", rev["timestamp"])

    # First revision seen for this day: start an empty record.
    if date not in edits:
        edits[date] = dict(sentiments=list(), edit_count=0)

    edits[date]["edit_count"] += 1

    # Not every revision record has a 'comment' key (direct indexing failed
    # with KeyError: 'comment'), so fall back to an empty string. Note that
    # the empty string is still scored by the model like any other comment.
    comment = rev.get("comment", "")
    edits[date]["sentiments"].append(find_sentiment(comment))

KeyError: 'comment'

In [None]:
from statistics import mean

# Collapse each day's list of comment scores into two summary numbers:
#   sentiment      - mean signed score of that day's comments
#   neg_sentiment  - fraction of that day's comments scored below zero
# The raw list is removed afterwards so only scalar columns remain.
for day_stats in edits.values():
    # pop() with a default keeps this cell re-runnable: once a day has been
    # summarised its "sentiments" list is gone, and a second run must not
    # raise KeyError or overwrite the values already computed.
    scores = day_stats.pop("sentiments", None)
    if scores is None:
        continue

    if len(scores) > 0:
        day_stats["sentiment"] = mean(scores)
        day_stats["neg_sentiment"] = sum(1 for s in scores if s < 0) / len(scores)
    else:
        # No comments recorded for the day: treat as neutral.
        day_stats["sentiment"] = 0
        day_stats["neg_sentiment"] = 0

In [None]:
# Preview the first few days instead of dumping every entry of the dict,
# which holds one record per day with at least one edit.
dict(list(edits.items())[:5])

In [None]:
import pandas as pd

# One row per day (the dict keys become the index), one column per summary
# field stored in each day's record.
edits_df = pd.DataFrame.from_dict(edits, orient='index')

In [None]:
# First ten per-day rows: edit count, mean sentiment, share of negative comments.
edits_df.head(10)

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-08-05,1,0.748121,0.0
2009-08-06,2,0.995746,0.0
2009-08-14,1,0.930021,0.0
2009-10-13,2,-0.227501,0.5
2009-11-18,1,0.883951,0.0
2009-12-08,1,-0.986927,1.0
2009-12-17,1,-0.997517,1.0
2010-02-23,1,-0.999495,1.0
2010-03-18,1,0.875877,0.0


In [None]:
# Inspect the index type before converting it.
# NOTE(review): the saved output already shows a DatetimeIndex (ending in
# 2022) although the conversion to datetime happens in the following cell —
# the stored outputs appear to come from an earlier, out-of-order run.
edits_df.index

DatetimeIndex(['2009-03-08', '2009-08-05', '2009-08-06', '2009-08-14',
               '2009-10-13', '2009-11-18', '2009-12-08', '2009-12-17',
               '2010-02-23', '2010-03-18',
               ...
               '2022-08-02', '2022-08-14', '2022-08-17', '2022-08-23',
               '2022-08-29', '2022-09-01', '2022-09-02', '2022-09-06',
               '2022-09-08', '2022-09-10'],
              dtype='datetime64[ns]', length=2525, freq=None)

In [None]:
# Turn the 'YYYY-MM-DD' string labels into a DatetimeIndex so the frame can be
# reindexed against a continuous date range below.
edits_df.index = pd.to_datetime(edits_df.index)

In [None]:
from datetime import datetime

# Continuous daily calendar from 2009-03-08 (the date of the earliest revision
# in the saved output above) through today.
# NOTE(review): the start date is hard-coded; it would need updating if a
# different article were analysed.
dates = pd.date_range(start="2009-03-08",end=datetime.today())

In [None]:
# Add a row for every calendar day in `dates`; days with no edits get 0.0 in
# every column (zero edits, zero sentiment, zero negative share).
edits_df = edits_df.reindex(dates, fill_value=0.0)

In [None]:
# Display the gap-filled daily frame.
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4.0,-0.550525,0.75
2009-03-09,0.0,0.000000,0.00
2009-03-10,0.0,0.000000,0.00
2009-03-11,0.0,0.000000,0.00
2009-03-12,0.0,0.000000,0.00
...,...,...,...
2024-10-31,0.0,0.000000,0.00
2024-11-01,0.0,0.000000,0.00
2024-11-02,0.0,0.000000,0.00
2024-11-03,0.0,0.000000,0.00


In [None]:
# 30-row (i.e. 30-day, since the index is daily) trailing mean of every column.
# min_periods=30 means rows without a full 30-day history come out as NaN.
rolling_edits = edits_df.rolling(30, min_periods=30).mean()

In [None]:
# Drop the leading rows that have no full rolling window (NaN in every column).
rolling_edits = rolling_edits.dropna()

In [None]:
# First ten rows of the smoothed series.
rolling_edits.head(10)

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-04-06,0.133333,-0.018351,0.025
2009-04-07,0.0,0.0,0.0
2009-04-08,0.0,0.0,0.0
2009-04-09,0.0,0.0,0.0
2009-04-10,0.0,0.0,0.0
2009-04-11,0.0,0.0,0.0
2009-04-12,0.0,0.0,0.0
2009-04-13,0.0,0.0,0.0
2009-04-14,0.0,0.0,0.0
2009-04-15,0.0,0.0,0.0


In [None]:
# Persist the smoothed daily features. The path is relative, so the file is
# written to the notebook's current working directory.
rolling_edits.to_csv("wikipedia_edits.csv")