In [1]:
# Team 12
# Meilun Yao   1076213
# Yingyi Luan 1179002
# Yuntao Lu 1166487
# Jiayi Xu 1165986
# Zheyuan Wu 1166034
import requests
from mastodon import Mastodon, StreamListener
from bs4 import BeautifulSoup
from datetime import datetime
from zoneinfo import ZoneInfo
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from scipy.special import expit

# Mastodon Access Token and Base URL
MASTODON_ACCESS_TOKEN = "Q8oYzpl2vDz6-SOcyf2upfwpUiKS-K9N4qTVLyo2qfA"
MASTODON_BASE_URL = 'https://mastodon.au'

# Initialize Mastodon API
m = Mastodon(api_base_url=MASTODON_BASE_URL, access_token=MASTODON_ACCESS_TOKEN)

# Sentiment Analysis
nltk.download('vader_lexicon', quiet=True)
sid = SentimentIntensityAnalyzer()

# Load Topic Classification Model code copy from https://huggingface.co/cardiffnlp/tweet-topic-21-multi 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
class_mapping = model.config.id2label

# Function for Processing Toots
def toot_processing(raw_toot):
    # Convert and Reformat DateTime to Australia/Sydney Timezone
    sydney_timezone = ZoneInfo("Australia/Sydney")
    sydney_datetime = raw_toot['created_at'].astimezone(sydney_timezone)
    created_at = sydney_datetime.strftime("%Y-%m-%d %H:%M:%S %Z")

    # Extract Text Content and Analyze Sentiment
    soup = BeautifulSoup(raw_toot['content'], 'html.parser')
    for link in soup.find_all('a'):
        link.replace_with(link.get_text())
    content = soup.get_text().replace("\n", " ").strip()

    sentiment_score = sid.polarity_scores(content)["compound"]
    
    #code copy from https://huggingface.co/cardiffnlp/tweet-topic-21-multi 
    tokens = tokenizer(content, return_tensors='pt', max_length=512, truncation=True).to(device)
    output = model(**tokens)
    scores = expit(output["logits"][0].detach().cpu().numpy())
    topics = [class_mapping[i] for i, prediction in enumerate((scores >= 0.5) * 1) if prediction]

    return {
        'id': raw_toot['id'],
        'created_at': created_at,
        'content': content,
        'sentiment_score': sentiment_score,
        "topics": topics,
        'url': raw_toot['url'],
        'language': raw_toot['language']
    }

# Listener Class for Streaming Toots
class Listener(StreamListener):
    def __init__(self, upper_limit):
        super().__init__()
        self.count = 0
        self.upper_limit = upper_limit

    def on_update(self, status):
        if status["language"] == "en":
            toot_processed = toot_processing(status)
            print(toot_processed)
            self.count += 1
            if self.count % 50 == 0:
                print(f"Has harvested {self.count} toots.")
            if self.count >= self.upper_limit:
                raise StopStreamingException("Stopping streaming.")

# Custom Exception to Stop Streaming
class StopStreamingException(Exception):
    pass
    
# Main Execution
if __name__ == '__main__':
    header = {"Authorization": f"Bearer {MASTODON_ACCESS_TOKEN}"}
    r = requests.get(f"{MASTODON_BASE_URL}/api/v1/accounts/verify_credentials", headers=header)
    print(r.json())
    listener = Listener(10000)
    try:
        m.stream_public(listener)
    except StopStreamingException as e:
        print(e)





  from .autonotebook import tqdm as notebook_tqdm


{'id': '112343906767590652', 'username': 'zheyuanw', 'acct': 'zheyuanw', 'display_name': '', 'locked': False, 'bot': False, 'discoverable': None, 'group': False, 'created_at': '2024-04-27T00:00:00.000Z', 'note': '', 'url': 'https://mastodon.au/@zheyuanw', 'uri': 'https://mastodon.au/users/zheyuanw', 'avatar': 'https://mastodon.au/avatars/original/missing.png', 'avatar_static': 'https://mastodon.au/avatars/original/missing.png', 'header': 'https://mastodon.au/headers/original/missing.png', 'header_static': 'https://mastodon.au/headers/original/missing.png', 'followers_count': 0, 'following_count': 0, 'statuses_count': 0, 'last_status_at': None, 'noindex': True, 'source': {'privacy': 'public', 'sensitive': False, 'language': None, 'note': '', 'fields': [], 'follow_requests_count': 0, 'hide_collections': None, 'discoverable': None, 'indexable': False}, 'emojis': [], 'roles': [], 'fields': [], 'role': {'id': '-99', 'name': '', 'permissions': '0', 'color': '', 'highlighted': False}}
{'id': 

KeyboardInterrupt: 

In [11]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from zoneinfo import ZoneInfo
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline
import json

# Mastodon Access Token and Base URL
MASTODON_ACCESS_TOKEN = "Q8oYzpl2vDz6-SOcyf2upfwpUiKS-K9N4qTVLyo2qfA"
MASTODON_BASE_URL = 'https://mastodon.au'

# Sentiment Analysis
nltk.download('vader_lexicon', quiet=True)
sid = SentimentIntensityAnalyzer()

# Load Topic Classification Pipeline
model_path = "cardiffnlp/tweet-topic-21-multi"
topic_pipeline = pipeline("text-classification", model=model_path, tokenizer=model_path, return_all_scores=True)
class_mapping = {i: label for i, label in enumerate(topic_pipeline.model.config.id2label.values())}

# Function for Processing Toots
def toot_processing(raw_toot):
    # Convert and Reformat DateTime to Australia/Sydney Timezone
    sydney_timezone = ZoneInfo("Australia/Sydney")
    sydney_datetime = datetime.fromisoformat(raw_toot['created_at']).astimezone(sydney_timezone)
    created_at = sydney_datetime.strftime("%Y-%m-%d %H:%M:%S %Z")

    # Extract Text Content and Analyze Sentiment
    soup = BeautifulSoup(raw_toot['content'], 'html.parser')
    for link in soup.find_all('a'):
        link.replace_with(link.get_text())
    content = soup.get_text().replace("\n", " ").strip()

    sentiment_score = sid.polarity_scores(content)["compound"]

    # Perform Topic Classification
    classification_results = topic_pipeline(content, truncation=True)
    topics = [class_mapping[i] for i, score in enumerate(classification_results[0]) if score['score'] >= 0.5]

    return {
        'id': raw_toot['id'],
        'created_at': created_at,
        'content': content,
        'sentiment_score': sentiment_score,
        'topics': topics,
        'url': raw_toot['url'],
        'language': raw_toot['language']
    }

# Function to Stream Public Toots
def stream_public_toots(listener, limit):
    url = f"{MASTODON_BASE_URL}/api/v1/streaming/public"
    headers = {"Authorization": f"Bearer {MASTODON_ACCESS_TOKEN}"}
    with requests.get(url, headers=headers, stream=True) as response:
        if response.status_code != 200:
            raise Exception("Failed to connect to the streaming endpoint")
        for line in response.iter_lines():
            if line:
                try:
                    event = line.decode('utf-8')
                    if event.startswith("data: "):
                        toot = json.loads(event[6:])
                        if isinstance(toot, dict) and toot.get("language") == "en":
                            toot_processed = toot_processing(toot)
                            print(toot_processed)
                            listener.count += 1
                            if listener.count % 50 == 0:
                                print(f"Has harvested {listener.count} toots.")
                            if listener.count >= limit:
                                raise StopStreamingException("Stopping streaming.")
                except (json.JSONDecodeError, TypeError) as e:
                    continue

# Listener Class for Streaming Toots
class Listener:
    def __init__(self):
        self.count = 0

# Custom Exception to Stop Streaming
class StopStreamingException(Exception):
    pass

# Main Execution
if __name__ == '__main__':
    header = {"Authorization": f"Bearer {MASTODON_ACCESS_TOKEN}"}
    r = requests.get(f"{MASTODON_BASE_URL}/api/v1/accounts/verify_credentials", headers=header)
    print(r.json())
    listener = Listener()
    try:
        stream_public_toots(listener, 10000)
    except StopStreamingException as e:
        print(e)









{'id': '112343906767590652', 'username': 'zheyuanw', 'acct': 'zheyuanw', 'display_name': '', 'locked': False, 'bot': False, 'discoverable': None, 'group': False, 'created_at': '2024-04-27T00:00:00.000Z', 'note': '', 'url': 'https://mastodon.au/@zheyuanw', 'uri': 'https://mastodon.au/users/zheyuanw', 'avatar': 'https://mastodon.au/avatars/original/missing.png', 'avatar_static': 'https://mastodon.au/avatars/original/missing.png', 'header': 'https://mastodon.au/headers/original/missing.png', 'header_static': 'https://mastodon.au/headers/original/missing.png', 'followers_count': 0, 'following_count': 0, 'statuses_count': 0, 'last_status_at': None, 'noindex': True, 'source': {'privacy': 'public', 'sensitive': False, 'language': None, 'note': '', 'fields': [], 'follow_requests_count': 0, 'hide_collections': None, 'discoverable': None, 'indexable': False}, 'emojis': [], 'roles': [], 'fields': [], 'role': {'id': '-99', 'name': '', 'permissions': '0', 'color': '', 'highlighted': False}}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'id': '112455581821466246', 'created_at': '2024-05-17 18:52:38 AEST', 'content': "Zelensky: Russia's Kharkiv Oblast offensive advances as far as 10 km, halted by 1st defense line: https://benborges.xyz/2024/05/17/zelensky-russias-kharkiv.html", 'sentiment_score': -0.3612, 'topics': ['news_&_social_concern'], 'url': 'https://osintua.eu/@benb/112455578998973357', 'language': 'en'}
{'id': '112455581929263645', 'created_at': '2024-05-17 18:51:16 AEST', 'content': 'Gooners, what is your favourite goal against Everton in N5? 🤔', 'sentiment_score': 0.0, 'topics': ['sports'], 'url': 'https://twitter.com/Arsenal/status/1791391022729068752', 'language': 'en'}
{'id': '112455582074629169', 'created_at': '2024-05-17 18:52:08 AEST', 'content': 'Our final @premierleague@sportsbots.xyz fixture of the season awaits 🔜⌛️ #SHUTOT', 'sentiment_score': 0.0772, 'topics': ['sports'], 'url': 'https://twitter.com/SpursOfficial/status/1791391237422915989', 'language': 'en'}
{'id': '112455582764903161', 'created

KeyboardInterrupt: 

In [21]:
#!pip install ijson

In [22]:
#!pip install geopy 

In [3]:
!pip install sseclient

Collecting sseclient
  Downloading sseclient-0.0.27.tar.gz (7.5 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: sseclient
  Building wheel for sseclient (pyproject.toml): started
  Building wheel for sseclient (pyproject.toml): finished with status 'done'
  Created wheel for sseclient: filename=sseclient-0.0.27-py3-none-any.whl size=5587 sha256=ef49207f0071bd97f844da26d3f9ca559aca3d87f86f3522a90b0ce12ad73e40
  Stored in directory: c:\users\zheyu\appdata\local\pip\cache\wheels\31\92\90\140c622b64ef3858608d5bbceb950fc169553911f32c2eca19
Successfully built sseclien