In [33]:
import requests
from mastodon import Mastodon, StreamListener
import json
import html2text
import pytz
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
from scipy.special import expit
#toot processing
def toots_processing(toots):
    toots_processed = []
    for toot in toots:
        toot_id = toot['id']

        toot_created_at = toot['created_at']
        toot_created_at_reformat = datetime2string(toot_created_at)

        toot_content_html = toot['content']
        toot_content_text = content_processing(toot_content_html)

        toot_url = toot['url']

        toot_language = toot['language']

        toot_processed = {
            'id': toot_id,
            'created_at': toot_created_at_reformat,
            'content': toot_content_text,
            'url': toot_url,
            'language': toot_language
        }
        toots_processed.append(toot_processed)
    return toots_processed


def toot_processing(raw_toot):
    toot_id = raw_toot['id']

    toot_created_at = raw_toot['created_at']
    toot_created_at_reformat = datetime2string(toot_created_at)

    toot_content_html = raw_toot['content']
    toot_content_text = content_processing(toot_content_html)

    toot_sentiment_score = toots_sentiment_analysis(toot_content_text)
    toot_topics = toot_topic_classification(toot_content_text)

    toot_url = raw_toot['url']

    toot_language = raw_toot['language']

    toot_processed = {
        'id': toot_id,
        'created_at': toot_created_at_reformat,
        'content': toot_content_text,
        'sentiment_score': toot_sentiment_score,
        "topics": toot_topics,
        'url': toot_url,
        'language': toot_language
    }
    return toot_processed


def toots_sentiment_analysis(toot_content):
    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(toot_content)
    sentiment_score = ss["compound"]
    return sentiment_score


def datetime2string(toot_datetime):
    """
    Convert datetime object to string
    :param toot_datetime:
    :return:
    """
    sydney_timezone = pytz.timezone('Australia/Sydney')
    sydney_datetime = toot_datetime.astimezone(sydney_timezone)
    toot_datetime_string = sydney_datetime.strftime("%Y-%m-%d %H:%M:%S %Z")
    return toot_datetime_string


def content_processing(toot_content):
    """
    Convert html to text
    :param toot_content:
    :return:
    """
    h = html2text.HTML2Text()
    h.ignore_links = True
    toot_content_text = h.handle(toot_content)
    toot_content_text_without_newlines = toot_content_text.replace("\n", "")
    return toot_content_text_without_newlines


def toot_topic_classification(toot):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_path = f"cardiffnlp/tweet-topic-21-multi"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

    class_mapping = model.config.id2label
    topics = []
    tokens = tokenizer(toot, return_tensors='pt', max_length=512, truncation=True).to(device)
    output = model(**tokens)
    output = {key: value.to("cpu") for key, value in output.items()}
    scores = output["logits"][0].detach().numpy()
    scores = expit(scores)
    predictions = (scores >= 0.5) * 1

    for i in range(len(predictions)):
        if predictions[i]:
            topics.append(class_mapping[i])

    return topics
#MASTODON_ACCESS_TOKEN = "OJaqE-3PYQA1qxnRq-trbUFYeHsdSjE4lR6iC4pFKDA"
    
MASTODON_ACCESS_TOKEN = "Q8oYzpl2vDz6-SOcyf2upfwpUiKS-K9N4qTVLyo2qfA"
toot_count = 0
toots = []
m = Mastodon(api_base_url=f'https://mastodon.au',access_token=MASTODON_ACCESS_TOKEN)
#m = Mastodon(api_base_url=f'https://mastodon.world',access_token=MASTODON_ACCESS_TOKEN)


class StopStreamingException(Exception):
    pass

def crawl_request():
    base_url = 'https://mastodon.au/api/v1/'
    #base_url = 'https://mastodon.world/api/v1/'
    header = {
        "Authorization": f"Bearer {MASTODON_ACCESS_TOKEN}"
    }
    r = requests.get(base_url + '/accounts/verify_credentials', headers=header)
    print(r.json())


class Listener(StreamListener):

    def __init__(self, count, upper_limit):
        super().__init__()
        self.count = count
        self.upper_limit = upper_limit

    def on_update(self, status):
        
        if self.count == 0:
            print("Start harvesting.....")
        if (status["language"]) == "en":
            toot_processed = toot_processing(status)
            print(toot_processed)
            self.count += 1
        if self.count % 50 == 0:

            print("Has harvested {} toots.".format(self.count))
        global toot_count
        toot_count += 1
        if toot_count == 10:
            exit(0)
            return json.dumps(status, indent=2, sort_keys=True, default=str)
            toots.append(status)
            raise StopStreamingException("Received 1000 toots. Stopping streaming.")


if __name__ == '__main__':
    crawl_request()

    listener = Listener(0, 10)
    m.stream_public(listener)
    try:
        m.stream_public(listener)
    except StopStreamingException as e:
        print(e)
        for toot in toots:
            print(toot['content'])
            print()




KeyboardInterrupt



In [35]:
import requests,json
url="https://gateway.api.epa.vic.gov.au/environmentMonitoring/v1/sites"
params={"environmentalSegment":"air"}
headers={
    'User-agent':'curl/8.4.0',
    'Cache-Control':'no-cache',
    'X-API-Key':'f6694fb4cb45496a816c8b630e885f92',
}
response=requests.get(url,params=params,headers=headers)
data=json.loads(response.text)

for record in sorted(data['records'], key=lambda x: x['siteName']):
    print(record['siteName'])
    print(record['geometry']['coordinates'])
    

Alphington
[-37.7784081, 145.0306]
Altona North
[-37.8441238, 144.861618]
Ararat
[-37.2828064, 142.935608]
Bacchus Marsh
[-37.674633, 144.4354]
Bairnsdale
[-37.829895, 147.620209]
Ballarat
[-37.5293274, 143.842438]
Beechworth
[-36.37299, 146.679077]
Benalla
[-36.54932, 145.997223]
Bendigo
[-36.77841, 144.300064]
Boolarra
[-38.36981, 146.2751]
Boolarra South
[-38.44398, 146.2742]
Box Hill
[-37.8287277, 145.1324]
Bright
[-36.73228, 146.9702]
Bright
[-36.7319832, 146.970612]
Brighton
[-37.9135475, 144.998]
Broadford
[-37.2078857, 145.04805]
Brooklyn
[-37.8220978, 144.8471]
Callignee
[-38.36169, 146.588211]
Camperdown
[-38.2248344, 143.140152]
Castlemaine
[-37.0730247, 144.233292]
Churchill
[-38.3043137, 146.414932]
Cobden
[-38.32716, 143.076]
Colac
[-38.3456841, 143.594269]
Dandenong
[-37.98576, 145.1987]
Daylesford
[-37.3410339, 144.130463]
Drysdale
[-38.1913528, 144.550659]
Echuca
[-36.12591, 144.754288]
Flynn
[-38.17107, 146.6933]
Flynns Creek
[-38.24396, 146.6362]
Footscray
[-37.80266

In [39]:
pip install ijson


