In [None]:
!pip install schedule

In [None]:
# import packages
import json
import os
import time
from datetime import datetime, timezone

import pandas as pd
import requests
import schedule

In [None]:
class StockTwitAPIScrapper:
    """Interactive scraper for the StockTwits symbol-stream endpoint.

    Creating an instance prints a short introduction and then prompts (via
    ``input()``) for an optional max message id and a company symbol.
    ``scrap()`` then pages through the stream, each request asking for
    messages with ids below the last one received, and accumulates the
    returned messages in ``self.df``.

    Attributes:
        url: URL template; ``{}`` is replaced by the symbol.
        maxId: max id entered by the user ('' when skipped, None before input).
        symbol: company symbol entered by the user.
        lastRun: id of the last message stored by the most recent ``scrap()``
            call, or None if nothing has been scraped yet.
        lastStatus: HTTP status code of the most recent scrape request.
        df: DataFrame holding the scraped messages.
        req_count: number of scrape requests issued so far.
    """

    # Columns every result frame starts with; any additional keys present in
    # the returned messages are appended after these.
    COLUMNS = ['id', 'body', 'created_at', 'user', 'source', 'symbols', 'mentioned_users', 'entities']

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.intro()
        self.url = "https://api.stocktwits.com/api/2/streams/symbol/{}.json?"
        self.maxId = None
        self.symbol = None
        self.lastRun = None
        self.lastStatus = None
        # Initialise all state before the interactive prompts so the instance
        # is never left half-built if input is interrupted.
        self.req_count = 0
        self.df = self._empty_frame()
        self.getInput()

    def _empty_frame(self):
        """Return an empty DataFrame with the standard columns."""
        return pd.DataFrame(columns=self.COLUMNS)

    def _append_messages(self, messages):
        """Add a list of message dicts to ``self.df`` in a single operation."""
        new_rows = pd.DataFrame(messages)
        if self.df.empty:
            # Keep the existing column order first, then any extra keys found
            # in the messages (same layout a concat would give, without
            # concatenating an empty frame).
            extra = [col for col in new_rows.columns if col not in self.df.columns]
            self.df = new_rows.reindex(columns=list(self.df.columns) + extra)
        else:
            self.df = pd.concat([self.df, new_rows], ignore_index=True)

    def intro(self):
        """Print a description of the scraper and a prompt for its settings."""
        message = 'for this scapper it scrap and output dataframe with 8 feilds: id, text body, created time, users, source, symbol, mentioned, entities, and currently only support input for 1 company symbol'
        instruction = 'pls input some constrains for this scrapper'
        print(message)
        print(instruction)

    def getInput(self):
        """Prompt for the max id and the symbol, re-asking until both are valid.

        An empty max id is accepted (no upper bound); otherwise it must be
        exactly nine digits. The symbol is accepted once a request for its
        stream returns HTTP 200.
        """
        self.maxId = input(
            'by default scrapper start from current utc time backwards, press enter, else pls enter the max id for the data you want to scrap:')
        while len(self.maxId) != 9 or not self.maxId.isdigit():
            if self.maxId == '':
                break
            self.maxId = input('your max id is not valid, pls re-enter:')
        self.symbol = input('pls enter the company symbol that you want to scrap:')
        while requests.get(self.url.format(self.symbol), timeout=self.REQUEST_TIMEOUT).status_code != 200:
            self.symbol = input(
                'seems there is no such company, pls re-enter the company symbol that you want to scrap:')

    def scrap(self, continue_last_run=False, silent=False, query_times=500, pause=5):
        """Run up to ``query_times`` requests and collect the messages in ``self.df``.

        If a previous call stored ``lastRun``, scraping always resumes below
        that id and ``self.df`` is kept. Otherwise ``self.df`` is reset and
        scraping starts from ``maxId`` (or from the newest messages when no
        max id was given).

        The loop stops early when a response is not HTTP 200 or contains no
        messages. Messages collected before an early stop or an exception are
        still stored and ``lastRun`` is advanced accordingly.

        Args:
            continue_last_run: kept for backward compatibility. Resuming is
                decided by whether ``lastRun`` is set; passing True without a
                previous run simply starts fresh.
            silent: when True, suppress the per-query progress line.
            query_times: maximum number of requests for this call.
            pause: seconds to sleep after each successful request.
        """
        if self.symbol is None:
            print('you did not set the symbol, pls re-initialize a instance')
            return

        base_url = self.url.format(self.symbol)

        if self.lastRun is not None:
            next_url = base_url + 'max={}'.format(int(self.lastRun) - 1)
        else:
            next_url = base_url
            if self.maxId not in (None, ''):
                next_url += 'max={}'.format(int(self.maxId) - 1)
            self.df = self._empty_frame()

        collected = []
        last_id = None
        done = 0

        try:
            for attempt in range(query_times):
                response = requests.get(next_url, timeout=self.REQUEST_TIMEOUT)
                self.req_count += 1
                done += 1
                self.lastStatus = response.status_code

                if response.status_code != 200:
                    print('request failed with status {}, stopping early'.format(response.status_code))
                    break

                messages = json.loads(response.content).get('messages') or []
                if not messages:
                    print('no more messages returned, stopping early')
                    break

                collected.extend(messages)
                last_id = messages[-1]['id']
                next_url = base_url + 'max={}'.format(int(last_id) - 1)

                print('num of rows for the current df is ', len(self.df.index) + len(collected))
                if not silent:
                    print('run query {} time'.format(attempt + 1))
                time.sleep(pause)
        finally:
            # Flush once, even if a request raised, so partial progress and
            # the resume position are not lost.
            if collected:
                self._append_messages(collected)
                self.lastRun = last_id

        print('finished, {} queries in total this time'.format(done))

In [None]:
# Instantiating the scraper is interactive: __init__ prints the intro and then
# prompts (via input()) for an optional max id and the company symbol.
app = StockTwitAPIScrapper()

In [None]:
def scheduled_scrap(output_dir=r'C:\Users\Administrator\Desktop\output'):
    """Run one scrape batch on the global ``app`` and write it to a CSV file.

    The file is named ``<symbol>_<UTC start time>.csv`` inside ``output_dir``,
    which is created if it does not exist. Nothing is written when the batch
    produced no rows. ``app.df`` is reset to an empty frame afterwards so the
    next batch starts with no rows.

    Args:
        output_dir: directory that receives the CSV files. Change the default
            if the output location differs on your machine.
    """
    start_time = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    file_loc = os.path.join(output_dir, f'{app.symbol}_{start_time}.csv')
    app.scrap(silent=True)

    wrote = not app.df.empty
    if wrote:
        os.makedirs(output_dir, exist_ok=True)
        app.df.to_csv(file_loc)

    # NOTE(review): fixed pause kept from the original; presumably extra
    # throttling between batches — confirm whether it is still needed.
    time.sleep(10)

    if wrote:
        print('output to ', file_loc)
    else:
        print('nothing scraped, skipped writing ', file_loc)

    app.df = pd.DataFrame(
        columns=['id', 'body', 'created_at', 'user', 'source', 'symbols', 'mentioned_users', 'entities'])

In [None]:
# register the job to run every 5 seconds.
# it's the time interval between each execution, excluding the actual execution time
schedule.every(5).seconds.do(scheduled_scrap)

In [None]:
# List the registered jobs. There should be exactly one; if there are more
# (e.g. the scheduling cell was run several times), cancel all jobs with
# schedule.clear() and register the job again.
schedule.get_jobs()

In [None]:
# check current request count (incremented once per query inside scrap())
app.req_count

In [None]:
# Run pending jobs forever, polling the scheduler every 5 seconds.
# There is no exit condition: interrupt the kernel to stop.
while True:
    schedule.run_pending()
    time.sleep(5)

In [None]:
# app.df.head(20)

In [None]:
# cancel all jobs
# schedule.clear()

In [None]:
# clear all variables
# globals().clear()