In [20]:
#base imports
import os
import sys
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)
poly_dir = os.path.abspath(os.path.join(os.getcwd(), 'data_poly'))
sys.path.append(poly_dir)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import itertools

# package imports
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
from tqdm import tqdm
import pyarrow.feather as feather
import pickle
from firebase_helper import *
import math
import asyncio
from pandas.io.json import json_normalize
import hashlib

# local imports
import data_poly.poly_getdata as poly_getdata
import data_poly.poly_url as poly_url
import data_poly.poly_helper as poly_helper


# Suppress the UserWarning with a specific message
warnings.filterwarnings(
    "ignore",
    message="DataFrame columns are not unique, some columns will be omitted.",
    category=UserWarning
)

In [2]:
from dotenv import load_dotenv
# Load environment variables from a fixed .ENV file.
# NOTE(review): this is an absolute, machine-specific path; it will not resolve elsewhere.
load_dotenv(dotenv_path="/home/peterzerg/repos/PeterZergQuant/.ENV")
# API key handed to poly_url.StockUrlFactory in later cells; None when the variable is unset.
api_key = os.environ.get("POLYGON_APIKEY_MASTER")

In [3]:
# Instantiate the project's Firebase helper (name provided by the `firebase_helper` star import)
# and keep the database client it returns; most cells below read and write through `db`.
fb = firebase_helper()
db = fb.get_db()

In [42]:
#upload functions

def cloud_upload_single(db, collection_name, doc_name, data_dict):
    """
    write a single document: `data_dict` is set on `collection_name/doc_name`,
    replacing whatever that document held before
    """
    target_collection = db.collection(collection_name)
    target_collection.document(doc_name).set(data_dict)

def chunks(data, size):
    """
    yield consecutive sub-dicts of `data`, each holding at most `size` keys,
    in the dict's own key order
    """
    ordered_keys = list(data.keys())
    for offset in range(0, len(ordered_keys), size):
        window = ordered_keys[offset:offset + size]
        yield dict((key, data[key]) for key in window)
        
def cloud_upload_seq(db, collection_name, data_dict, chunk_size=500):
    """
    upload data dict to collection sequentially: the dict is cut into chunks of
    `chunk_size` keys and every chunk is committed as one write batch,
    with str(key) used as the document id
    """
    pending_chunks = list(chunks(data_dict, chunk_size))
    for chunk in tqdm(pending_chunks):
        # one write batch per chunk
        writer = db.batch()
        for doc_id in chunk:
            target = db.collection(collection_name).document(str(doc_id))
            writer.set(target, chunk[doc_id])
        writer.commit()
        
async def write_document(doc_ref, data):
    """
    a coroutine for writing a document to Firestore

    Works with either client flavour:
    - a synchronous document reference, whose ``set`` blocks and returns a plain
      result; the call is pushed to the default executor so the event loop is not
      blocked (awaiting that plain result directly would raise TypeError);
    - an asynchronous document reference, whose ``set`` returns an awaitable that
      is then awaited on the loop.

    doc_ref: object exposing ``set(data)``
    data:    document payload passed through to ``set`` unchanged
    """
    import inspect  # local import: only this coroutine needs it

    loop = asyncio.get_running_loop()
    outcome = await loop.run_in_executor(None, doc_ref.set, data)
    if inspect.isawaitable(outcome):
        await outcome

async def write_batch(batch, collection_name, task_limit=12):
    """
    write a batch of documents to Firestore asynchronously

    batch:           dict of {key: document data}; str(key) becomes the document id
    collection_name: target collection, resolved on the notebook-level `db` client
    task_limit:      maximum number of writes allowed to be in flight at once

    The semaphore is taken by each individual write. Holding it once around the
    whole gather() (as before) started every write at the same time, so
    `task_limit` had no effect.
    """
    semaphore = asyncio.Semaphore(task_limit)

    async def _write_limited(key, value):
        # Resolve the target document, then wait for a free slot before writing.
        doc_ref = db.collection(collection_name).document(str(key))
        async with semaphore:
            await write_document(doc_ref, value)

    await asyncio.gather(*(_write_limited(key, value) for key, value in batch.items()))

async def cloud_upload(db, data_dict, collection_name, chunk_size=500, task_limit=12):
    """
    upload data dict to collection

    db:              kept for interface compatibility; write_batch() resolves the
                     collection on the notebook-level `db` client
    data_dict:       {key: document data}
    collection_name: target collection
    chunk_size:      number of documents handed to write_batch() per round
    task_limit:      concurrency limit forwarded to write_batch()

    Each chunk is awaited directly. Calling asyncio.run() from inside a coroutine
    raises RuntimeError because an event loop is already running.
    """
    sub_dicts = list(chunks(data_dict, chunk_size))
    for sub_dict in tqdm(sub_dicts):
        # chunks() already yields a fresh dict per chunk, so it is passed as-is
        await write_batch(sub_dict, collection_name, task_limit=task_limit)
        
def delete_collection(db, collection_name):
    """
    delete every document currently streamed from `collection_name`

    Firestore has no separate "delete collection" call; a collection stops
    existing once it holds no documents. The previous trailing
    `document().delete()` only generated a new auto-ID reference and deleted that
    non-existent document, so it has been dropped.
    """
    collection_ref = db.collection(collection_name)
    # Delete each document in the collection
    for doc in collection_ref.stream():
        doc.reference.delete()

In [None]:
#sic matching and ticker map cloud set

def cloud_upload_ticker_map(db):
    """
    upload and set ticker map (str->int) to cloud firestore 

    Reads the pickled mapping from a fixed local path and overwrites the
    `ticker_map/dict` document with it.
    """
    ticker_map_path = "/mnt/d/data/news/ticker_maping_dict.pkl"
    # NOTE(review): pickle.load executes arbitrary code from the file; only use
    # it on files this project produced itself.
    with open(ticker_map_path, "rb") as ticker_map_file:  # closes the handle even on error
        ticker_map_dict = pickle.load(ticker_map_file)
    #overwrite ticker mapping on db
    doc_ref = db.collection('ticker_map').document('dict')
    doc_ref.set(ticker_map_dict)

def memoize(function):
    """
    cache helper for speed optimization

    Wraps a one-argument function so each distinct (hashable) argument is
    computed once and then served from an in-memory dict that lives as long as
    the wrapper. functools.wraps keeps the wrapped function's name and docstring
    visible on the wrapper.
    """
    import functools  # local import: only the decorator needs it

    cache = {}

    @functools.wraps(function)
    def wrapper(input):
        if input not in cache:
            cache[input] = function(input)
        return cache[input]
    return wrapper

@memoize
def sic_match(input):
    """
    map a SIC code to one of the 10 SIC industry strings

    Only the first two characters of str(input) are looked at; a prefix with no
    industry assigned raises ValueError.
    """
    # (industry name, two-digit major groups belonging to it)
    division_spans = (
        ('agriculture', (1, 2, 7, 8, 9)),
        ('mining', range(10, 15)),
        ('construction', range(15, 18)),
        ('manufacturing', range(20, 40)),
        ('transportation', range(40, 50)),
        ('wholesale', range(50, 52)),
        ('retail', range(52, 60)),
        ('finance', range(60, 68)),
        ('services', range(70, 90)),
        ('public_administration', range(91, 100)),
    )
    sic_codes = {}
    for industry, major_groups in division_spans:
        for major_group in major_groups:
            sic_codes[f"{major_group:02d}"] = industry
    prefix = str(input)[0:2]
    try:
        return sic_codes[prefix]
    except KeyError:
        raise ValueError("Invalid input. Please enter a two-character string matching a valid SIC code.")

In [34]:
#generate ticker info dict

async def get_all_ticker_info(db):
    """
    build one ticker-info URL per key of the `tickers/ticker_hash` document and
    return whatever poly_helper.get_data_from_urls yields for that URL dict
    """
    known_tickers = db.collection('tickers').document('ticker_hash').get().to_dict()
    factory = poly_url.StockUrlFactory(api_key)
    urls_dict = {}
    for symbol in known_tickers.keys():
        urls_dict[symbol] = factory.ReferenceData.ticker_info(factory, symbol)
    return await poly_helper.get_data_from_urls(urls_dict)

async def ticker_info_ready():
    """
    fetch all ticker info through the notebook-level `db` and reduce every
    per-ticker frame to its first record: {ticker: first row as dict}
    """
    frames_by_ticker = await get_all_ticker_info(db)
    upsert_dict = {}
    for ticker, frame in frames_by_ticker.items():
        records = frame.to_dict('records')
        upsert_dict[ticker] = records[0]
    return upsert_dict

In [None]:
def cloud_upload_ticker_info(db, upsert_dict):
    """
    upload `upsert_dict` to the "ticker_info" collection via cloud_upload_seq
    (one document per key, default chunk size)
    """
    cloud_upload_seq(db, "ticker_info", upsert_dict)

In [None]:
def info_dict_to_sic_map(df_info_dict):
    """
    concatenate the per-ticker info frames and map str(ticker) to its industry:
    sic_match() of the first two characters when sic_code is a string, else None
    """
    combined = pd.concat(df_info_dict)
    sic_map_dict = {}
    for ticker, sic_code in zip(combined.ticker, combined.sic_code):
        if isinstance(sic_code, str):
            industry = sic_match(sic_code[:2])
        else:
            industry = None
        sic_map_dict[str(ticker)] = industry
    return sic_map_dict

upload news data to db

In [None]:
# upload news data to db

def convert_arrays_to_lists(value):
    """
    Return list(value) for list / numpy array inputs; hand back anything else untouched
    """
    is_sequence = isinstance(value, list) or isinstance(value, np.ndarray)
    return list(value) if is_sequence else value

# Fixed-order names of the 10 SIC industries; used as the boolean column names.
INDUSTRY_COLS = ['agriculture', 'mining', 'construction', 'manufacturing', 'transportation',
                 'wholesale', 'retail', 'finance', 'services', 'public_administration']

def to_boolean_list(industries, industry_cols=None):
    """
    convert the industries list to a boolean list

    industries:    collection of industry names (may contain None)
    industry_cols: names to test, in output order; defaults to INDUSTRY_COLS.
                   It used to be read from a global that only existed as a local
                   variable of process_news_data(), which raised NameError.
    returns:       [name in industries for each name in industry_cols]
    """
    if industry_cols is None:
        industry_cols = INDUSTRY_COLS
    return [col in industries for col in industry_cols]

def process_news_data(db):
    """
    load the local news feather file and enrich it with industry information

    Adds an "industries" column (industry of every ticker on the row, None when
    the ticker is not in the `ticker_sic_map/dict` document) and one boolean
    column per entry of INDUSTRY_COLS, then converts list/array cells to lists.

    raises ValueError when the `ticker_sic_map/dict` document does not exist.
    Before, that case stored the return value of print() (None) as the map and
    failed later with an AttributeError.
    """
    news_data = pd.read_feather("/mnt/d/data/news/local_us_equity_news") # Load the news data
    doc_ref = db.collection('ticker_sic_map').document('dict') # Get reference to the document
    doc = doc_ref.get() # Retrieve the document data
    if not doc.exists:
        raise ValueError(f"No such document: {doc_ref.id}")
    ticker_sic_map = doc.to_dict()
    #add 10 industry cols
    news_data["industries"] = news_data.tickers.apply(lambda tickers: [ticker_sic_map.get(ticker, None) for ticker in tickers])
    # A fixed list keeps column order stable across runs (set iteration order is not)
    industry_cols = list(INDUSTRY_COLS)
    # Build the boolean rows with the column order passed explicitly
    boolean_rows = [to_boolean_list(industries, industry_cols) for industries in tqdm(news_data['industries'])]
    boolean_df = pd.DataFrame(boolean_rows, columns=industry_cols)
    # process the news_data df; both frames get a fresh 0..n-1 index so concat aligns row by row
    news_data = news_data.reset_index(drop=True)
    boolean_df = boolean_df.reset_index(drop=True)
    news_data = pd.concat([news_data, boolean_df], axis=1)
    news_data = news_data.applymap(convert_arrays_to_lists)
    return news_data

def new_data_to_dict(news_data):
    """
    turn the news frame into {id: {column: value}} by indexing on 'id',
    transposing, and exporting the columns as dicts
    """
    indexed_by_id = news_data.set_index('id')
    transposed = indexed_by_id.T
    return transposed.to_dict()

def clean_news(news_data):
    """
    flatten the publisher column and drop duplicate rows

    - expands the dicts in "publisher" into their own columns (pd.json_normalize,
      the supported entry point; the `pandas.io.json` import path is deprecated)
    - turns the list-like 'tickers', 'keywords' and 'industries' cells into tuples
      so the rows are hashable for drop_duplicates (None cells stay None)
    - returns the de-duplicated frame; the caller's frame is not modified

    NOTE(review): the column-wise concat aligns on index labels, so this assumes
    `news_data` carries a default 0..n-1 index — confirm against the caller.
    """
    flattened_info = pd.json_normalize(news_data["publisher"])
    news_data = pd.concat([news_data.drop('publisher', axis=1), flattened_info], axis=1)
    for col in ('tickers', 'keywords', 'industries'):
        news_data[col] = news_data[col].apply(lambda lst: tuple(lst) if lst is not None else None)
    return news_data.drop_duplicates()

def gen_10_industries_df(news_data):
    """
    for each of the 10 industry boolean columns, keep the 100 newest flagged rows
    (by 'published_utc') and store them as {id: {column: value}} under the
    industry name; prints the row count per industry
    """
    industry_cols = list(set(['agriculture', 'mining', 'construction', 'manufacturing', 'transportation',
                     'wholesale', 'retail', 'finance', 'services', 'public_administration']))
    industry_data = {}
    for col in industry_cols:
        # rows whose flag for this industry is set, newest first, capped at 100
        flagged = news_data.loc[news_data[col] == True]
        industry_rows = flagged.sort_values(by='published_utc', ascending=False).iloc[:100]
        industry_data[col] = industry_rows.set_index('id').T.to_dict()
        print(f"{col}: {len(industry_rows)} rows added")
    return industry_data

In [None]:
# Build the enriched news frame (reads the local feather file and the ticker_sic_map document via `db`).
news_data = process_news_data(db)

In [None]:
#map ticker to industry (OK)
#get tickers col as list from master news dataframe (rows) (OK)
#map tickers in list 10 industries boolean col (OK)
#create 10 industry dataframes, 100 rows each, order by time (OK)
#write a Python program that runs every x mins to fetch new news, append it to the master dataframe, and also append it to the 10 industry dataframes.
#user requests news -> look up prefs in firebase by user hashid -> merge the industry dataframes -> push 100 rows to the front end, but limit the display to 10 rows at a time

In [None]:
# Quick look at the columns, dtypes and non-null counts of the enriched frame.
news_data.info()

In [None]:
# For every news row, all unordered pairs of its tickers (as a tuple of 2-tuples);
# rows with fewer than two tickers give an empty tuple.
target_data = news_data.tickers.apply(lambda tickers: tuple(itertools.combinations(tickers, 2)))

In [None]:
# Display the pair Series (notebook rich output).
target_data

In [None]:
import pandas as pd
import itertools
import concurrent.futures

def df_modify(tuple_pair, df, ref_map):
    """
    add 1 to the cell of `df` addressed by the pair: row label ref_map[first],
    column label ref_map[second]; raises KeyError when either ticker is not in ref_map
    """
    # read-modify-write on a single cell; mutates `df` in place
    df.loc[ref_map[tuple_pair[0]], ref_map[tuple_pair[1]]] += 1

def process_row(row, df, ref_map):
    """
    apply df_modify to every ticker pair in `row` (an iterable of 2-item pairs)
    """
    for tuple_pair in row:
        df_modify(tuple_pair, df, ref_map)

def process_data(target_data, df, ref_map, max_workers=12):
    """
    count ticker pairs into the co-occurrence matrix `df` (mutated in place)

    target_data: iterable of rows, each an iterable of (ticker, ticker) pairs
    df:          DataFrame whose row/column labels are the values of `ref_map`
    ref_map:     {ticker: label in df}
    max_workers: kept for interface compatibility; no longer used

    The previous thread-pool version did an unsynchronised read-modify-write on
    the shared frame (`df.loc[...] += 1`), so concurrent updates could be lost,
    and it never inspected the futures, so a KeyError for an unmapped ticker
    silently dropped the rest of that row. Pairs are now tallied in a single
    pass and written once per distinct cell; pairs with a ticker missing from
    `ref_map` are skipped and reported through a warning.
    """
    import warnings
    from collections import Counter

    pair_counts = Counter()
    skipped = 0
    for row in target_data:
        for first, second in row:
            try:
                pair_counts[(ref_map[first], ref_map[second])] += 1
            except KeyError:
                skipped += 1
    # one write per distinct (row, column) cell
    for (row_label, col_label), count in pair_counts.items():
        df.loc[row_label, col_label] += count
    if skipped:
        warnings.warn(f"process_data skipped {skipped} ticker pair(s) with no entry in ref_map")

In [None]:
# Ticker map stored at ticker_map/dict (uploaded by cloud_upload_ticker_map as str->int).
ref_map = db.collection('ticker_map').document('dict').get().to_dict()
n = len(ref_map)
index_list = list(range(n))
# Square, zero-filled matrix labelled 0..n-1 on both axes; process_data() increments its cells.
# NOTE(review): assumes the map's values are exactly the integers 0..n-1 — confirm.
df = pd.DataFrame(0, index=index_list, columns=index_list)

In [None]:
# Fill the pair-count matrix `df` in place from the per-row ticker pairs.
process_data(target_data, df, ref_map)

In [16]:
# Build the reference "tickers" URL with the project's URL factory and fetch it
# through poly_helper; the result is kept in the notebook-level `tickers`.
url_factory = poly_url.StockUrlFactory(api_key)
url = url_factory.ReferenceData.tickers(url_factory)
tickers =  poly_helper.get_data_from_single_url(url)

In [18]:
def ticker_map_dict_gen():
    """
    fetch the reference ticker listing and return {ticker: hashid}, where hashid
    is the md5 hex digest of ticker + cik + composite_figi (falsy cik /
    composite_figi contribute an empty string)
    """
    def create_hashid(row):
        # NOTE(review): a NaN cik/composite_figi is truthy, so it is hashed as the
        # text "nan" rather than as an empty string — confirm this is intended.
        cik = row['cik'] or ''
        composite_figi = row['composite_figi'] or ''
        hash_str = f"{row['ticker']}{cik}{composite_figi}"
        digest = hashlib.md5(hash_str.encode())
        return digest.hexdigest()

    # ticker listing for the us equity market
    factory = poly_url.StockUrlFactory(api_key)
    listing_url = factory.ReferenceData.tickers(factory)
    tickers = poly_helper.get_data_from_single_url(listing_url)
    # one hashid per row
    tickers['hashid'] = tickers.apply(create_hashid, axis=1)
    by_ticker = tickers[['ticker', 'hashid']].set_index('ticker')
    return by_ticker.to_dict()['hashid']

In [22]:
# {ticker: hashid} built from the notebook-level `tickers` frame.
# NOTE(review): the cell that fetches `tickers` does not add a 'hashid' column (only
# ticker_map_dict_gen() adds one, to its own local frame), so on a fresh kernel this
# line looks like it depends on state from an earlier session — confirm.
ticker_map_dict = tickers[['ticker', 'hashid']].set_index('ticker').to_dict()['hashid']

In [31]:
# Overwrite tickers/ticker_hash with the {ticker: hashid} map (read back by get_all_ticker_info).
cloud_upload_single(db, "tickers", "ticker_hash", ticker_map_dict)

In [45]:
def delete_collections(db, collection_name, batch_size=500):
    """
    delete every document of `collection_name`, `batch_size` documents per round

    db:              database client exposing collection()
    collection_name: collection to empty
    batch_size:      how many documents to stream and delete per round
    returns:         True once a round deletes fewer than `batch_size` documents

    Uses a loop instead of recursion, so large collections cannot hit the
    recursion limit. The former `collection_ref.delete()` call is gone:
    collection references expose no delete(), and a collection stops existing
    once its last document is removed.
    """
    collection_ref = db.collection(collection_name)
    while True:
        deleted = 0
        for doc in collection_ref.limit(batch_size).stream():
            doc.reference.delete()
            deleted += 1
        # a short round means nothing is left to fetch
        if deleted < batch_size:
            break

    print(f'Deleted collection {collection_name}')
    return True

# Destructive: removes every document of the "ticker_info" collection each time this cell runs.
delete_collections(db, "ticker_info")

KeyboardInterrupt: 

In [None]:
import datetime  # not imported anywhere else in this notebook's import cell

# Stamp #update_time/backend_codebase with the current local time.
# Document data must be a mapping, so the timestamp is stored under a field
# rather than passed as a bare datetime.
# NOTE(review): the field name "update_time" is a guess — align it with whatever reads this document.
cloud_upload_single(db, "#update_time", "backend_codebase", {"update_time": datetime.datetime.now()})

In [14]:
# Build "cik…compfigi…" identifier strings per ticker from the ticker_info documents.
# `ticker_info_ref` and `ticker_us_equity_map` must already exist in the kernel.
for doc in ticker_info_ref.stream():
    ticker = doc.id
    fields = doc.to_dict()

    # Documents flagged as test data are left out
    is_test = fields.get('is_test')
    if is_test and is_test == True:
        continue

    value_str = ""
    if 'cik' in fields:
        cik = doc.get('cik')
        value_str = value_str + "cik" + str(cik)
    if 'composite_figi' in fields:
        composite_figi = doc.get('composite_figi')
        value_str = value_str + "compfigi" + str(composite_figi)
    # Neither identifier present: report the ticker and move on
    if not value_str:
        print(ticker)
        continue
    ticker_us_equity_map[ticker] = value_str

AAQC
AAQC.U
AAQC.WS
ABGI
ABMD
ACDI
ACDI.U
ACDI.WS
ACEV
ACEVU
ACEVW
ACII
ACII.U
ACQR
ACQRU
ACQRW
ACWF
ADRA.U
ADRA.WS
AEAC
AEACU
AEACW
AEHA
AEHAU
AEHAW
AEPPZ
AERC
AERI
AESE
AFAC
AFACU
AFACW
AFAQ
AFAQU
AFAQW
AGAC.WS
AGBAR
AGBAU
AGCB
AGGR
AGGRU
AGGRW
AGTC
AIKI
AKIC
AKICU
AKICW
AKUS
ALBO
AMCI
AMCIU
AMCIW
AMOV
AMPI
AMPI.U
AMPI.WS
ANAC
ANAC.U
ANAC.WS
APN
APN.U
APN.WS
APXH
ARCK
ARCKU
ARCKW
ARGU
ARGUU
ARGUW
ASAX
ASAXU
ASAXW
ASZ
ASZ.U
ATA
ATA.U
ATA.WS
ATAQ.WS
ATAX
AUBAP
AUS
AUS.U
AVCO
AVCT
AVCTW
AVEO
AVYA
AXH
AXH.U
AXH.WS
AYLA
BACA.U
BACA.WS
BAMH
BAMI
BAMR
BCOR
BGSX
BGSX.U
BGSX.WS
BIOT
BIOTU
BIOTW
BLNKW
BLTS
BLTSU
BLTSW
BNFT
BOB
BPYPM
BRMK.WS
BSBE
BSCM
BSFFF
BSJM
BSKY
BSKYU
BSKYW
BSMM
BTCR
BTN
BTRS
CAJ
CCNC
CEA
CECE
CENQ
CENQU
CENQW
CEY
CGABL
CHAA.WS
CHAD
CHG
CHSCL
CHSCM
CHSCN
CHSCO
CHSCP
CINC
CIXX
CLAA
CLAA.U
CLAA.WS
CLAS
CLAS.U
CLAS.WS
CLIM
CLIM.U
CLR
CLRM
CLRMU
CLRMW
CLVS
CMCTP
CNCE
CND
CND.U
CND.WS
COLI
COLIU
COLIW
CORZ
CORZW
COUP
COVA
COVAU
COVAW
COWN
COWNL
CPAQ
CPAQU
CPAQW
CPAR
CPARU
CPARW

user data feed

In [None]:
def get_ticker_vector(user_pref, ticker_map_dict, ticker_sic_map):
    """
    score every ticker for one user

    Returns a list with one slot per entry of `ticker_map_dict`; the slot at
    ticker_map_dict[ticker] gets +1 when the ticker's industry is switched on in
    `user_pref`, -1 when it is switched off, and stays 0 when the industry is
    None or not mentioned in `user_pref`.
    """
    ticker_vector = [0 for _ in ticker_map_dict]
    for ticker, industry in ticker_sic_map.items():
        if industry is None or industry not in user_pref:
            continue
        step = 1 if user_pref[industry] else -1
        ticker_vector[ticker_map_dict[ticker]] += step
    return ticker_vector

In [None]:
# Get all documents from the user_preferences collection.
# The name now matches the collection the later cells read and that
# correct_user_pref_key() rewrites; 'user_preference' is used nowhere else.
user_pref_docs = db.collection('user_preferences').stream()

# Map each user id to that user's ticker vector. get_ticker_vector() applies the
# same +1 / -1 rule as the former nested loops, with a dict lookup in place of a
# scan over every preference for every ticker.
user_ticker_pref = {
    doc.id: get_ticker_vector(doc.to_dict(), ticker_map_dict, ticker_sic_map)
    for doc in user_pref_docs
}

In [None]:
# Get all documents from the user_preferences collection.
# The name now matches the collection the later cells read and that
# correct_user_pref_key() rewrites; 'user_preference' is used nowhere else.
user_pref_docs = db.collection('user_preferences').stream()

# Map each user id to that user's ticker vector. get_ticker_vector() applies the
# same +1 / -1 rule as the former nested loops, with a dict lookup in place of a
# scan over every preference for every ticker.
user_ticker_pref = {
    doc.id: get_ticker_vector(doc.to_dict(), ticker_map_dict, ticker_sic_map)
    for doc in user_pref_docs
}

In [None]:
# Get all documents from the user_preferences collection
user_pref_docs = db.collection('user_preferences').stream()

# Initialize an empty dictionary to store the ticker vector of each user
user_ticker_pref = {}

# Iterate through each user preference document
for doc in user_pref_docs:
    # Get the user preference dictionary from the document data
    user_pref = doc.to_dict()

    # One slot per ticker known to ticker_map_dict
    ticker_vector = [0] * len(ticker_map_dict)

    for ticker, industry in ticker_sic_map.items():
        # The literal "nan" key is not a real ticker
        if ticker == "nan":
            continue
        # Report tickers that have no slot and skip them; previously the ticker
        # was printed and the very next line raised KeyError.
        if ticker not in ticker_map_dict:
            print(ticker)
            continue
        # +1 when the ticker's industry is switched on for this user, -1 when switched off
        if industry in user_pref:
            ticker_vector[ticker_map_dict[ticker]] += 1 if user_pref[industry] else -1

    # Add this user's ticker vector to the user_ticker_pref dictionary
    user_ticker_pref[doc.id] = ticker_vector

In [None]:
# Persist every user's ticker vector as user_pref_ticker/<user_id>
for user_id in user_ticker_pref:
    ticker_vector = user_ticker_pref[user_id]
    doc_ref = db.collection('user_pref_ticker').document(user_id)
    # The vector is stored under the single field 'ticker_vector'
    doc_ref.set({'ticker_vector': ticker_vector})

In [None]:
# Translate each row's tickers through ticker_map_dict; tickers missing from the map become None.
# Adds the column to `news_data` in place.
news_data["tickers_index"] = news_data.tickers.apply(lambda tickers: [ticker_map_dict.get(ticker, None) for ticker in tickers])

In [None]:
# Ticker vector of one hard-coded user id; raises KeyError if that id is not in user_ticker_pref.
user = user_ticker_pref["2qbeT9d3aCfrSk0PLbsJO10mrV73"]

In [None]:
# Per news row: nan-aware mean of the selected user's scores over the row's mapped tickers
user_pref_news = [
    np.nanmean([user[i] for i in x if i is not None])
    for x in news_data['tickers_index']
]

In [None]:
user_pref_docs = db.collection('user_preferences').stream()
industry_news_ref = db.collection('industry_news').document("dict")
industry_news = industry_news_ref.get().to_dict()
print(industry_news.keys())
for doc in user_pref_docs:
    # Get the user preference dictionary from the document data
    user_pref = doc.to_dict()

    # Extract the industry names for which the value is True
    industry_names = [key for key, value in user_pref.items() if value]
    print(industry_names)

    # Create an empty dictionary to store the merged documents
    merged_docs = {}

    # Merge the news of every preferred industry; a preference key with no
    # matching entry in industry_news contributes nothing instead of raising KeyError
    for name in industry_names:
        merged_docs.update(industry_news.get(name, {}))

    # Sort the merged documents by published_utc in descending order.
    # gen_10_industries_df() sorts its rows on 'published_utc'; the former key
    # 'publish_utc' does not match that field name.
    sorted_docs = dict(sorted(merged_docs.items(), key=lambda x: x[1]['published_utc'], reverse=True))

    # Keep the 5 most recent rows from the sorted documents
    most_recent_docs = dict(list(sorted_docs.items())[:5])
    
    print(most_recent_docs)


In [None]:
def rename_keys(user_preferences):
    """
    return a copy of `user_preferences` with the misspelled industry keys corrected

    Only the two known typos are rewritten; every other key, including the
    already-correct 'agriculture' / 'manufacturing', is kept as is. The earlier
    lookup `corrected_keys[key]` raised KeyError for any key outside its table,
    so running the fix a second time on corrected data failed.

    If a dict holds both a misspelled key and its corrected form, the entry that
    comes later in iteration order wins.
    """
    corrected_keys = {
        'algriculture': 'agriculture',
        'manufacuring': 'manufacturing',
    }

    return {corrected_keys.get(key, key): value for key, value in user_preferences.items()}

def correct_user_pref_key():
    """
    one-time repair of the user_preferences collection: every document is read,
    its keys are passed through rename_keys(), and the result overwrites the
    document (uses the notebook-level `db`); not meant to be called again
    """
    user_preferences_ref = db.collection('user_preferences')
    for snapshot in user_preferences_ref.stream():
        corrected_preferences = rename_keys(snapshot.to_dict())
        # write the corrected mapping back under the same document id
        user_preferences_ref.document(snapshot.id).set(corrected_preferences)