In [71]:
import glob
import os
import json
import pandas as pd
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.common.by import By
import time

In [66]:
def extract_nasdaq():
    """Trigger a download of the NASDAQ stock-screener CSV via headless Firefox.

    Opens the screener page, locates the download button by its CSS class
    through an XPath query, clicks it with JavaScript, waits two seconds and
    then shuts the browser down.

    Returns:
        None. The only effect is the file download started in the browser.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if the download
            button cannot be found on the page.
    """
    # Set driver options
    options = Options()
    # Save "text/csv" responses without showing the download dialog.
    options.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")
    options.add_argument("-headless")
    # NOTE(review): the next two arguments use Chrome-style syntax; confirm
    # that Firefox actually honours them.
    options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36")
    options.add_argument("--window-size=1920,1080")

    # Start firefox driver, navigate to nasdaq url, and click download button
    driver = webdriver.Firefox(options=options)
    try:
        driver.get("https://www.nasdaq.com/market-activity/stocks/screener")
        btn = driver.find_element(
            By.XPATH,
            value="//button[@class='nasdaq-screener__form-button--download ns-download-1']"
        )
        # The click is dispatched through JavaScript on the located element.
        driver.execute_script("arguments[0].click();", btn)
        # Fixed pause before the browser is torn down.
        # NOTE(review): 2 s is a guess at the download time; a slow connection
        # may need longer.
        time.sleep(2)
    finally:
        # quit() ends the whole WebDriver session (all windows plus the driver
        # process) and runs even when navigation or the lookup fails, so no
        # headless Firefox instance is left behind.
        driver.quit()

def transform_nasdaq(path=None):
    """Read the most recent NASDAQ screener CSV and return a cleaned DataFrame.

    Args:
        path: Optional glob pattern for the screener export(s). Defaults to
            ``~/downloads/nasdaq_screener_*.csv``.

    Returns:
        pandas.DataFrame with snake_case column names where
        ``last_sale`` and ``net_change`` are floats, ``percent_change`` is a
        fraction (the percentage divided by 100) and ``ipo_year`` uses the
        nullable ``Int64`` dtype.

    Raises:
        FileNotFoundError: if no file matches the pattern.
    """
    # Path to .csv in downloads with wildcard (*)
    if path is None:
        path = os.path.expanduser("~") + "/downloads/nasdaq_screener_*.csv"

    file_list = glob.glob(path)
    if not file_list:
        raise FileNotFoundError(f"No NASDAQ screener export matches {path!r}")

    # glob.glob() returns matches in arbitrary order, so the last list element
    # is not necessarily the latest download; pick by modification time.
    file_path = max(file_list, key=os.path.getmtime)
    df = pd.read_csv(file_path)

    # Clean: strip currency / percent symbols and set numeric dtypes
    df["Last Sale"] = df["Last Sale"].replace("[$,]", "", regex=True).astype(float)
    df["Net Change"] = df["Net Change"].astype(float)
    df["% Change"] = df["% Change"].replace("[%]", "", regex=True).astype(float) / 100
    # Nullable integer dtype so rows without an IPO year stay missing
    df["IPO Year"] = df["IPO Year"].astype("Int64")

    df = df.rename(columns={
        "Symbol": "symbol",
        "Name": "name",
        "Last Sale": "last_sale",
        "Net Change": "net_change",
        "% Change": "percent_change",
        "Market Cap": "market_cap",
        "Country": "country",
        "IPO Year": "ipo_year",
        "Volume": "volume",
        "Sector": "sector",
        "Industry": "industry"
    })

    return df

def load_nasdaq(client, df):
    """Insert every row of ``df`` into the ``bakery.nasdaq`` collection.

    Missing scalar values (``NaN``, ``pd.NA``, ``NaT``) are stored as ``None``
    so the documents can be encoded; ``pd.NA`` from nullable dtypes such as
    ``Int64`` is not a BSON-encodable value.

    Args:
        client: Object supporting ``client["bakery"]["nasdaq"]`` whose result
            exposes ``insert_many`` (a ``pymongo.MongoClient`` in this file).
        df: DataFrame whose rows become the inserted documents.

    Returns:
        Whatever ``insert_many`` returns, or ``None`` when ``df`` has no rows
        (``insert_many`` rejects an empty document list, so nothing is sent).
    """
    # One dict per row; replace missing scalars with None for the database.
    rows = [
        {
            key: None if (pd.api.types.is_scalar(value) and pd.isna(value)) else value
            for key, value in record.items()
        }
        for record in df.to_dict("records")
    ]
    if not rows:
        return None

    coll = client["bakery"]["nasdaq"]
    result = coll.insert_many(rows)

    return result


In [67]:
# Drive the headless browser to trigger the screener CSV download.
extract_nasdaq()

In [68]:
# Parse and clean the downloaded screener CSV into a DataFrame.
df = transform_nasdaq()

In [72]:
# Locate the secrets file under the user's home directory.
secrets_path = os.path.join(
    os.path.expanduser('~'),
    'git/bakery/bakery/data/secrets.json',
)

# Parse the JSON secrets file (UTF-8) and pull out the MongoDB connection string.
with open(secrets_path, "r", encoding="utf-8") as f:
    secrets = json.load(f)
conn_str = secrets["mongo"]["conn_str"]

# Connect and push the cleaned DataFrame into the database.
client = MongoClient(conn_str)
result = load_nasdaq(client, df)