# Machine Learning (CNN-LSTM)-based Stock Price Prediction and Portfolio Optimization

### Extract Data - Web Scraping

In [None]:
%%shell
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap
#
# This cell prepares apt so that chromium packages can be taken from the
# Debian buster repositories: it writes an apt source list, fetches and
# stores the signing keys that list refers to, and writes a pin file that
# prefers deb.debian.org for chromium* packages.

# Add debian buster
# Each entry names its own keyring via signed-by; those keyring files are
# created further down by the `apt-key export | gpg --dearmour` lines.
# The quoted 'EOF' delimiter keeps the shell from expanding anything in the body.
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
# Fetch the three signing keys (by long key ID) from keyserver.ubuntu.com.
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key (addressed by the last 8 hex digits of the IDs above) and
# de-armor it into the keyring path referenced by the matching signed-by entry.
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
# Priorities written below: 500 for release a=eoan, 300 for everything from
# deb.debian.org, 700 for chromium* from deb.debian.org.
# NOTE(review): the first stanza pins release "eoan"; confirm that this
# matches the Ubuntu release of the runtime the notebook is executed on.
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF


In [None]:
!apt-get update
!apt-get install chromium chromium-driver
!pip3 install selenium
import requests
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

In [None]:
# Scrape the S&P 500 listing on cnyes and keep the price-history URL of every
# ticker whose page answers with HTTP 200. The result is `valid_stock_urls`.

# URL of the webpage
url = "https://www.cnyes.com/usastock/hotprice.aspx?page=hot&kind=sp500"

# Seconds to wait for any single HTTP response; without a timeout a stalled
# connection would block the cell indefinitely.
REQUEST_TIMEOUT = 30

# Index of the table column that holds the ticker symbol (1 = second column)
column_index = 1

# Defined up front so that later cells never hit a NameError.
valid_stock_urls = []

# A single Session reuses connections across the per-ticker probe requests.
with requests.Session() as session:
    response = session.get(url, timeout=REQUEST_TIMEOUT)

    # Fail loudly instead of silently skipping the whole scrape.
    if response.status_code != 200:
        raise RuntimeError(
            f"Could not load the ticker listing {url}: HTTP {response.status_code}"
        )

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "lxml")

    # The listing table is looked up by its element id
    table = soup.find("table", {"id": "ctl05_TBstock"})
    if table is None:
        raise RuntimeError(
            f"Table 'ctl05_TBstock' was not found on {url}; the page layout may have changed"
        )

    for row in table.find_all('tr'):
        columns = row.find_all('td')
        # Rows without enough <td> cells carry no ticker.
        if len(columns) <= column_index:
            continue

        ticker = columns[column_index].text.strip()
        stock_url = "https://invest.cnyes.com/usstock/detail/" + ticker + "/market/minipricehistory#fixed"

        try:
            # Probe the stock page; only pages answering 200 are kept.
            stock_response = session.get(stock_url, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as exc:
            # Best effort: report the failure and move on to the next ticker.
            print(f"Skipping {stock_url}: {exc}")
            continue

        if stock_response.status_code == 200:
            valid_stock_urls.append(stock_url)

# Calculate the number of valid URLs
num_valid_urls = len(valid_stock_urls)
print(f"Number of valid URLs: {num_valid_urls}")



Number of valid URLs: 375


In [None]:
# List every collected URL as "<position>: <ticker>"; the ticker is the sixth
# '/'-separated piece of the URL.
for position, stock_page_url in enumerate(valid_stock_urls):
    ticker_symbol = stock_page_url.split('/')[5]
    print(f"{position}: {ticker_symbol}")

0: A
1: AAPL
2: ABC
3: ABT
4: ADBE
5: ADI
6: ADM
7: ADP
8: ADSK
9: AEE
10: AEP
11: AES
12: AFL
13: AGN
14: AIG
15: AIV
16: AIZ
17: AKAM
18: AKS
19: ALL
20: ALTR
21: AMAT
22: AMD
23: AMGN
24: AMP
25: AMT
26: AMZN
27: AN
28: ANF
29: AON
30: APA
31: APD
32: APH
33: ATI
34: AVB
35: AVY
36: AXP
37: AZO
38: BA
39: BAX
40: BBBY
41: BBY
42: BDX
43: BEN
44: BF.B
45: BIG
46: BIIB
47: BK
48: BLL
49: BMY
50: BRK.B
51: BSX
52: BTU
53: BXP
54: C
55: CAG
56: CAH
57: CAT
58: CB
59: CCL
60: CERN
61: CF
62: CHK
63: CHRW
64: CI
65: CIEN
66: CINF
67: CL
68: CLF
69: CLX
70: CMA
71: CMCSA
72: CME
73: CMI
74: CMS
75: CNP
76: CNX
77: COF
78: COG
79: COP
80: COST
81: CPB
82: CRM
83: CSCO
84: CSX
85: CTAS
86: CTL
87: CTSH
88: CTXS
89: CVS
90: CVX
91: D
92: DD
93: DE
94: DELL
95: DF
96: DFS
97: DGX
98: DHI
99: DHR
100: DIS
101: DISCA
102: DNR
103: DO
104: DOV
105: DOW
106: DRI
107: DTE
108: DUK
109: DV
110: DVA
111: DVN
112: EBAY
113: ECL
114: ED
115: EFX
116: EIX
117: EL
118: EMN
119: EMR
120: EOG
121: EQR
122:

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
# The download cell saves its files under /content/drive/MyDrive/extracted_data/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download the "Last 15 years" price history of each stock page with a
# headless Chromium, saving into one Drive folder per ticker.

# Position in valid_stock_urls at which downloading starts.
# NOTE(review): 372 looks like a resume point from an interrupted run;
# set it to 0 to download every ticker.
DOWNLOAD_START_INDEX = 372

for download_url in valid_stock_urls[DOWNLOAD_START_INDEX:]:
    service = Service("/usr/bin/chromedriver")
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    # The ticker is the sixth '/'-separated piece of the URL.
    file_save_path = "/content/drive/MyDrive/extracted_data/"+download_url.split('/')[5]
    print(file_save_path)
    options.add_experimental_option("prefs", {"download.default_directory": file_save_path})
    driver = webdriver.Chrome(service=service, options=options)
    try:
        # Open the webpage
        driver.get(download_url)

        # Wait for the page to load (you may need to adjust the wait time)
        time.sleep(3)

        # Find the "Download" button and click it through JavaScript
        download_button = driver.find_element(By.XPATH, '//button[text()="Download"]')
        driver.execute_script("arguments[0].click();", download_button)

        # Wait for the modal to appear (you may need to adjust the wait time)
        time.sleep(3)

        # Find and click the "Last 15 years" button
        last_15_years_button = driver.find_element(By.XPATH, '//button[text()="Last 15 years"]')
        last_15_years_button.click()

        # Wait for the file to be generated and downloaded (you may need to adjust the wait time)
        time.sleep(5)
    except Exception as exc:
        # Best effort per ticker: report and continue with the next URL.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f"{download_url} does not contain the data ({type(exc).__name__})")
    finally:
        # quit() ends the whole WebDriver session, including the chromedriver
        # process; close() would only close the window and leak one process
        # per iteration.
        driver.quit()

/content/drive/MyDrive/extracted_data/XRX
/content/drive/MyDrive/extracted_data/YUM
/content/drive/MyDrive/extracted_data/ZION


Mounted at /content/drive
