# Testing Stock Fetcher 

## Imports 

In [1]:
# General 
import numpy as np 
import pandas as pd
from sklearn.impute import KNNImputer

# Custom 
from src.data_fetching import scrape_sp500_wikipedia, fetch_stock_data, prepare_data_for_vae

## Fetching the data 

In [2]:
# Universe of tickers: the scraped S&P 500 constituents plus a few custom extras
sp500_df = scrape_sp500_wikipedia()
custom_tickers = ['TSLA', 'ZM', 'SNOW']

# Fetch the per-ticker data; the call also hands back sector and industry mappings
(
    stock_data,
    sector_mapping,
    industry_mapping,
) = fetch_stock_data(sp500_df, custom_tickers)

# Derive the frame that will be fed to the VAE
stock_data_vae = prepare_data_for_vae(stock_data)

Fetching stock data...:  26%|██▌       | 131/505 [00:38<02:31,  2.46it/s]$BRK.B: possibly delisted; no price data found  (period=1mo) (Yahoo error = "No data found, symbol may be delisted")
$BRK.B: possibly delisted; no price data found  (period=1y) (Yahoo error = "No data found, symbol may be delisted")
Fetching stock data...:  27%|██▋       | 138/505 [00:42<02:01,  3.02it/s]$BF.B: possibly delisted; no price data found  (period=1mo)
$BF.B: possibly delisted; no price data found  (period=1y)
Fetching stock data...:  43%|████▎     | 219/505 [01:03<01:16,  3.75it/s]SW: Period '1y' is invalid, must be one of ['1d', '5d', '1mo', '3mo', '6mo', 'ytd', 'max']
Fetching stock data...:  87%|████████▋ | 439/505 [02:02<00:19,  3.39it/s]AMTM: Period '1y' is invalid, must be one of ['1d', '5d', '1mo', '3mo', 'ytd', 'max']
Fetching stock data...: 100%|██████████| 505/505 [02:20<00:00,  3.59it/s]

Completed fetching stock data in 140.73 seconds.





In [3]:
# Preview the first five rows of the VAE-ready frame
print(stock_data_vae.head(n=5))

     Market Cap  Open Price  Close Price  High Price   Low Price  Last Close  \
0  1.414585e+11  249.039049   247.875714  259.589996  240.179993  241.479996   
1  4.074940e+10  102.765239   102.670000  106.930000   98.830002   99.839996   
2  1.573076e+11   29.002857    28.926667   30.430000   27.730000   27.760000   
3  2.112973e+10  106.577619   106.596191  115.320000   98.660004   99.889999   
4  8.033044e+10  225.065240   225.251904  230.360001  220.110001  223.080002   

   52 Week High  52 Week Low  Last Month Volatility  52 Week Volatility  ...  \
0    260.459991   149.139999               0.038552            0.197440  ...   
1    106.930000    47.880001               0.064736            0.310658  ...   
2     31.540001    25.200001               0.059316            0.242262  ...   
3    115.320000    70.309998               0.096640            0.303938  ...   
4    230.360001   190.699997               0.032097            0.166716  ...   

   Industry_Tools & Accessories  Indus

In [4]:
# Total number of columns in the VAE-ready frame
print(stock_data_vae.shape[1])

# Cast every column to float so the whole frame shares a single numeric dtype
stock_data_vae = stock_data_vae.astype(float)

# Confirm the resulting per-column dtypes
print(stock_data_vae.dtypes)

135
Market Cap                               float64
Open Price                               float64
Close Price                              float64
High Price                               float64
Low Price                                float64
                                          ...   
Industry_Utilities—Regulated Electric    float64
Industry_Utilities—Regulated Gas         float64
Industry_Utilities—Regulated Water       float64
Industry_Utilities—Renewable             float64
Industry_Waste Management                float64
Length: 135, dtype: object


In [5]:
# List the columns of the frame returned by fetch_stock_data (before VAE preparation)
print(stock_data.columns)

Index(['Ticker', 'Company Name', 'Market Cap', 'Sector', 'Industry',
       'Open Price', 'Close Price', 'High Price', 'Low Price', 'Last Close',
       '52 Week High', '52 Week Low', 'Last Month Volatility',
       '52 Week Volatility', 'Yearly Dividend Rate', 'Last Year Return Rate',
       'Sector_encoded', 'Industry_encoded'],
      dtype='object')


In [6]:
# Boolean mask of missing entries in the VAE-ready frame, computed once and reused
vae_na_mask = stock_data_vae.isna()

# Names of the columns that contain at least one missing value
vae_na_columns = stock_data_vae.columns[vae_na_mask.any()].tolist()

# Total count of missing cells across the whole frame
print(vae_na_mask.sum().sum())

# Columns with missing values in the VAE-ready frame
print(vae_na_columns)

# Same column listing for the un-prepared stock_data frame
print(stock_data.columns[stock_data.isna().any()].tolist())

# Fraction of missing values per column, shown only for the columns that have any
nas_proportion = vae_na_mask.mean()
print(nas_proportion[vae_na_columns])

0
[]
[]
Series([], dtype: float64)


In [7]:
# Impute any remaining missing values by forward-filling down each column.
# `DataFrame.ffill()` replaces `fillna(method='ffill')`, whose `method` argument is
# deprecated in recent pandas releases and emits a FutureWarning.
# NOTE(review): each row appears to be a different ticker, so a forward fill would
# copy the previous row's (another company's) values into the gap — confirm this is
# the intended imputation strategy. The preceding audit found no missing values, so
# this step is currently a no-op.
stock_data_vae = stock_data_vae.ffill()

# Re-check: fraction of missing values per column, limited to columns that still have any
nas_proportion = stock_data_vae.isna().mean()
print(nas_proportion[stock_data_vae.columns[stock_data_vae.isna().any()].tolist()])

Series([], dtype: float64)


In [8]:
# Work on an independent (deep) copy so later steps leave stock_data_vae untouched
df = stock_data_vae.copy(deep=True)

In [9]:
# Final sanity check: total number of missing cells left in the working copy
print(df.isna().sum().sum())

0
