# Testing Stock Fetcher 

## Imports 

In [1]:
# General 
import numpy as np 
import pandas as pd
from sklearn.impute import KNNImputer

# Custom 
from src.data_fetching import scrape_sp500_wikipedia, fetch_stock_data, prepare_data_for_vae

## Fetching the data 

In [2]:
# Ticker universe: the table returned by the S&P 500 Wikipedia scraper,
# plus a few extra tickers chosen by hand.
sp500_df = scrape_sp500_wikipedia()
custom_tickers = ["TSLA", "ZM", "SNOW"]

# Fetch the per-ticker data (slow: roughly 140 s in the recorded run).
# NOTE(review): the recorded log reports "no price data found" for BF.B and
# BRK.B. Yahoo Finance normally spells share classes with a dash (BF-B,
# BRK-B) -- confirm whether fetch_stock_data normalises the symbols.
(
    stock_data,
    sector_mapping,
    industry_mapping,
) = fetch_stock_data(sp500_df, custom_tickers)

# Convert the fetched frame into the table used by the remaining cells.
stock_data_vae = prepare_data_for_vae(stock_data)

Fetching stock data...:  28%|██▊       | 140/505 [00:39<01:44,  3.48it/s]SW: Period '1y' is invalid, must be one of ['1d', '5d', '1mo', '3mo', '6mo', 'ytd', 'max']
Fetching stock data...:  57%|█████▋    | 286/505 [01:20<01:00,  3.62it/s]$BF.B: possibly delisted; no price data found  (period=1mo)
$BF.B: possibly delisted; no price data found  (period=1y)
Fetching stock data...:  76%|███████▋  | 386/505 [01:47<00:34,  3.44it/s]AMTM: Period '1y' is invalid, must be one of ['1d', '5d', '1mo', '3mo', 'ytd', 'max']
Fetching stock data...:  80%|███████▉  | 403/505 [01:51<00:21,  4.79it/s]$BRK.B: possibly delisted; no price data found  (period=1mo) (Yahoo error = "No data found, symbol may be delisted")
$BRK.B: possibly delisted; no price data found  (period=1y) (Yahoo error = "No data found, symbol may be delisted")
Fetching stock data...: 100%|██████████| 505/505 [02:20<00:00,  3.59it/s]

Completed fetching stock data in 140.80 seconds.





In [5]:
# Preview the first five rows of the prepared table
print(stock_data_vae.head(n=5))

     Market Cap   Open Price  Close Price   High Price    Low Price  \
0  5.149642e+10  3105.733805  3112.015241  3223.489990  2980.100098   
1  9.215435e+09    69.908571    69.633333    73.199997    64.870003   
2  1.082894e+10    17.072729    16.897391    18.800667    14.680000   
3  1.248629e+11    85.859048    85.955714    88.790001    80.639999   
4  8.644570e+10   887.523810   888.109047   943.020020   862.229980   

    Last Close  52 Week High  52 Week Low  Last Month Volatility  \
0  3046.350098   3256.370117  2510.000000               0.056742   
1    66.059998     73.459999    41.170478               0.071511   
2    15.230000     21.763273    14.255096               0.128044   
3    84.720001     88.790001    50.840000               0.047940   
4   895.919983    943.020020   676.982121               0.047250   

   52 Week Volatility  ...  Industry_Trucking  \
0            0.202641  ...                  0   
1            0.293405  ...                  0   
2            0.37

In [8]:
# Total number of columns in the prepared table
print(stock_data_vae.shape[1])

# Cast every column to float so the whole table shares one numeric dtype
stock_data_vae = stock_data_vae.astype(float)

# Show the dtype of each column after the cast
print(stock_data_vae.dtypes)

154
Market Cap                               float64
Open Price                               float64
Close Price                              float64
High Price                               float64
Low Price                                float64
                                          ...   
Industry_Utilities—Regulated Electric    float64
Industry_Utilities—Regulated Gas         float64
Industry_Utilities—Regulated Water       float64
Industry_Utilities—Renewable             float64
Industry_Waste Management                float64
Length: 154, dtype: object


In [4]:
# Boolean mask of missing entries in the prepared table, computed once and reused
na_mask = stock_data_vae.isna()

# Total number of missing values across the whole prepared table
print(na_mask.sum().sum())

# Names of the prepared-table columns that contain at least one missing value
na_columns = stock_data_vae.columns[na_mask.any()].tolist()
print(na_columns)

# Same listing for the raw fetched frame
print(stock_data.columns[stock_data.isna().any()].tolist())

# Share of missing values per column, restricted to the columns that have any
nas_proportion = na_mask.mean()
print(nas_proportion[na_columns])

0
[]
[]
Series([], dtype: float64)


In [8]:
# Forward-fill missing values: each gap takes the value from the row above it.
# `DataFrame.ffill()` is the supported spelling; `fillna(method='ffill')` is
# deprecated in recent pandas releases and gives the same result.
# NOTE(review): each row appears to be a different company, so a forward fill
# copies the previous company's value into the gap -- confirm this is intended
# (KNNImputer is imported at the top of the notebook but never used).
stock_data_vae = stock_data_vae.ffill()

# Re-check: share of missing values for every column that still has any
nas_proportion = stock_data_vae.isna().mean()
print(nas_proportion[stock_data_vae.columns[stock_data_vae.isna().any()].tolist()])

Series([], dtype: float64)


In [9]:
# Work on an independent (deep) copy; changes to `df` do not affect `stock_data_vae`
df = stock_data_vae.copy(deep=True)

In [10]:
# Count the missing values remaining anywhere in the working copy
print(df.isna().sum().sum())

0
