In [1]:
import yfinance as yf
import os
import pandas as pd

# Folder that receives one raw price CSV per ticker
directory = "Data_10"
if not os.path.exists(directory):
    os.makedirs(directory)

# Ticker symbols used for training
tickers = ['AAPL', 'MSFT', 'AMD', 'NVDA', 'GOOGL', 'META', 'AMZN', 'CRM', 'ORCL', 'INTC']


# Download each ticker's history and write it to <directory>/<ticker>.csv
for ticker in tickers:
    print(f"Fetching data for: {ticker}")
    history = yf.download(ticker, start="2013-01-01", end="2025-04-06")

    # A ticker with fewer than 100 downloaded rows is reported and not saved
    n_rows = len(history)
    if n_rows < 100:
        print(f"Skipping {ticker}, insufficient data: {n_rows} days")
        continue

    # Only the Open/High/Low/Close/Volume columns are written out
    ohlcv = history[['Open', 'High', 'Low', 'Close', 'Volume']]
    ohlcv.to_csv(f"{directory}/{ticker}.csv")
    print(f"Data for {ticker} has been saved")

print("All eligible stock data has been successfully saved.")


Fetching data for: AAPL
YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


Data for AAPL has been saved
Fetching data for: MSFT


[*********************100%***********************]  1 of 1 completed


Data for MSFT has been saved
Fetching data for: AMD


[*********************100%***********************]  1 of 1 completed


Data for AMD has been saved
Fetching data for: NVDA


[*********************100%***********************]  1 of 1 completed


Data for NVDA has been saved
Fetching data for: GOOGL


[*********************100%***********************]  1 of 1 completed


Data for GOOGL has been saved
Fetching data for: META


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Data for META has been saved
Fetching data for: AMZN
Data for AMZN has been saved
Fetching data for: CRM


[*********************100%***********************]  1 of 1 completed


Data for CRM has been saved
Fetching data for: ORCL


[*********************100%***********************]  1 of 1 completed


Data for ORCL has been saved
Fetching data for: INTC


[*********************100%***********************]  1 of 1 completed

Data for INTC has been saved
All eligible stock data has been successfully saved.





In [2]:
import pandas as pd
import os

# Clean every raw CSV in place: strip leftover header lines, drop incomplete
# rows and make sure the first column is called 'Date'.
for filename in os.listdir(directory):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(directory, filename)

    # Read the CSV file
    df = pd.read_csv(file_path)

    # The raw files carry extra header lines below the column names
    # (presumably the second column level and the index name written by
    # to_csv -- verify against a raw file). Identify them by content: a real
    # data row has a numeric value in every price/volume column. Dropping by
    # content instead of by position keeps this cell idempotent -- re-running
    # it on an already-cleaned file no longer deletes the first two trading
    # days.
    values_as_numbers = df.iloc[:, 1:].apply(pd.to_numeric, errors="coerce")
    is_data_row = values_as_numbers.notna().all(axis=1)
    df = df.loc[is_data_row].reset_index(drop=True)

    # Remove rows containing NaN values (e.g. a missing date)
    df = df.dropna()

    # Rename the first column to 'Date' through the public API; writing into
    # `df.columns.values` mutates the Index's backing array behind pandas' back.
    df = df.rename(columns={df.columns[0]: 'Date'})

    # Save the modified file
    df.to_csv(file_path, index=False)

    print(f"Processing complete: {filename}")

print("🎉 All CSV files have been cleaned!")

Processing complete: AAPL.csv
Processing complete: ORCL.csv
Processing complete: MSFT.csv
Processing complete: AMD.csv
Processing complete: AMZN.csv
Processing complete: INTC.csv
Processing complete: NVDA.csv
Processing complete: GOOGL.csv
Processing complete: META.csv
Processing complete: CRM.csv
🎉 All CSV files have been cleaned!


In [3]:
import os
import pandas as pd

# Directories
raw_dir = "Data_10"
output_dir = "Data_Clean"
os.makedirs(output_dir, exist_ok=True)

# Step 1: Load every raw CSV exactly once, indexed and sorted by date.
# (dicts keep insertion order, so files are processed in os.listdir order)
stock_frames = {}
for filename in os.listdir(raw_dir):
    if filename.endswith(".csv"):
        ticker = os.path.splitext(filename)[0]
        df = pd.read_csv(os.path.join(raw_dir, filename), parse_dates=["Date"])
        stock_frames[ticker] = df.set_index("Date").sort_index()

# Step 2: Build the sorted union of all dates seen in any stock
all_dates = set()
for df in stock_frames.values():
    all_dates.update(df.index)
all_dates = sorted(all_dates)

# Step 3: Put each stock on the common calendar
for ticker, df in stock_frames.items():
    # Track which dates are missing for this stock
    existing_dates = set(df.index)
    missing_dates = [d for d in all_dates if d not in existing_dates]

    # Reindex and forward-fill: a missing date copies the previous available
    # row. Dates before the stock's first row have nothing to copy from and
    # stay NaN. The index is named explicitly so reset_index always yields a
    # 'Date' column.
    df_reindexed = (
        df.reindex(all_dates, method='ffill')
        .rename_axis("Date")
        .reset_index()
    )

    # Save cleaned file
    output_path = os.path.join(output_dir, f"{ticker}.csv")
    df_reindexed.to_csv(output_path, index=False)

    print(f"✅ Processed {ticker}")
    if missing_dates:
        print(f"   ➤ Filled {len(missing_dates)} missing dates for {ticker}:")
        for date in missing_dates:
            print(f"     - {date.strftime('%Y-%m-%d')}")
    else:
        print(f"   ➤ No missing dates for {ticker}")

print("\n🎉 Done! All stocks processed and missing dates filled.")


✅ Processed AAPL
   ➤ No missing dates for AAPL
✅ Processed ORCL
   ➤ No missing dates for ORCL
✅ Processed MSFT
   ➤ No missing dates for MSFT
✅ Processed AMD
   ➤ No missing dates for AMD
✅ Processed AMZN
   ➤ No missing dates for AMZN
✅ Processed INTC
   ➤ No missing dates for INTC
✅ Processed NVDA
   ➤ No missing dates for NVDA
✅ Processed GOOGL
   ➤ No missing dates for GOOGL
✅ Processed META
   ➤ No missing dates for META
✅ Processed CRM
   ➤ No missing dates for CRM

🎉 Done! All stocks processed and missing dates filled.


In [4]:
import pandas as pd
import numpy as np

# Simple moving average (SMA)
def sma(data, window):
    """Return the mean of `data` over a trailing window of `window` observations.

    Positions that do not yet have `window` observations are NaN.
    """
    trailing = data.rolling(window=window)
    return trailing.mean()

# Exponential moving average (EMA)
def ema(data, window):
    """Return the exponentially weighted mean of `data` with span `window`.

    Uses the recursive (``adjust=False``) form, so the first output equals the
    first input.
    """
    weighted = data.ewm(span=window, adjust=False)
    return weighted.mean()

# Function to calculate RSI
def rsi(data, window=14):
    """Relative Strength Index based on simple rolling averages of gains and losses.

    Parameters
    ----------
    data : pandas.Series
        Price series (e.g. closing prices).
    window : int, default 14
        Number of consecutive price changes averaged for gains and losses.

    Returns
    -------
    pandas.Series
        RSI values in [0, 100], aligned with `data`. The first `window` rows
        are NaN because `window` price changes need `window + 1` prices.
        A window with no losses yields 100; a completely flat window
        (no gains and no losses) yields NaN.
    """
    delta = data.diff(1)
    # clip() keeps the undefined first difference as NaN. Replacing it with 0
    # (as `where(delta > 0, 0)` does) would let a fabricated "no change" enter
    # the first window and produce an RSI one row too early.
    gain = delta.clip(lower=0).rolling(window=window).mean()
    loss = (-delta).clip(lower=0).rolling(window=window).mean()
    rs = gain / loss  # inf when loss == 0, which maps to RSI = 100 below
    return 100 - (100 / (1 + rs))

# MACD line
def macd(data, short_window=12, long_window=26):
    """Return the short-span EMA of `data` minus its long-span EMA.

    `short_window` and `long_window` are the spans passed to `ema`.
    """
    return ema(data, short_window) - ema(data, long_window)

# Function to calculate Bollinger Bands
def bollinger_bands(data, window=20, num_std=2):
    """Upper and lower Bollinger Bands around a simple moving average.

    Parameters
    ----------
    data : pandas.Series
        Price series.
    window : int, default 20
        Length of the rolling window used for both the mean and the
        standard deviation.
    num_std : float, default 2
        Band half-width expressed in rolling standard deviations.

    Returns
    -------
    tuple of pandas.Series
        ``(upper_band, lower_band)``, each aligned with `data`; rows with
        fewer than `window` observations are NaN.
    """
    # One rolling window serves both statistics.
    rolling = data.rolling(window=window)
    middle = rolling.mean()
    band_width = rolling.std() * num_std
    upper_band = middle + band_width
    lower_band = middle - band_width
    return upper_band, lower_band

# Function to calculate On-Balance Volume (OBV)
def obv(data, volume):
    """On-Balance Volume: running total of volume signed by the price direction.

    Parameters
    ----------
    data : pandas.Series
        Price series (e.g. closing prices).
    volume : pandas.Series
        Traded volume aligned with `data`.

    Returns
    -------
    pandas.Series
        Cumulative sum of ``+volume`` on up moves, ``-volume`` on down moves
        and ``0`` on unchanged prices. The first row has no previous price, so
        its direction is taken as 0 and the series starts at 0 instead of NaN.
    """
    # diff() is NaN where there is no previous price; treat that as "no move"
    # so it contributes 0 rather than leaving a NaN in the output.
    direction = np.sign(data.diff()).fillna(0)
    return (direction * volume).cumsum()

# Confirmation message printed once all definitions in this cell have run.
print("🎉 All functions for calculating indicators are defined!")

🎉 All functions for calculating indicators are defined!


In [5]:
import os
import pandas as pd

# Directory paths -- input and output are the same folder, so each cleaned
# CSV is overwritten with its indicator-enriched version.
input_dir = "Data_Clean"
output_dir = "Data_Clean"
os.makedirs(output_dir, exist_ok=True)

# Enrich every stock file with technical indicators and the prediction target
for filename in os.listdir(input_dir):
    if not filename.endswith(".csv"):
        continue

    # Load one stock, indexed by date
    df = pd.read_csv(os.path.join(input_dir, filename), parse_dates=["Date"])
    df.set_index("Date", inplace=True)
    close = df['Close']

    # Trend, momentum, volatility and volume indicators
    df['SMA_50'] = sma(close, 50)
    df['SMA_200'] = sma(close, 200)
    df['EMA_12'] = ema(close, 12)
    df['EMA_26'] = ema(close, 26)
    df['RSI_14'] = rsi(close, 14)
    df['MACD'] = macd(close)
    upper, lower = bollinger_bands(close)
    df['Bollinger_Upper'] = upper
    df['Bollinger_Lower'] = lower
    df['OBV'] = obv(close, df['Volume'])

    # Forward 5-row return in percent: the Close five rows ahead relative to
    # the current row's Close. The last five rows have no such future Close
    # and come out as NaN.
    df['5_day_return'] = (close.shift(-5) / close - 1) * 100

    # Write the enriched frame back under the same file name
    df.to_csv(os.path.join(output_dir, filename))

    print(f"✅ Processed {filename}")

print("\n🎉 All indicators and 5-day returns calculated and saved.")


✅ Processed AAPL.csv
✅ Processed ORCL.csv
✅ Processed MSFT.csv
✅ Processed AMD.csv
✅ Processed AMZN.csv
✅ Processed INTC.csv
✅ Processed NVDA.csv
✅ Processed GOOGL.csv
✅ Processed META.csv
✅ Processed CRM.csv

🎉 All indicators and 5-day returns calculated and saved.


In [6]:
import os
import yfinance as yf
import pandas as pd

# Directory paths
# Input and output point at the same folder, so files written below replace
# the files that were read.
input_dir = "Data_Clean"
output_dir = "Data_Clean"
# exist_ok=True: do not fail when the folder is already there
os.makedirs(output_dir, exist_ok=True)

# Retrieve VIX and S&P 500 (GSPC) closes via yfinance
def get_vix_gspc_data(start_date, end_date):
    """Download the 'Close' data of ^VIX and ^GSPC for one date range.

    `start_date` and `end_date` are forwarded unchanged to ``yf.download`` as
    its ``start`` and ``end`` arguments.

    Returns a ``(vix, gspc)`` tuple, in that order.
    """
    vix, gspc = (
        yf.download(symbol, start=start_date, end=end_date)['Close']
        for symbol in ('^VIX', '^GSPC')
    )
    return vix, gspc

# Downloads keyed by (start, end): stocks that share a date range reuse one
# VIX/GSPC download instead of hitting the network once per file.
index_data_cache = {}

# Loop over each file in the directory
for filename in os.listdir(input_dir):
    if filename.endswith(".csv"):
        # Read CSV for each stock
        file_path = os.path.join(input_dir, filename)
        df = pd.read_csv(file_path, parse_dates=["Date"])
        df.set_index("Date", inplace=True)

        # Drop rows with NaN values (indicator warm-up rows and the trailing
        # rows that have no 5-day-ahead close)
        df.dropna(inplace=True)

        # Date range of this stock, used to fetch VIX and GSPC.
        # yfinance treats `end` as exclusive, so request one day past the last
        # stock date; otherwise the final row never gets its own VIX/GSPC
        # value and silently receives the previous day's via forward-fill.
        start_date = df.index.min().strftime('%Y-%m-%d')
        end_date = (df.index.max() + pd.Timedelta(days=1)).strftime('%Y-%m-%d')

        # Retrieve VIX and GSPC data (once per distinct date range)
        date_range = (start_date, end_date)
        if date_range not in index_data_cache:
            index_data_cache[date_range] = get_vix_gspc_data(start_date, end_date)
        vix_data, gspc_data = index_data_cache[date_range]

        # Align VIX and GSPC data with the stock's Date index; dates without
        # an index quote take the most recent earlier value
        df['VIX'] = vix_data.reindex(df.index, method='ffill')
        df['GSPC'] = gspc_data.reindex(df.index, method='ffill')

        # Move '5_day_return' to the last column
        cols = [col for col in df.columns if col != '5_day_return']
        cols.append('5_day_return')
        df = df[cols]

        # Save the updated CSV file
        output_path = os.path.join(output_dir, filename)
        df.to_csv(output_path)

        print(f"✅ Processed {filename}")

print("\n🎉 All files processed with VIX and GSPC added.")


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


✅ Processed AAPL.csv
✅ Processed ORCL.csv
✅ Processed MSFT.csv
✅ Processed AMD.csv


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

✅ Processed AMZN.csv
✅ Processed INTC.csv
✅ Processed NVDA.csv
✅ Processed GOOGL.csv



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


✅ Processed META.csv
✅ Processed CRM.csv

🎉 All files processed with VIX and GSPC added.


In [7]:
import os
import pandas as pd

# Input Directory
input_dir = "Data_Clean"

# Output Directory
output_dir = "Training"
os.makedirs(output_dir, exist_ok=True)

# List to store one dataframe per stock
df_list = []

# Loop over each file in the directory
for filename in os.listdir(input_dir):
    if filename.endswith(".csv"):
        # Read each stock data file
        file_path = os.path.join(input_dir, filename)
        df = pd.read_csv(file_path, parse_dates=["Date"])

        # Ticker = file name without its extension. splitext only removes the
        # final ".csv", so a dotted symbol is kept whole (split('.')[0] would
        # truncate it at the first dot).
        ticker = os.path.splitext(filename)[0]
        df['Ticker'] = ticker

        # Reorder the columns to place Ticker after Date
        cols = ['Date', 'Ticker'] + [col for col in df.columns if col not in ['Date', 'Ticker']]
        df = df[cols]

        df_list.append(df)

# Stack all stocks, order rows by date then ticker, and renumber the index so
# it carries no duplicates from the per-stock frames
combined_df = (
    pd.concat(df_list, axis=0)
    .sort_values(by=["Date", "Ticker"])
    .reset_index(drop=True)
)

# Save the combined dataframe to a single CSV file
csv_path = os.path.join(output_dir, "Combined_Data.csv")
combined_df.to_csv(csv_path, index=False)

print(f"✅ All data has been combined into {output_dir}")

# Display the first few rows to verify the column order
combined_df.head()


✅ All data has been combined into Training


Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,SMA_50,SMA_200,EMA_12,EMA_26,RSI_14,MACD,Bollinger_Upper,Bollinger_Lower,OBV,VIX,GSPC,5_day_return
0,2013-10-16,AAPL,15.495887,15.549728,15.447617,15.505789,251101200,15.032551,13.979203,15.18983,15.04501,62.303597,0.144821,15.554754,14.534793,-2499042000.0,14.71,1721.540039,4.759481
1,2013-10-16,AMD,4.1,4.1,4.03,4.09,34138300,3.7236,3.34195,3.897477,3.840017,61.363635,0.057461,4.066061,3.667939,581574000.0,14.71,1721.540039,-23.227384
2,2013-10-16,AMZN,15.419,15.54,15.2775,15.5245,43414000,15.01884,13.95472,15.468081,15.382953,44.24672,0.085127,16.190246,15.028954,567948000.0,14.71,1721.540039,5.240101
3,2013-10-16,CRM,50.696837,52.636007,50.696837,52.287952,5546400,48.406231,43.628603,51.223758,50.579232,46.932021,0.644525,53.7203,49.767677,16103800.0,14.71,1721.540039,1.445403
4,2013-10-16,GOOGL,22.063435,22.373763,22.017109,22.366291,80223696,21.837339,20.964062,21.855881,21.870292,60.675127,-0.014411,22.534776,21.243958,774225000.0,14.71,1721.540039,14.852515


In [8]:
# Summary statistics of the combined dataframe (all columns, numeric or not)
summary = combined_df.describe(include='all')

# Number of missing values (NaN) per column
missing_values = combined_df.isna().sum()

# Number of infinite values per column. Both signs are counted: comparing
# against float('inf') alone would miss -inf.
infinite_values = combined_df.isin([float('inf'), float('-inf')]).sum()

# Display the summary, missing values, and infinite values
summary, missing_values, infinite_values


(                                 Date Ticker          Open          High  \
 count                           28800  28800  28800.000000  28800.000000   
 unique                            NaN     10           NaN           NaN   
 top                               NaN   AAPL           NaN           NaN   
 freq                              NaN   2880           NaN           NaN   
 mean    2019-07-06 09:36:29.999999744    NaN     97.408535     98.564013   
 min               2013-10-16 00:00:00    NaN      0.345419      0.348702   
 25%               2016-08-24 18:00:00    NaN     30.880708     31.227677   
 50%               2019-07-08 12:00:00    NaN     60.746297     61.532066   
 75%               2022-05-13 18:00:00    NaN    141.937927    143.712909   
 max               2025-03-28 00:00:00    NaN    735.345771    740.251419   
 std                               NaN    NaN     96.429016     97.524954   
 
                  Low         Close        Volume        SMA_50       SMA_

In [9]:
import pandas as pd

input_dir = "Training"
file_name = "Combined_Data.csv"

file_path = os.path.join(input_dir, file_name)

# Load the combined dataset
combined_df = pd.read_csv(file_path, parse_dates=["Date"])

# Sort by Date and Ticker. Sorting on Date alone uses a non-stable sort by
# default, which may shuffle the ticker order inside each date; including
# Ticker keeps the row order deterministic.
combined_df.sort_values(by=["Date", "Ticker"], inplace=True)

# Chronological split boundaries (inclusive upper bounds)
train_end_date = '2020-12-31'  # Last date for the training set
valid_end_date = '2022-12-31'  # Last date for the validation set

# Split the data by date so that no later period precedes an earlier one.
# NOTE(review): '5_day_return' looks five rows ahead, so the targets of the
# last rows before each boundary are computed from prices in the following
# split -- consider dropping those rows if strict separation is required.
is_train = combined_df['Date'] <= train_end_date
is_test = combined_df['Date'] > valid_end_date
is_valid = (combined_df['Date'] > train_end_date) & (combined_df['Date'] <= valid_end_date)

train_set = combined_df[is_train]
valid_set = combined_df[is_valid]
test_set = combined_df[is_test]

# Save to CSV
output_dir = "Training"

train_path = f"{output_dir}/train_2013_2020.csv"
valid_path = f"{output_dir}/valid_2021_2022.csv"
test_path = f"{output_dir}/test_2023_2025.csv"

train_set.to_csv(train_path, index=False)
valid_set.to_csv(valid_path, index=False)
test_set.to_csv(test_path, index=False)

print(f"Training set saved: {train_path} ({len(train_set)} rows)")
print(f"Validation set saved: {valid_path} ({len(valid_set)} rows)")
print(f"Testing set saved: {test_path} ({len(test_set)} rows)")

print("✅ Data split into train, validation, and test sets and saved to CSV.")


Training set saved: Training/train_2013_2020.csv (18160 rows)
Validation set saved: Training/valid_2021_2022.csv (5030 rows)
Testing set saved: Training/test_2023_2025.csv (5610 rows)
✅ Data split into train, validation, and test sets and saved to CSV.


In [10]:
# Summary statistics of the training set (all columns, numeric or not)
summary = train_set.describe(include='all')

# Number of missing values (NaN) per column
missing_values = train_set.isna().sum()

# Number of infinite values per column. Both signs are counted: comparing
# against float('inf') alone would miss -inf.
infinite_values = train_set.isin([float('inf'), float('-inf')]).sum()

# Display the summary, missing values, and infinite values
summary, missing_values, infinite_values

(                                 Date Ticker          Open          High  \
 count                           18160  18160  18160.000000  18160.000000   
 unique                            NaN     10           NaN           NaN   
 top                               NaN   AAPL           NaN           NaN   
 freq                              NaN   1816           NaN           NaN   
 mean    2017-05-25 06:46:47.048457984    NaN     58.084823     58.724272   
 min               2013-10-16 00:00:00    NaN      0.345419      0.348702   
 25%               2015-08-05 18:00:00    NaN     25.253262     25.494974   
 50%               2017-05-24 12:00:00    NaN     41.975027     42.423750   
 75%               2019-03-15 18:00:00    NaN     75.794016     76.707315   
 max               2020-12-31 00:00:00    NaN    298.751960    303.240850   
 std                               NaN    NaN     52.909331     53.539829   
 
                  Low         Close        Volume        SMA_50       SMA_

In [11]:
# Summary statistics of the validation set (all columns, numeric or not)
summary = valid_set.describe(include='all')

# Number of missing values (NaN) per column
missing_values = valid_set.isna().sum()

# Number of infinite values per column. Both signs are counted: comparing
# against float('inf') alone would miss -inf.
infinite_values = valid_set.isin([float('inf'), float('-inf')]).sum()

# Display the summary, missing values, and infinite values
summary, missing_values, infinite_values

(                                 Date Ticker         Open         High  \
 count                            5030   5030  5030.000000  5030.000000   
 unique                            NaN     10          NaN          NaN   
 top                               NaN   ORCL          NaN          NaN   
 freq                              NaN    503          NaN          NaN   
 mean    2022-01-01 07:15:08.946321920    NaN   137.098250   138.907166   
 min               2021-01-04 00:00:00    NaN    10.959212    11.722392   
 25%               2021-07-02 00:00:00    NaN    73.024525    74.012537   
 50%               2021-12-31 00:00:00    NaN   126.130001   128.083808   
 75%               2022-07-05 00:00:00    NaN   177.409794   180.260223   
 max               2022-12-30 00:00:00    NaN   379.889566   382.527147   
 std                               NaN    NaN    86.746820    87.675682   
 
                 Low        Close        Volume       SMA_50      SMA_200  \
 count   5030.000000 

In [12]:
# Summary statistics of the test set (all columns, numeric or not)
summary = test_set.describe(include='all')

# Number of missing values (NaN) per column
missing_values = test_set.isna().sum()

# Number of infinite values per column. Both signs are counted: comparing
# against float('inf') alone would miss -inf.
infinite_values = test_set.isin([float('inf'), float('-inf')]).sum()

# Display the summary, missing values, and infinite values
summary, missing_values, infinite_values

(                                 Date Ticker         Open         High  \
 count                            5610   5610  5610.000000  5610.000000   
 unique                            NaN     10          NaN          NaN   
 top                               NaN   NVDA          NaN          NaN   
 freq                              NaN    561          NaN          NaN   
 mean    2024-02-13 23:24:03.850267392    NaN   189.116086   191.356106   
 min               2023-01-03 00:00:00    NaN    14.462149    14.552078   
 25%               2023-07-26 00:00:00    NaN   106.338674   108.300414   
 50%               2024-02-14 00:00:00    NaN   163.267322   165.320463   
 75%               2024-09-05 00:00:00    NaN   234.672244   237.979479   
 max               2025-03-28 00:00:00    NaN   735.345771   740.251419   
 std                               NaN    NaN   130.774188   132.103862   
 
                 Low        Close        Volume       SMA_50      SMA_200  \
 count   5610.000000 