# Data Collection
## Pull data from Yahoo Finance

In [11]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf

In [12]:
# Define ticker symbols and date range
tickers = [
    "AAPL",  # Apple
    "MSFT",  # Microsoft
    "NVDA",  # Nvidia
    "SAP",   # SAP
    "TSM"    # Taiwan Semi
]

# Set start date to January 1, 2015
start_date = '2015-01-01'

# Set end date to today's date
end_date = pd.Timestamp.today().strftime('%Y-%m-%d')

# Fetch historical data
data = {}

for ticker in tickers:
    # Download raw data
    df = yf.download(
        ticker,
        start=start_date,
        end=end_date,
        auto_adjust=False,
        progress=False  
    )

    # Add ticker column
    df['Ticker'] = ticker

    # Resery index to have Date as a column
    df.reset_index(inplace=True)

    # Save individial cleaned DataFrame
    data[ticker] = df

    print(f"Downloaded data for {ticker}")
    display(df.head(2))


Downloaded data for AAPL


Price,Date,Adj Close,Close,High,Low,Open,Volume,Ticker
Ticker,Unnamed: 1_level_1,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 8_level_1
0,2015-01-02,24.237551,27.3325,27.860001,26.8375,27.8475,212818400,AAPL
1,2015-01-05,23.554745,26.5625,27.1625,26.352501,27.0725,257142000,AAPL


Downloaded data for MSFT


Price,Date,Adj Close,Close,High,Low,Open,Volume,Ticker
Ticker,Unnamed: 1_level_1,MSFT,MSFT,MSFT,MSFT,MSFT,MSFT,Unnamed: 8_level_1
0,2015-01-02,39.93306,46.759998,47.419998,46.540001,46.66,27913900,MSFT
1,2015-01-05,39.565838,46.330002,46.73,46.25,46.369999,39673900,MSFT


Downloaded data for NVDA


Price,Date,Adj Close,Close,High,Low,Open,Volume,Ticker
Ticker,Unnamed: 1_level_1,NVDA,NVDA,NVDA,NVDA,NVDA,NVDA,Unnamed: 8_level_1
0,2015-01-02,0.483038,0.50325,0.507,0.49525,0.50325,113680000,NVDA
1,2015-01-05,0.47488,0.49475,0.50475,0.4925,0.50325,197952000,NVDA


Downloaded data for SAP


Price,Date,Adj Close,Close,High,Low,Open,Volume,Ticker
Ticker,Unnamed: 1_level_1,SAP,SAP,SAP,SAP,SAP,SAP,Unnamed: 8_level_1
0,2015-01-02,58.992451,70.040001,70.360001,69.639999,69.790001,683600,SAP
1,2015-01-05,56.600422,67.199997,68.360001,67.059998,68.300003,1678800,SAP


Downloaded data for TSM


Price,Date,Adj Close,Close,High,Low,Open,Volume,Ticker
Ticker,Unnamed: 1_level_1,TSM,TSM,TSM,TSM,TSM,TSM,Unnamed: 8_level_1
0,2015-01-02,16.676708,22.280001,22.459999,22.0,22.450001,6074100,TSM
1,2015-01-05,16.272514,21.74,22.17,21.709999,22.139999,9031800,TSM


In [13]:
# Combine all data into a single DataFrame
combined_data = pd.concat(data.values(), ignore_index=True)
print("Combined data shape:", combined_data.shape)
display(combined_data.head(2))

# Save combined data to CSV to Data folder
combined_data.to_csv('../Data/Stock_data.csv', index=False)


Combined data shape:

 (13660, 32)


Price,Date,Adj Close,Close,High,Low,Open,Volume,Ticker,Adj Close,Close,...,High,Low,Open,Volume,Adj Close,Close,High,Low,Open,Volume
Ticker,Unnamed: 1_level_1,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 8_level_1,MSFT,MSFT,...,SAP,SAP,SAP,SAP,TSM,TSM,TSM,TSM,TSM,TSM
0,2015-01-02,24.237551,27.3325,27.860001,26.8375,27.8475,212818400.0,AAPL,,,...,,,,,,,,,,
1,2015-01-05,23.554745,26.5625,27.1625,26.352501,27.0725,257142000.0,AAPL,,,...,,,,,,,,,,


In [14]:
combined_data.head(2)

Price,Date,Adj Close,Close,High,Low,Open,Volume,Ticker,Adj Close,Close,...,High,Low,Open,Volume,Adj Close,Close,High,Low,Open,Volume
Ticker,Unnamed: 1_level_1,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 8_level_1,MSFT,MSFT,...,SAP,SAP,SAP,SAP,TSM,TSM,TSM,TSM,TSM,TSM
0,2015-01-02,24.237551,27.3325,27.860001,26.8375,27.8475,212818400.0,AAPL,,,...,,,,,,,,,,
1,2015-01-05,23.554745,26.5625,27.1625,26.352501,27.0725,257142000.0,AAPL,,,...,,,,,,,,,,
