# SemiConductor Stock Market Analysis 


In [87]:
# Use Yahoo Finance to collect stock information
# Use LSTM to collect and predict future stock prices
# Introduction of other factors to the stock analysis process

# stock market prediction based on the performance of other stocks in the market in the same category

# Focus will be AMD

# depending on the perforamnce of other stocks such as Nvidia, Intel, etc. then the output should change

In [88]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from datetime import datetime

# yahoo Finance API library (useful?)
import yfinance as yf

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense, LSTM

from pandas_datareader import data as pdr
yf.pdr_override()

## Model Preparation

Spreads through 3 years of stock prices, prepare data for analysis

In [89]:
# get stock closing prices
def fetch_stock(stock_symbol, start_years_ago=3):
    end = datetime.now()
    start = datetime(end.year - start_years_ago, end.month, end.day)
    df = pdr.get_data_yahoo(stock_symbol, start=start, end=end)
    # globals()[stock_symbol] = yf.download(stock_symbol, start=start, end=end)
    # df = globals()[stock_symbol] = yf.download(stock_symbol, start=start, end=end)
    return df.filter(['Adj Close']), df

In [90]:
def normalize_data(dataset):
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(dataset)
    return scaler, scaled_data

In [91]:
def split_data(scaled_data, window_size=60):
    X, y = [], []
    for i in range(len(scaled_data) - window_size):
        X.append(scaled_data[i:i+window_size, 0])
        y.append(scaled_data[i+window_size, 0])
    return np.array(X), np.array(y)

In [92]:
def prepare_test_data(scaled_data, training_data_len, window_size):
    test_data = scaled_data[training_data_len - window_size:, :]
    test_set = []
    for i in range(window_size, len(test_data)):
        test_set.append(test_data[i - window_size:i, 0])
    # test_set = np.array(test_set)
    # test_set = np.reshape(test_set, (test_set.shape[0], test_set.shape[1], 1))
    return np.array(test_set).reshape(-1, window_size, 1)

# Testing original model setup

In [None]:
# Function that runs the full LSTM model and transforms data of last 3 years, providing predicted values
# if too lazy to pre setup
def model_data(stock_symbol, epochs=100, batch_size=32, window_size=60):
    # Fetch and normalize data
    data, df = fetch_stock(stock_symbol)
    dataset = data.values
    training_data_len = int(np.ceil(len(dataset) * 0.95))
    scaler, scaled_data = normalize_data(dataset)

    # Create training and testing sets
    X, y = split_data(scaled_data, window_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Build and train the LSTM model
    model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    LSTM(50, return_sequences=False),
    Dense(25),
    Dense(1)
    ])
    
    model.compile(optimizer='adam', loss='mse')

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=0)


    # Make predictions
    test_set = prepare_test_data(scaled_data, training_data_len, window_size)
    predictions = model.predict(test_set)
    scaled_pred = scaler.inverse_transform(predictions)

    # Step 5: Prepare results
    valid = data[training_data_len:]
    valid['Predictions'] = scaled_pred

    return df, valid, scaled_pred

In [94]:
# # outputs the results from model
# stock_list = ['NVDA','TSM', 'INTC', 'AVGO', 'QCOM', 'AMD']

# df_nvda_3, valid_nvda, pred_nvda = model_data(stock_list[0])

# current_price = df_nvda_3['Adj Close'].iloc[-1]
# threshold = 0.03

# predicted_price = pred_nvda[-1]
# if predicted_price > current_price * (1 + threshold):
#     print("Consider buying the stock.")
# elif predicted_price < current_price * (1 - threshold):
#     print("Consider selling the stock.")
# else:
#     print("Hold the stock.")

# # takes around 1 minute to predict the next suggestion

# Stock prediction with other stocks
Modify model to take predictions of last relevant stocks to influence price calculation

In [140]:
# use the Nasdaq index as a macroeconomic indicator
# add additional averages, RSI, or bollinger bands as features

# stock prediction focuses semiconductor stocks

# semiconductor stocks
# nvidia, taiwan semiconductor manufacturing, intel, broadcom, qualcomm, AMD

# 3 years
stock_list = ['NVDA','TSM', 'INTC', 'AVGO', 'QCOM', 'AMD']

# nvda_close, df_nvda = fetch_stock('NVDA')
# tsm_close, df_tsm = fetch_stock('TSM')
# intc_close, df_intc = fetch_stock('INTC')
# avgo_close, df_avgo = fetch_stock('AVGO')
# qcom_close, df_qcom = fetch_stock('QCOM')
# amd_close, df_amd = fetch_stock('AMD')

In [None]:
def preprocess_data(stock_symbol, epochs=50, batch_size=16, window_size=60):
    # Fetch data
    end = datetime.now()
    start = datetime(end.year - 5, end.month, end.day)
    df = pdr.get_data_yahoo(stock_symbol, start=start, end=end)

    # choosing only adj close prices
    data = df.filter(['Adj Close'])
    # convert to numpy array
    dataset = data.values
    # calculate the length of training data
    training_data_len = int(np.ceil(len(dataset) * .95))

    # Scale the data
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(dataset)

    # Prepare the feature (X) and target (y) sequences
    X, y = [], []
    for i in range(len(scaled_data) - window_size):
        X.append(scaled_data[i:i+window_size])
        y.append(scaled_data[i+window_size])
        
    # Convert the lists into numpy arrays
    X, y = np.array(X), np.array(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    LSTM(50, return_sequences=False),
    Dense(25),
    Dense(1)
    ])
    
    model.compile(optimizer='adam', loss='mse')

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=0)

    # Make predictions
    test_data = scaled_data[training_data_len - window_size:, :]
    test_set = []
    for i in range(window_size, len(test_data)):
        test_set.append(test_data[i-window_size:i, 0])

    test_set = np.array(test_set)
    test_set = np.reshape(test_set, (test_set.shape[0], test_set.shape[1], 1))

    prediction = model.predict(test_set)
    scaled_pred = scaler.inverse_transform(prediction)

    # Prepare results
    valid = data[training_data_len:]
    valid['Predictions'] = scaled_pred

    return df, valid, scaled_pred

In [125]:
def preprocess_data_multiple_stocks(stock_symbols, epochs=100, batch_size=32, window_size=60):
    # Fetch data for multiple stocks
    end = datetime.now()
    start = datetime(end.year - 5, end.month, end.day)
    
    # Fetch data for the list of stock symbols
    data = pdr.get_data_yahoo(stock_symbols, start=start, end=end)
    
    # Select only the 'Close' prices for all stocks
    stock_data = data['Close']
    
    # Convert the data into a numpy array (2D: [samples, features])
    dataset = stock_data.values
    
    # Calculate the length for training data
    training_data_len = int(np.ceil(len(dataset) * .95))
    
    # Scale the data using MinMaxScaler
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(dataset)
    
    # Prepare the feature (X) and target (y) sequences
    X, y = [], []
    for i in range(len(scaled_data) - window_size):
        X.append(scaled_data[i:i+window_size])  # All stock data for the window
        y.append(scaled_data[i+window_size, 0])  # Target stock's next day close price (first column)

    # Convert the lists into numpy arrays
    X, y = np.array(X), np.array(y)
    
    # Split the data into training and testing sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Build the LSTM model
    model = Sequential([
        LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
        LSTM(50, return_sequences=False),
        Dense(25),
        Dense(1)
    ])
    
    model.compile(optimizer='adam', loss='mse')

    # Train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=0)
    
    # Make predictions
    test_data = scaled_data[training_data_len - window_size:, :]
    test_set = []
    for i in range(window_size, len(test_data)):
        test_set.append(test_data[i - window_size:i, :])  # Including all stocks' data for the window

    test_set = np.array(test_set)
    test_set = np.reshape(test_set, (test_set.shape[0], test_set.shape[1], test_set.shape[2]))

    # Prediction
    prediction = model.predict(test_set)

    # Reshape the prediction to (num_samples, 1) for inverse transformation
    prediction = prediction.reshape(-1, 1)  # Flatten the output for inverse transformation

    # Inverse transform the predictions for the target stock (the first column)
    scaled_pred = scaler.inverse_transform(np.hstack((prediction, np.zeros((prediction.shape[0], scaled_data.shape[1] - 1)))))
    
    # Prepare results for the target stock
    valid = stock_data[training_data_len:]
    valid['Predictions'] = scaled_pred[:, 0]  # Only take the first column for the target stock

    return data, valid, scaled_pred


In [144]:
def preprocess_data_multiple_stocks(stock_symbols, target_symbol='NVDA', epochs=100, batch_size=32, window_size=60):
    # Fetch data for multiple stocks
    end = datetime.now()
    start = datetime(end.year - 5, end.month, end.day)
    
    # Fetch data for the list of stock symbols
    data = pdr.get_data_yahoo(stock_symbols, start=start, end=end)
    
    # Select only the 'Close' prices for all stocks
    stock_data = data['Close']
    
    # Separate the target stock (NVDA) and other stocks
    target_data = stock_data[target_symbol]  # The target stock data (NVDA)
    other_stocks_data = stock_data.drop(columns=[target_symbol])  # All other stocks
    
    # Combine the data: We want the target stock's history based on all other stocks
    dataset = pd.concat([other_stocks_data, target_data], axis=1).values
    
    # Calculate the length for training data
    training_data_len = int(np.ceil(len(dataset) * .95))
    
    # Scale the data using MinMaxScaler
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(dataset)
    
    # Prepare the feature (X) and target (y) sequences
    X, y = [], []
    for i in range(len(scaled_data) - window_size):
        X.append(scaled_data[i:i+window_size, :-1])  # All stock data for the window, excluding target stock
        y.append(scaled_data[i+window_size, -1])  # Target stock's next day close price (last column)

    # Convert the lists into numpy arrays
    X, y = np.array(X), np.array(y)
    
    # Split the data into training and testing sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Build the LSTM model
    model = Sequential([
        LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
        LSTM(50, return_sequences=False),
        Dense(25),
        Dense(1)
    ])
    
    model.compile(optimizer='adam', loss='mse')

    # Train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=0)
    
    # Make predictions
    test_data = scaled_data[training_data_len - window_size:, :]
    test_set = []
    for i in range(window_size, len(test_data)):
        test_set.append(test_data[i - window_size:i, :-1])  # Use the history of other stocks

    test_set = np.array(test_set)
    test_set = np.reshape(test_set, (test_set.shape[0], test_set.shape[1], test_set.shape[2]))

    # Prediction
    prediction = model.predict(test_set)

    # Reshape the prediction to (num_samples, 1) for inverse transformation
    prediction = prediction.reshape(-1, 1)  # Flatten the output for inverse transformation

    # Inverse transform the predictions for the target stock (the last column)
    scaled_pred = scaler.inverse_transform(np.hstack([np.zeros((prediction.shape[0], scaled_data.shape[1] - 1)), prediction]))

    
    # Prepare results for the target stock
    valid = target_data[training_data_len:]
    valid = valid.to_frame()  # Convert to DataFrame if it's a Series
    valid['Predictions'] = scaled_pred[:, -1]  # Only take the last column for the target stock

    return data, valid, scaled_pred


In [145]:
data, valid, scaled_pred = preprocess_data_multiple_stocks(stock_list)

[*********************100%%**********************]  6 of 6 completed




In [146]:
valid.head()

Unnamed: 0_level_0,NVDA,Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-09-04,106.209999,114.957789
2024-09-05,107.209999,113.475879
2024-09-06,102.830002,112.538263
2024-09-09,106.470001,111.109763
2024-09-10,108.099998,110.583215


In [147]:
r2 = r2_score(valid['NVDA'], valid['Predictions'])
print(f'R² Score: {r2}')

R² Score: 0.6831012109244186


In [148]:
# Extract the last prediction as a scalar value
predicted_price = scaled_pred[-1, 0]  # Ensure this is a scalar (single value)

# Get the current price (last adjusted close price)
current_price = data['Adj Close'].iloc[-1]  # Last adjusted close price

# Define the threshold
threshold = 0.03  # 3% threshold for decision making

# Decision logic
if predicted_price > current_price * (1 + threshold):
    print("Consider buying the stock.")
elif predicted_price < current_price * (1 - threshold):
    print("Consider selling the stock.")
else:
    print("Hold the stock.")


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().