In [1]:
############################### Packages ###############################

import pandas as pd
import yfinance as yf
import numpy as np
import matplotlib as plt
import time
from functools import reduce

In [2]:
############################### History of stock prices ###############################

# List of possible stocks we want to pull from yahoo finance. Found on Nasdaq: https://www.nasdaq.com/market-activity/stocks/screener

nasdaq_dataset = pd.read_csv("C:/Users/CMert/Documents/Bachelor-projekt/Projektet/Data/Stock_names.csv")
print(nasdaq_dataset["Symbol"])

0          A
1         AA
2       AACG
3       AACT
4       AADI
        ... 
7010    ZVRA
7011    ZVSA
7012     ZWS
7013    ZYME
7014    ZYXI
Name: Symbol, Length: 7015, dtype: object


In [3]:
# We create a list, where each element is the dataframe of a stock


def yf_stocks_df(list_of_stock_names):
    
    global stock_df
    stock_df = []
    
    global excluded_stocks
    excluded_stocks = []
    
    for stock in list_of_stock_names:
        try:
            # We combine the data of each stock from yf and add the variables; dividends, stock splits and sector.
            stock_ticker = yf.Ticker(stock)
            stock_data = yf.download(stock, interval = "1d")
            
            # If stock_data is empty, we will skip to the next stock on the list
            if stock_data.empty:
                excluded_stocks.append(stock)
                continue
            
            # The tickers include time in the date column, which is a problem since the stock data does not. Therefore we have to remove the time component.

            # Since "Date" is an index, we first make it a column.
            dividends = pd.DataFrame(stock_ticker.dividends).reset_index()
            splits = pd.DataFrame(stock_ticker.splits).reset_index()
            stock_data = pd.DataFrame(stock_data).reset_index()


            # Now we ensure all dataframes have the same dimension for the "Date".
            dividends['Date'] = pd.to_datetime(dividends['Date']).dt.date
            splits['Date'] = pd.to_datetime(splits['Date']).dt.date
            stock_data['Date'] = pd.to_datetime(stock_data['Date']).dt.date
            
            
            # Merge dividends and splits into the price dataframe by aligning them on the "Date" column. None available values, which can only be found in "Dividends" and "Stock Splits"
            # because we removed all stocks with empty values, is set to 0.

            column_combination = [stock_data, dividends, splits]

            indiv_stock_df = reduce(lambda  left,right: pd.merge(left, right,on=['Date'],how='outer'), column_combination).fillna(0)
            
            time.sleep(0.5)
            # Include "Sector", "Industry" and "Symbol" from nasdaq df.
            indiv_stock_df["Sector"] = nasdaq_dataset.loc[nasdaq_dataset["Symbol"] == stock, "Sector"].iloc[0] 
            
            time.sleep(0.2)
            
            indiv_stock_df["Industry"] = nasdaq_dataset.loc[nasdaq_dataset["Symbol"] == stock, "Industry"].iloc[0] 
            
            time.sleep(0.2)
            
            indiv_stock_df["Symbol"] = stock
            
            stock_df.append(indiv_stock_df)
        
        # If the stock should create an error, we will skip to the next stock.
        except Exception as e:
            excluded_stocks.append(stock)
            continue
        
        time.sleep(0.5)
    
    return excluded_stocks, stock_df

In [9]:
yf_stocks_df(nasdaq_dataset["Symbol"][3000:])

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['HNRA']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 1926-02-17 -> 2025-01-23) (Yahoo error = "No data found, symbol may be delisted")')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['HOFVW']: YFInvalidPeriodErro

(['HNRA',
  'HOFVW',
  'HOLOW',
  'HOVRW',
  'HPAIW',
  'HPP^C',
  'HSPOR',
  'HTZWW',
  'HUBCW',
  'HUBCZ',
  'HUMAW',
  'HVT/A',
  'HWM^',
  'HYMCW',
  'HYZNW',
  'ICD',
  'ICR^A',
  'ICUCW',
  'IFIN',
  'IGTAW',
  'IIPR^A',
  'INN^E',
  'INN^F',
  'INST',
  'INTEW',
  'INVO',
  'INVZW',
  'IPXXW',
  'ISRLW',
  'ITI',
  'IVCAW',
  'IVCBW',
  'IVCPW',
  'IVR^B',
  'IVR^C',
  'IXAQW',
  'JCTCF',
  'JPM^C',
  'JPM^D',
  'JPM^J',
  'JPM^K',
  'JPM^L',
  'JPM^M',
  'JTAIW',
  'JVSAR',
  'JWSM',
  'JXN^A',
  'KA',
  'KACLR',
  'KDLYW',
  'KEY^I',
  'KEY^J',
  'KEY^K',
  'KEY^L',
  'KIM^L',
  'KIM^M',
  'KIM^N',
  'KITTW',
  'KPLTW',
  'KREF^A',
  'KSM',
  'KTRA',
  'KWESW',
  'LCW',
  'LDTCW',
  'LEXXW',
  'LFLYW',
  'LFT^A',
  'LGHLW',
  'LIFWZ',
  'LILM',
  'LILMW',
  'LLAP',
  'LNC^D',
  'LNZAW',
  'LOTWW',
  'LSEAW',
  'LSXMA',
  'LSXMB',
  'LSXMK',
  'LTRYW',
  'LUNRW',
  'LVROW',
  'LXP^C',
  'MAA^I',
  'MACIW',
  'MAPSW',
  'MARXR',
  'MCAAW',
  'MDAIW',
  'MDV^A',
  'MEDS',
  'MER^

In [10]:
excluded_stocks_list = pd.DataFrame(excluded_stocks, columns=["Stocks"])

excluded_stocks_list.to_csv("excluded_stocks_list.csv", index=False)

In [6]:
## For itreation 1-3000


# The last step is to remove the stocks on excluded_stocks_list from the Nasdaq dataset, so that we can create a stock dataset with viable data

excluded_stocks_list = pd.read_csv("C:/Users/CMert/Documents/Bachelor-projekt/Projektet/Data/excluded_stocks_list.csv")

nasdaq_true_dataset = nasdaq_dataset[~nasdaq_dataset["Symbol"].isin(excluded_stocks_list["Stocks"])]

nasdaq_true_dataset = nasdaq_true_dataset.reset_index(drop=True)



# All none available elements will be written as "Unknown" and can hence still be used as information for the stock
nasdaq_true_dataset = nasdaq_true_dataset.fillna("Unknown")

nasdaq_true_dataset.to_csv("nasdaq_true_dataset.csv", index=False)

In [7]:

nasdaq_true_dataset = pd.read_csv("C:/Users/CMert/Documents/Bachelor-projekt/Projektet/Data/nasdaq_true_dataset.csv")

nasdaq_true_dataset

Unnamed: 0,Symbol,Name,Last Sale,Net Change,% Change,Market Cap,Country,IPO Year,Volume,Sector,Industry
0,A,Agilent Technologies Inc. Common Stock,$140.67,0.1800,0.128%,41042001864.0,United States,1999.0,73898,Industrials,Biotechnology: Laboratory Analytical Instruments
1,AA,Alcoa Corporation Common Stock,$34.93,0.3700,1.071%,9023821090.0,United States,2016.0,659537,Industrials,Aluminum
2,AACG,ATA Creativity Global American Depositary Shares,$0.5809,-0.0291,-4.77%,18588876.0,China,2008.0,24130,Real Estate,Other Consumer Services
3,AACT,Ares Acquisition Corporation II Class A Ordina...,$10.76,0.0100,0.093%,0.0,Unknown,2023.0,1782034,Finance,Blank Checks
4,AADI,Aadi Bioscience Inc. Common Stock,$1.66,-0.0900,-5.143%,40860624.0,United States,Unknown,29907,Health Care,Biotechnology: Pharmaceutical Preparations
...,...,...,...,...,...,...,...,...,...,...,...
6675,ZVRA,Zevra Therapeutics Inc. Common Stock,$7.1992,0.1292,1.827%,378805987.0,United States,Unknown,56092,Health Care,Biotechnology: Pharmaceutical Preparations
6676,ZVSA,ZyVersa Therapeutics Inc. Common Stock,$2.91,0.1100,3.929%,2206903.0,United States,2022.0,8986,Health Care,Biotechnology: Pharmaceutical Preparations
6677,ZWS,Zurn Elkay Water Solutions Corporation Common ...,$31.92,-0.1700,-0.53%,5456037560.0,United States,2012.0,83391,Industrials,Industrial Machinery/Components
6678,ZYME,Zymeworks Inc. Common Stock,$11.44,-0.1000,-0.867%,812782142.0,United States,Unknown,38478,Health Care,Biotechnology: Pharmaceutical Preparations


In [8]:
# We save the dataframe of each stock. We can access stock_df from the function since it is defined as globally accessable.

for stock, df in enumerate(stock_df):
    stock_name = nasdaq_true_dataset["Symbol"][stock]
    time.sleep(0.2)
    
    df.to_csv(f'{stock_name}_data.csv', index=False)
    time.sleep(0.2)

In [11]:
## For iteration 3001 to the last

# The last step is to remove the stocks on empty_stock_list from the Nasdaq dataset, so that we can create a stock dataset with viable data

excluded_stocks_list = pd.read_csv("C:/Users/CMert/Documents/Bachelor-projekt/Projektet/Data/excluded_stocks_list.csv")

nasdaq_true_dataset = nasdaq_true_dataset[~nasdaq_true_dataset["Symbol"].isin(excluded_stocks_list["Stocks"])]

nasdaq_true_dataset = nasdaq_true_dataset.reset_index(drop=True)



# All none available elements will be written as "Unknown" and can hence still be used as information for the stock
nasdaq_true_dataset = nasdaq_true_dataset.fillna("Unknown")

nasdaq_true_dataset.to_csv("nasdaq_true_dataset.csv", index=False)

In [31]:

nasdaq_true_dataset = pd.read_csv("C:/Users/CMert/Documents/Bachelor-projekt/Projektet/Data/nasdaq_true_dataset.csv")

nasdaq_true_dataset

Unnamed: 0,Symbol,Name,Last Sale,Net Change,% Change,Market Cap,Country,IPO Year,Volume,Sector,Industry
0,A,Agilent Technologies Inc. Common Stock,$140.67,0.1800,0.128%,41042001864.0,United States,1999.0,73898,Industrials,Biotechnology: Laboratory Analytical Instruments
1,AA,Alcoa Corporation Common Stock,$34.93,0.3700,1.071%,9023821090.0,United States,2016.0,659537,Industrials,Aluminum
2,AACG,ATA Creativity Global American Depositary Shares,$0.5809,-0.0291,-4.77%,18588876.0,China,2008.0,24130,Real Estate,Other Consumer Services
3,AACT,Ares Acquisition Corporation II Class A Ordina...,$10.76,0.0100,0.093%,0.0,Unknown,2023.0,1782034,Finance,Blank Checks
4,AADI,Aadi Bioscience Inc. Common Stock,$1.66,-0.0900,-5.143%,40860624.0,United States,Unknown,29907,Health Care,Biotechnology: Pharmaceutical Preparations
...,...,...,...,...,...,...,...,...,...,...,...
6675,ZVRA,Zevra Therapeutics Inc. Common Stock,$7.1992,0.1292,1.827%,378805987.0,United States,Unknown,56092,Health Care,Biotechnology: Pharmaceutical Preparations
6676,ZVSA,ZyVersa Therapeutics Inc. Common Stock,$2.91,0.1100,3.929%,2206903.0,United States,2022.0,8986,Health Care,Biotechnology: Pharmaceutical Preparations
6677,ZWS,Zurn Elkay Water Solutions Corporation Common ...,$31.92,-0.1700,-0.53%,5456037560.0,United States,2012.0,83391,Industrials,Industrial Machinery/Components
6678,ZYME,Zymeworks Inc. Common Stock,$11.44,-0.1000,-0.867%,812782142.0,United States,Unknown,38478,Health Care,Biotechnology: Pharmaceutical Preparations


In [17]:
# I will only use "Sector" and "Industry" in the final dataset. Since most firms in the dataset is located in the US, "location" will be excluded.

# Notice: this is only to get an overview of the variables of the nasdaq dataset.

catog_data_overview = nasdaq_true_dataset.pivot_table(index = ["Industry"], aggfunc = "size")

# catog_data_overview.to_csv("catog_data_overview.csv")

catog_data_overview

Industry
 Medicinal Chemicals and Botanical Products             31
Accident &Health Insurance                               5
Advertising                                             27
Aerospace                                               29
Agricultural Chemicals                                  13
                                                      ... 
Trusts Except Educational Religious and Charitable     118
Unknown                                                327
Water Sewer Pipeline Comm & Power Line Construction      6
Water Supply                                            14
Wholesale Distributors                                   2
Length: 150, dtype: int64

In [12]:
# We can do as we previously did, since we decided to do the data gathering in 2 steps. That's why this algorithm has follow the order of the stock_df and not of nasdaq_true_dataset.

for df in stock_df:
    stock_name = df["Symbol"].iloc[0] 
    time.sleep(0.2)
    
    df.to_csv(f'{stock_name}_data.csv', index=False)
    time.sleep(0.2)

In [13]:
# We combine all dataframes now. The reason we combine them here was in case the code broke midway or if I got any new ideas.

combined_stock_data_lower = pd.concat(stock_df, ignore_index=True)

combined_stock_data_lower.to_csv('combined_stock_data_lower.csv', index=False)

KeyboardInterrupt: 

In [41]:
# Finally we complete the dataframe.

upper_stock_df = pd.read_csv("C:/Users/CMert/Documents/Bachelor-projekt/Projektet/Data/combined_stock_data_upper.csv")

lower_stock_df = pd.read_csv("C:/Users/CMert/Documents/Bachelor-projekt/Projektet/Data/combined_stock_data_lower.csv")


complete_stock_df = pd.concat([upper_stock_df, lower_stock_df], ignore_index=True)

complete_stock_df.to_csv('complete_stock_df.csv', index=False)

complete_stock_df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,Sector,Industry,Symbol
0,1999-11-18,32.546494,35.765381,28.612303,31.473534,26.603025,62546380.0,0.0,0.0,Industrials,Biotechnology: Laboratory Analytical Instruments,A
1,1999-11-19,30.713518,30.758226,28.478184,28.880545,24.411301,15234146.0,0.0,0.0,Industrials,Biotechnology: Laboratory Analytical Instruments,A
2,1999-11-22,29.551144,31.473534,28.657009,31.473534,26.603025,6577870.0,0.0,0.0,Industrials,Biotechnology: Laboratory Analytical Instruments,A
3,1999-11-23,30.400572,31.205294,28.612303,28.612303,24.184568,5975611.0,0.0,0.0,Industrials,Biotechnology: Laboratory Analytical Instruments,A
4,1999-11-24,28.701717,29.998213,28.612303,29.372318,24.826971,4843231.0,0.0,0.0,Industrials,Biotechnology: Laboratory Analytical Instruments,A
...,...,...,...,...,...,...,...,...,...,...,...,...
25904227,2024-09-05,7.750000,7.890000,7.740000,7.890000,7.890000,89600.0,0.0,0.0,Health Care,Biotechnology: Electromedical & Electrotherape...,ZYXI
25904228,2024-09-06,7.870000,7.890000,7.680000,7.840000,7.840000,81600.0,0.0,0.0,Health Care,Biotechnology: Electromedical & Electrotherape...,ZYXI
25904229,2024-09-09,7.800000,7.850000,7.730000,7.830000,7.830000,86400.0,0.0,0.0,Health Care,Biotechnology: Electromedical & Electrotherape...,ZYXI
25904230,2024-09-10,7.860000,8.030000,7.740000,8.020000,8.020000,51200.0,0.0,0.0,Health Care,Biotechnology: Electromedical & Electrotherape...,ZYXI


In [37]:
############################### GRAVEYARD ###############################

aapl = yf.download("ZYXI", interval = "1d")

aapl

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-02-25,2.636364,2.636364,2.554545,2.563636,1.379478,14080
2004-02-26,2.590909,2.636364,2.563636,2.627273,1.413721,65450
2004-02-27,2.627273,2.681818,2.590909,2.636364,1.418613,17930
2004-03-01,2.681818,2.681818,2.590909,2.627273,1.413721,15950
2004-03-02,2.681818,2.745455,2.545455,2.590909,1.394154,31570
...,...,...,...,...,...,...
2024-09-06,7.870000,7.890000,7.680000,7.840000,7.840000,81600
2024-09-09,7.800000,7.850000,7.730000,7.830000,7.830000,86400
2024-09-10,7.860000,8.030000,7.740000,8.020000,8.020000,51200
2024-09-11,8.010000,8.010000,7.800000,7.960000,7.960000,53900


In [39]:
kai = pd.read_csv("C:/Users/CMert/Documents/Bachelor-projekt/Projektet/combined_stock_data_lower.csv")
kai

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,Sector,Industry,Symbol
0,1980-03-17,0.00,1.81250,1.68750,1.68750,0.731286,23200.0,0.0,0.0,Consumer Discretionary,Office Equipment/Supplies/Services,HNI
1,1980-03-18,0.00,1.78125,1.65625,1.65625,0.717744,91200.0,0.0,0.0,Consumer Discretionary,Office Equipment/Supplies/Services,HNI
2,1980-03-19,0.00,1.81250,1.68750,1.68750,0.731286,20000.0,0.0,0.0,Consumer Discretionary,Office Equipment/Supplies/Services,HNI
3,1980-03-20,0.00,1.78125,1.65625,1.65625,0.717744,4000.0,0.0,0.0,Consumer Discretionary,Office Equipment/Supplies/Services,HNI
4,1980-03-21,0.00,1.81250,1.68750,1.68750,0.731286,90400.0,0.0,0.0,Consumer Discretionary,Office Equipment/Supplies/Services,HNI
...,...,...,...,...,...,...,...,...,...,...,...,...
14976785,2024-09-05,7.75,7.89000,7.74000,7.89000,7.890000,89600.0,0.0,0.0,Health Care,Biotechnology: Electromedical & Electrotherape...,ZYXI
14976786,2024-09-06,7.87,7.89000,7.68000,7.84000,7.840000,81600.0,0.0,0.0,Health Care,Biotechnology: Electromedical & Electrotherape...,ZYXI
14976787,2024-09-09,7.80,7.85000,7.73000,7.83000,7.830000,86400.0,0.0,0.0,Health Care,Biotechnology: Electromedical & Electrotherape...,ZYXI
14976788,2024-09-10,7.86,8.03000,7.74000,8.02000,8.020000,51200.0,0.0,0.0,Health Care,Biotechnology: Electromedical & Electrotherape...,ZYXI


In [36]:
nasdaq_true_dataset.loc[nasdaq_true_dataset["Symbol"] == "ZYXI", "Industry"].iloc[0]

'Biotechnology: Electromedical & Electrotherapeutic Apparatus'

In [3]:
# We combine the data of each stock from yf and add the variables; dividends, stock splits, sector and.
aapl_ticker = yf.Ticker("AAPL")
aapl_data = yf.download("AAPL")

# The tickers include time in the date column, which is a problem since the stock data does not. Therefore we have to remove the time component.

# Since "Date" is an index, we first make it a column.
dividends = pd.DataFrame(aapl_ticker.dividends).reset_index()
splits = pd.DataFrame(aapl_ticker.splits).reset_index()
aapl_data = pd.DataFrame(aapl_data).reset_index()


# Now we ensure all dataframes have the same dimension for the "Date".
dividends['Date'] = pd.to_datetime(dividends['Date']).dt.date
splits['Date'] = pd.to_datetime(splits['Date']).dt.date
aapl_data['Date'] = pd.to_datetime(aapl_data['Date']).dt.date


# Merge dividends and splits into the price dataframe by aligning them on the "Date" column. None available values, which can only be found in "Dividends" and "Stock Splits"
# because we removed all stocks with empty values, is set to 0.

column_combination = [aapl_data, dividends, splits]

final_yf_df = reduce(lambda  left,right: pd.merge(left, right,on=['Date'],how='outer'), column_combination).fillna(0)

final_yf_df["Sector"] = nasdaq_true_dataset.loc[nasdaq_true_dataset["Symbol"] == "AAPL", "Sector"].iloc[0]

final_yf_df["Industry"] = nasdaq_true_dataset.loc[nasdaq_true_dataset["Symbol"] == "AAPL", "Industry"].iloc[0] 

final_yf_df

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,Sector,Industry
0,1980-12-12,0.128348,0.128906,0.128348,0.128348,0.098943,469033600,0.0,0.0,Technology,Computer Manufacturing
1,1980-12-15,0.122210,0.122210,0.121652,0.121652,0.093781,175884800,0.0,0.0,Technology,Computer Manufacturing
2,1980-12-16,0.113281,0.113281,0.112723,0.112723,0.086898,105728000,0.0,0.0,Technology,Computer Manufacturing
3,1980-12-17,0.115513,0.116071,0.115513,0.115513,0.089049,86441600,0.0,0.0,Technology,Computer Manufacturing
4,1980-12-18,0.118862,0.119420,0.118862,0.118862,0.091631,73449600,0.0,0.0,Technology,Computer Manufacturing
...,...,...,...,...,...,...,...,...,...,...,...
11021,2024-09-03,228.550003,229.000000,221.169998,222.770004,222.770004,50190600,0.0,0.0,Technology,Computer Manufacturing
11022,2024-09-04,221.660004,221.779999,217.479996,220.850006,220.850006,43840200,0.0,0.0,Technology,Computer Manufacturing
11023,2024-09-05,221.630005,225.479996,221.520004,222.380005,222.380005,36615400,0.0,0.0,Technology,Computer Manufacturing
11024,2024-09-06,223.949997,225.240005,219.770004,220.820007,220.820007,48388600,0.0,0.0,Technology,Computer Manufacturing


In [None]:
def stocks_not_in_yf(stock_name):
    global empty_stock
    empty_stock = []
    
    global error_stock
    error_stock = []
    
    for stock in stock_name:
        try:
            yf_data_per_stock = yf.download(stock, interval = "1d")
            
            if yf_data_per_stock.empty:
                empty_stock.append(stock)
    
        except Exception as e:
            print(f"The stock, {stock}, gave error: {e}") and error_stock.append(stock)
            
        time.sleep(0.5) # This prevent false positives because the quick moventment of each iteration can create errors
            
            
    return empty_stock, error_stock

In [14]:
import os
import pandas as pd


folder_path = "C:/Users/CMert/Documents/Aktiepriser"
all_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]


df_list = []
for file_name in all_files:
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path)
    
    # Optional: If you need to track which file/stock this data came from, 
    # you can add a column here:
    # df['stock_symbol'] = file_name.replace('.csv', '')
    
    df_list.append(df)

combined_df = pd.concat(df_list, ignore_index=True)


combined_df.to_csv("combined_stocks.csv", index=False)
