In [1]:
!pip install -r requirements.txt



In [2]:
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import clickhouse_connect
import numpy as np
# Display settings for DataFrame output in this notebook.
# Each option is set exactly once so the effective value is unambiguous.
pd.set_option('display.max_columns', None)  # None: never hide columns
pd.set_option('display.width', None)        # None: no fixed character width
# Frames longer than max_rows are shown truncated, with min_rows rows visible.
pd.set_option('display.max_rows', 100)
pd.set_option('display.min_rows', 20)

In [3]:
# Notebook configuration: change the values in this cell to adjust the run.

# Reference list of symbols (18 entries).
# NOTE(review): the cells visible in this notebook build their ticker list from
# the data directory instead of using this variable; confirm whether it is still needed.
tickers = [
    "AAPL.US", "TSLA.US", "BAC.US", "MSFT.US", "GOOGL.US", "AMZN.US",
    "V.US", "JNJ.US", "WMT.US", "NVDA.US", "META.US", "XOM.US",
    "UNH.US", "KO.US", "MA.US", "NFLX.US", "DIS.US", "SBUX.US",
]  # 18

timeframe_in = "D1"  # timeframe for neural network training - input and we will trade on the same timeframe

# ClickHouse connection settings. Each value is taken from an environment
# variable of the same name when it is set, so real credentials can be supplied
# without editing the notebook; the literals are fallbacks that keep existing
# runs working unchanged.
# NOTE(review): the fallback user/password are still committed in plain text;
# rotate them and drop the defaults once the environment variables are in use.
CLICKHOUSE_CLOUD_HOSTNAME = os.environ.get('CLICKHOUSE_CLOUD_HOSTNAME', 'csi4142-clickhouse.mershab.xyz')
CLICKHOUSE_CLOUD_USER = os.environ.get('CLICKHOUSE_CLOUD_USER', 'csi')
CLICKHOUSE_CLOUD_PASSWORD = os.environ.get('CLICKHOUSE_CLOUD_PASSWORD', 'iceberg')

# Name of the ClickHouse table the consolidated data is loaded into.
TABLE_NAME = 'company_dimension_table_test'

In [4]:
# Discover which tickers have a CSV for the configured timeframe.
# Files are expected to be named <ticker_name>_<timeframe>.csv.
data_dir = f'../data/company_dimension/{timeframe_in}'
file_suffix = f'_{timeframe_in}.csv'

# A set keeps each ticker name only once.
ticker_names = set()
for filename in os.listdir(data_dir):
    # Accept only files that carry the exact "_<timeframe>.csv" suffix and strip
    # that suffix to get the ticker. The loading cell re-builds the path as
    # "<ticker>_<timeframe>.csv", so any other CSV in the folder (or a ticker
    # containing "_") would otherwise yield a name whose file cannot be opened.
    if filename.endswith(file_suffix) and len(filename) > len(file_suffix):
        ticker_names.add(filename[:-len(file_suffix)])

# Sorted list so later cells process the tickers in a stable order.
ticker_names_list = sorted(ticker_names)
ticker_names_list

['A.US',
 'AAL.US',
 'AAPL.US',
 'ABBV.US',
 'ABNB.US',
 'ABT.US',
 'ACGL.US',
 'ACN.US',
 'ADBE.US',
 'ADI.US',
 'ADM.US',
 'ADP.US',
 'ADSK.US',
 'AEE.US',
 'AEP.US',
 'AES.US',
 'AFL.US',
 'AIG.US',
 'AIZ.US',
 'AJG.US',
 'AKAM.US',
 'ALB.US',
 'ALGN.US',
 'ALL.US',
 'ALLE.US',
 'AMAT.US',
 'AMCR.US',
 'AMD.US',
 'AME.US',
 'AMGN.US',
 'AMP.US',
 'AMT.US',
 'AMZN.US',
 'ANET.US',
 'ANSS.US',
 'AON.US',
 'AOS.US',
 'APA.US',
 'APD.US',
 'APH.US',
 'APTV.US',
 'ARE.US',
 'ASML.US',
 'ATO.US',
 'AVB.US',
 'AVGO.US',
 'AVY.US',
 'AWK.US',
 'AXON.US',
 'AXP.US',
 'AZN.US',
 'AZO.US',
 'BA.US',
 'BAC.US',
 'BALL.US',
 'BAX.US',
 'BBWI.US',
 'BBY.US',
 'BDX.US',
 'BEN.US',
 'BF.B.US',
 'BG.US',
 'BIIB.US',
 'BIO.US',
 'BK.US',
 'BKNG.US',
 'BKR.US',
 'BLDR.US',
 'BLK.US',
 'BMY.US',
 'BR.US',
 'BRK.B.US',
 'BRO.US',
 'BSX.US',
 'BWA.US',
 'BX.US',
 'BXP.US',
 'C.US',
 'CAG.US',
 'CAH.US',
 'CARR.US',
 'CAT.US',
 'CB.US',
 'CBOE.US',
 'CBRE.US',
 'CCEP.US',
 'CCI.US',
 'CCL.US',
 'CDAY.US',

In [5]:
# Load the price history of every discovered ticker: one DataFrame per CSV,
# each tagged with the ticker symbol it belongs to.
exchange_suffix = '.US'
df_list = []
single_ticker_df = {}  # NOTE(review): never filled by the cells visible here
for ticker in ticker_names_list:
    csv_path = f"../data/company_dimension/{timeframe_in}/{ticker}_{timeframe_in}.csv"
    # read_csv already returns a DataFrame; `datetime` is parsed into dates here.
    temp_df = pd.read_csv(csv_path, parse_dates=['datetime'])
    # Remove only a trailing ".US". Splitting on ".US" would cut the symbol at
    # the first occurrence of that text, wherever it appears in the name.
    if ticker.endswith(exchange_suffix):
        temp_df['ticker'] = ticker[:-len(exchange_suffix)]
    else:
        temp_df['ticker'] = ticker
    df_list.append(temp_df)

df_list

[       datetime    open    high     low   close    volume ticker
 0    1999-11-18   45.56   50.00   40.37   42.44  28749500      A
 1    1999-11-19   42.94   43.00   39.82   40.69  10507800      A
 2    1999-11-22   41.31   43.13   40.06   43.07   4327500      A
 3    1999-11-23   42.50   42.94   40.25   40.38   3980200      A
 4    1999-11-24   40.13   41.94   40.00   41.00   3369900      A
 5    1999-11-26   40.88   41.50   40.75   41.19   1235800      A
 6    1999-11-29   41.00   42.44   40.56   41.94   2823500      A
 7    1999-11-30   42.00   42.94   40.94   42.25   2879800      A
 8    1999-12-01   42.19   43.44   41.87   43.25   2037500      A
 9    1999-12-02   43.75   45.01   43.38   44.44   2148600      A
 ...         ...     ...     ...     ...     ...       ...    ...
 6155 2024-03-02  139.09  139.10  139.06  139.06    244498      A
 6156 2024-03-04  139.15  143.49  138.81  142.96   1189895      A
 6157 2024-03-05  142.90  146.40  142.86  144.38   1698611      A
 6158 2024

In [6]:
# Stack the per-ticker frames row-wise into one table; ignore_index=True
# replaces the per-file row labels with a single continuous index.
consolidated_df = pd.concat(objs=df_list, axis=0, ignore_index=True)

In [7]:

# Convert the columns to the representation that is uploaded later.
# `datetime` becomes 'YYYY-MM-DD' text. No `unit` is passed to to_datetime:
# the column was already parsed into dates by read_csv, and without `unit`
# this cell can be re-run after the column has become text (with unit='s'
# pandas may reject a value such as '1999-11-18' as non-convertible).
consolidated_df['datetime'] = pd.to_datetime(consolidated_df['datetime']).dt.strftime('%Y-%m-%d')
# Values that are not numeric become <NA> (errors='coerce'); fractional values
# are rounded down before the cast to the nullable Int64 dtype.
consolidated_df['volume'] = np.floor(pd.to_numeric(consolidated_df['volume'], errors='coerce')).astype('Int64')

consolidated_df

Unnamed: 0,datetime,open,high,low,close,volume,ticker
0,1999-11-18,45.56,50.00,40.37,42.44,28749500,A
1,1999-11-19,42.94,43.00,39.82,40.69,10507800,A
2,1999-11-22,41.31,43.13,40.06,43.07,4327500,A
3,1999-11-23,42.50,42.94,40.25,40.38,3980200,A
4,1999-11-24,40.13,41.94,40.00,41.00,3369900,A
5,1999-11-26,40.88,41.50,40.75,41.19,1235800,A
6,1999-11-29,41.00,42.44,40.56,41.94,2823500,A
7,1999-11-30,42.00,42.94,40.94,42.25,2879800,A
8,1999-12-01,42.19,43.44,41.87,43.25,2037500,A
9,1999-12-02,43.75,45.01,43.38,44.44,2148600,A


In [12]:
# Open the ClickHouse connection using the settings from the config cell.
client = clickhouse_connect.get_client(
    host=CLICKHOUSE_CLOUD_HOSTNAME,
    port=443,
    user=CLICKHOUSE_CLOUD_USER,
    password=CLICKHOUSE_CLOUD_PASSWORD,
    database='default',
)
# Print the ping result first, then the host that was targeted.
print(client.ping())
print(f"connected to {CLICKHOUSE_CLOUD_HOSTNAME}\n")
# Optional reset: uncomment to drop the target table before reloading it.
#client.command(f'DROP TABLE IF EXISTS {TABLE_NAME}')



True
connected to csi4142-clickhouse.mershab.xyz



''

In [None]:
# Create the target table on the server unless it already exists
# (CREATE TABLE IF NOT EXISTS leaves an existing table as it is).
# The columns mirror the consolidated DataFrame: `datetime` is stored as text,
# matching the '%Y-%m-%d' strings produced earlier in the notebook.
# NOTE(review): `volume` is UInt64 here while the DataFrame column is nullable
# Int64; confirm how missing or negative volumes should be handled on insert.
create_table_query = f"""
CREATE TABLE IF NOT EXISTS {TABLE_NAME} (
    datetime String,
    open Float64,
    high Float64,
    low Float64,
    close Float64,
    volume UInt64,
    ticker String
) ENGINE = MergeTree()
ORDER BY datetime
"""

# Send the DDL statement over the open connection.
client.command(create_table_query)

In [13]:
# Upload the rows of consolidated_df into TABLE_NAME over the open connection.
# NOTE(review): nothing here clears the table first, so re-running this cell
# presumably appends the same rows again; confirm before re-executing.
client.insert_df(TABLE_NAME, consolidated_df)