In [None]:
!pip install -r requirements.txt

In [None]:
import os

import clickhouse_connect
import pandas as pd

# Notebook-wide pandas display settings: no limit on displayed columns or on
# output width, at most 100 rows per frame, and a 'display.min_rows' of 20.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.min_rows', 20)

In [None]:
# Connection settings for the ClickHouse server. Each value can be overridden
# through an environment variable of the same name, so credentials do not have
# to be edited into (and committed with) the notebook.
# NOTE(review): the fallbacks below are the values that were previously
# hardcoded here; remove them once the environment is configured everywhere.
CLICKHOUSE_CLOUD_HOSTNAME = os.environ.get('CLICKHOUSE_CLOUD_HOSTNAME', 'csi4142-clickhouse.mershab.xyz')
CLICKHOUSE_CLOUD_USER = os.environ.get('CLICKHOUSE_CLOUD_USER', 'csi')
CLICKHOUSE_CLOUD_PASSWORD = os.environ.get('CLICKHOUSE_CLOUD_PASSWORD', 'iceberg')

# Name of the fact table this notebook creates and populates.
FACT_TABLE_NAME = "fact_table"

In [None]:
# Open a client connection to the configured ClickHouse host on port 443,
# using the 'default' database.
client = clickhouse_connect.get_client(
    host=CLICKHOUSE_CLOUD_HOSTNAME,
    port=443,
    user=CLICKHOUSE_CLOUD_USER,
    password=CLICKHOUSE_CLOUD_PASSWORD,
    database='default',
)

# Only report success when the server actually answers the ping; previously
# the "connected" message was printed even if ping() returned False.
server_reachable = client.ping()
print(server_reachable)
if not server_reachable:
    raise ConnectionError("ClickHouse server " + CLICKHOUSE_CLOUD_HOSTNAME + " did not respond to ping")
print("connected to " + CLICKHOUSE_CLOUD_HOSTNAME + "\n")


In [None]:
# Create the fact table if it does not exist yet; the statement is a no-op
# when the table is already there.
# The column list deliberately has no trailing comma after the last column,
# so the statement also parses on older ClickHouse releases that reject one.
create_table_query = f"""
CREATE TABLE IF NOT EXISTS {FACT_TABLE_NAME} (
    ticker String,
    median_worker_pay Float64,
    stock_price Float64,
    pay_ratio String,
    CEO String,
    ceo_pay Float64,
    cost_of_living Float64,
    company String,
    year String,
    state String
) ENGINE = MergeTree()
ORDER BY year
"""

# To rebuild the table from scratch, run this first:
# client.command(f'DROP TABLE IF EXISTS {FACT_TABLE_NAME}')
client.command(create_table_query)

In [None]:

# Pull the whole company dimension table into a DataFrame and preview the
# first ten rows.
company_df = client.query_df(query='SELECT * FROM company_dimension_table_test')
company_df.head(n=10)


In [None]:
# Parse the 'datetime' column into pandas datetimes (stored as 'date') and
# derive the calendar year of every row from it.
company_df['date'] = pd.to_datetime(company_df['datetime'])
company_df['year'] = company_df['date'].dt.year

# Mean 'close' per (ticker, year); reset_index turns the group keys back into
# ordinary columns so they can be used as merge keys later.
df_yearly_avg = (
    company_df
    .groupby(['ticker', 'year'])['close']
    .mean()
    .reset_index()
)

print(df_yearly_avg)

In [None]:

# Load the CEO dimension table and copy its 'Ticker' / 'Year' columns to the
# lowercase names that the later merge uses as keys.
ceo_df = client.query_df(query='SELECT * FROM ceo_dimension_table_test')
for _source, _alias in (('Ticker', 'ticker'), ('Year', 'year')):
    ceo_df[_alias] = ceo_df[_source]

ceo_df.head(n=10)


In [None]:

# Load only the 'Cost of Living Index' rows of the cost-of-living dimension
# table, then add a lowercase 'year' copy of 'Year' to merge on.
col_df = client.query_df(
    query="SELECT * FROM cost_of_living_dimension_table_test WHERE Indicator = 'Cost of Living Index'"
)
col_df['year'] = col_df['Year']
col_df

In [None]:
# Combine the three sources with inner joins: the yearly average close per
# ticker with the CEO rows for the same (ticker, year), then with the
# cost-of-living rows for the same year.
# NOTE(review): the second merge keys on 'year' alone, so if col_df holds
# several rows for one year every matching row is repeated for each of them —
# confirm this is intended.
df_joined = df_yearly_avg.merge(ceo_df, on=['ticker', 'year'], how='inner')
df_joined = df_joined.merge(col_df, on=['year'], how='inner')
df_joined

In [None]:
# Copy the merged columns to the lowercase names used by the fact table.
# Keys are the new column names, values are the df_joined columns they are
# copied from ('year' already exists and is overwritten with 'Year_x').
fact_column_sources = {
    'stock_price': 'close',
    'pay_ratio': 'Pay_Ratio',
    'median_worker_pay': 'Median_Worker_Pay',
    'ceo_pay': 'CEO_Pay',
    'company': 'Company',
    'cost_of_living': 'Cost_of_Living_Index',
    'year': 'Year_x',
    'state': 'State',
}
for fact_column, source_column in fact_column_sources.items():
    df_joined[fact_column] = df_joined[source_column]

# The fact table declares year as a String column, so store it as text.
df_joined['year'] = df_joined['year'].astype(str)

# Drop every source column that was copied above, plus the leftover
# 'Year_y', 'Ticker' and 'Indicator' columns.
df_fact_table = df_joined.drop(
    columns=[*fact_column_sources.values(), 'Year_y', 'Ticker', 'Indicator']
)
print(df_fact_table)

In [None]:
# Show the dtype of every column in the frame that is about to be inserted.
df_fact_table.dtypes

In [None]:
# Insert the assembled rows into the fact table.
# NOTE(review): re-running this cell presumably appends the same rows again,
# since nothing clears the table first — confirm before re-executing.
client.insert_df(table=FACT_TABLE_NAME, df=df_fact_table)

In [None]:
# Read the fact table back from ClickHouse to check what was stored.
result_df = client.query_df(query=f'SELECT * FROM {FACT_TABLE_NAME}')
result_df