In [1]:
import pandas as pd
import glob
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

import itertools
from pathlib import Path
import tqdm

In [None]:
# Read the basket file and every event file under data/ into all_event_data

root_path = Path('data/')
company_basket_file = root_path / 'company_basket.csv'

# Load basket data (';'-separated)
basket_df = pd.read_csv(company_basket_file, sep=";")

# Event files are discovered recursively; the two families differ in their delimiter:
# 'Sweden*.txt' files are ';'-separated, 'data*.csv' files are ','-separated.
all_txt_files = list(root_path.rglob('Sweden*.txt'))
all_csv_file = list(root_path.rglob('data*.csv'))

# Collect the per-file frames and concatenate ONCE at the end. pd.concat copies its
# inputs, so concatenating inside the loop re-copies every previously loaded row for
# each additional file (quadratic in the total row count).
event_frames = []
for event_file in tqdm.tqdm(all_txt_files, desc="Iterating all txt files"):
    current_df = pd.read_csv(event_file, sep=";")
    event_frames.append(current_df)

for event_file in tqdm.tqdm(all_csv_file, desc="Iterating all csv files"):
    current_df = pd.read_csv(event_file, sep=",")
    event_frames.append(current_df)

# pd.concat raises on an empty list; fall back to an empty frame when no event
# file was found, which is what the incremental version produced in that case.
if event_frames:
    all_event_data = pd.concat(event_frames, ignore_index=True)
else:
    all_event_data = pd.DataFrame()

In [17]:
# Build one-hot features per company from industry_name and economic_sector_name

# Keep a single row for each distinct (country, industry, sector, company) combination;
# only these four identifying columns are carried forward.
company_dedup_columns = ['country_name', 'industry_name', 'economic_sector_name', 'company_name']
company_overview_data = all_event_data[company_dedup_columns].drop_duplicates()


def _one_hot_by(attribute):
    """Return company_name plus one integer 0/1 indicator column per value of `attribute`."""
    return pd.get_dummies(
        company_overview_data[['company_name', attribute]],
        columns=[attribute],
        dtype=int,
    )


# --------- One-Hot Encoding Industry and Sector ---------
industry_encoded = _one_hot_by('industry_name')
sector_encoded = _one_hot_by('economic_sector_name')

# Place the two encodings side by side (rows align on the shared index); company_name
# is dropped from the sector part so that it appears only once in the result.
stock_features = pd.concat(
    [industry_encoded, sector_encoded.drop(columns='company_name')],
    axis=1,
)

Iterating all txt files: 100%|█████████████████████████████████████████████| 10/10 [00:04<00:00,  2.24it/s]
Iterating all csv files: 100%|█████████████████████████████████████████████| 11/11 [00:10<00:00,  1.05it/s]


In [106]:
# Assign each company a size bucket (Small / Medium / Large) from its most recent Mcap

# Market-cap bin edges and their labels. pd.cut uses right-closed intervals by default:
# (0, 2e9] -> Small, (2e9, 10e9] -> Medium, above 10e9 -> Large.
# Values outside the bins (e.g. <= 0) or missing values get NaN as category.
mcap_bins = [0, 2e9, 10e9, float("inf")]
mcap_labels = ["Small", "Medium", "Large"]

# Restrict the event data to market-cap observations
is_mcap_row = all_event_data['finparametername'] == 'Mcap'
mcap_df = all_event_data.loc[is_mcap_row].copy()

# Order newest-first, then keep the first row per company: that row is the one with
# the greatest 'endtime'. The size bucket is derived from its 'finval'.
mcap_company_key = ['country_name', 'industry_name', 'economic_sector_name', 'company_name']
latest_mcap_df = (
    mcap_df
    .sort_values(by='endtime', ascending=False)
    .drop_duplicates(subset=mcap_company_key, keep='first')
    .assign(**{
        "Size Category": lambda frame: pd.cut(frame["finval"], bins=mcap_bins, labels=mcap_labels)
    })
)
latest_mcap_df.head()

Unnamed: 0,country_name,industry_name,economic_sector_name,company_name,finparametername,endtime,finval,Size Category
11217504,United States of America,Specialty Chemicals,Basic Materials,CSW Industrials Inc,Mcap,2024-08-07 00:00:00,4755978000.0,Medium
11218628,United States of America,Airlines,Industrials,Hawaiian Holdings Inc,Mcap,2024-08-07 00:00:00,642320800.0,Small
11289148,New Zealand,Electric Utilities,Utilities,Genesis Energy Ltd,Mcap,2024-08-07 00:00:00,2630302000.0,Medium
11239094,United States of America,Leisure & Recreation,Consumer Cyclicals,Travelzoo,Mcap,2024-08-07 00:00:00,263722500.0,Small
11289245,Canada,Electric Utilities,Utilities,Innergex Renewable Energy Inc,Mcap,2024-08-07 00:00:00,2098220000.0,Medium


In [111]:
# -----Volatility Classification 1 (high, medium, low):  by Industry-Relative Classification-------
company_id_fields = ['country_name', 'industry_name', 'economic_sector_name', 'company_name']

# Closing-price observations, ordered chronologically within each company
is_closing_price = all_event_data['finparametername'] == 'closingPrice'
closing_price_df = (
    all_event_data[is_closing_price]
    .copy()
    .sort_values(company_id_fields + ['endtime'])
)

# Group by all company identifier fields
company_groups = closing_price_df.groupby(company_id_fields)

# One record per company: the standard deviation of its observation-to-observation
# returns, scaled by sqrt(252) (i.e. every observation is treated as one trading day).
# A company with fewer than two valid returns ends up with NaN volatility.
volatility_results = [
    {
        'country_name': country,
        'industry_name': industry,
        'economic_sector_name': sector,
        'company_name': company,
        'price_volatility': company_data['finval'].pct_change().dropna().std() * np.sqrt(252),
    }
    for (country, industry, sector, company), company_data in tqdm.tqdm(company_groups)
]

# Create result dataframe
volatility_df = pd.DataFrame(volatility_results)

industry_groups = volatility_df.groupby('industry_name')

# Row-aligned cut-offs: the 33% and 67% quantiles of price_volatility taken over
# the companies that share the row's industry.
industry_volatility = industry_groups['price_volatility']
industry_low_cut = industry_volatility.transform(lambda vols: vols.quantile(0.33))
industry_high_cut = industry_volatility.transform(lambda vols: vols.quantile(0.67))

# 'Medium' is the default. 'High' is written last, so it wins whenever a row satisfies
# both conditions (e.g. an industry with a single company, where both cut-offs coincide).
# NaN volatilities fail both comparisons and therefore stay 'Medium'.
volatility_df['volatility_category'] = 'Medium'
volatility_df.loc[volatility_df['price_volatility'] <= industry_low_cut, 'volatility_category'] = 'Low'
volatility_df.loc[volatility_df['price_volatility'] >= industry_high_cut, 'volatility_category'] = 'High'

100%|████████████████████████████████████████████████████████████████| 2673/2673 [00:01<00:00, 1712.00it/s]


In [112]:
# Only the final expression of a cell is rendered automatically, so the preview is
# passed to display() (provided by IPython in every notebook session) instead of
# being evaluated and silently discarded.
display(volatility_df.head())

# Number of companies per volatility category
volatility_df["volatility_category"].value_counts()

volatility_category
High      903
Low       894
Medium    876
Name: count, dtype: int64

In [102]:
# -----Volatility Classification 2 (high, medium, low):  by Fixed Thresholds Based on Financial Industry Standards-------

# NOTE: this cell rebuilds volatility_df from scratch, replacing any frame of the
# same name (and its columns) that an earlier cell may have produced.
company_id_fields = ['country_name', 'industry_name', 'economic_sector_name', 'company_name']

# Closing-price observations, ordered chronologically within each company
is_closing_price = all_event_data['finparametername'] == 'closingPrice'
closing_price_df = (
    all_event_data[is_closing_price]
    .copy()
    .sort_values(company_id_fields + ['endtime'])
)

# Group by all company identifier fields
company_groups = closing_price_df.groupby(company_id_fields)

# One record per company: standard deviation of the observation-to-observation
# returns times sqrt(252), which annualizes under the assumption of daily data.
# Fewer than two valid returns yields NaN.
volatility_results = [
    {
        'country_name': country,
        'industry_name': industry,
        'economic_sector_name': sector,
        'company_name': company,
        'price_volatility': company_data['finval'].pct_change().dropna().std() * np.sqrt(252),
    }
    for (country, industry, sector, company), company_data in tqdm.tqdm(company_groups)
]

# Create result dataframe
volatility_df = pd.DataFrame(volatility_results)

def classify_stock_volatility(volatility_df, low_threshold=0.15, high_threshold=0.30, cap_quantile=0.99):
    """Label every row 'Low', 'Medium' or 'High' using fixed volatility cut-offs.

    The frame is modified IN PLACE: the columns 'price_volatility_capped' and
    'volatility_category' are added (or overwritten), and the very same object
    is returned. Pass a copy if the input must stay untouched.

    Parameters
    ----------
    volatility_df : pandas.DataFrame
        Must contain a numeric 'price_volatility' column.
    low_threshold : float, default 0.15
        Capped volatilities strictly below this value are labelled 'Low'.
    high_threshold : float, default 0.30
        Capped volatilities strictly above this value are labelled 'High'.
    cap_quantile : float, default 0.99
        Quantile of 'price_volatility' used as the upper clipping bound.

    Returns
    -------
    pandas.DataFrame
        The input object itself, with the two columns described above.

    Raises
    ------
    ValueError
        If ``low_threshold`` is greater than ``high_threshold``.

    Notes
    -----
    Rows whose volatility is NaN fail both comparisons and therefore keep the
    default label 'Medium'.
    """
    if low_threshold > high_threshold:
        raise ValueError(
            f"low_threshold ({low_threshold}) must not exceed high_threshold ({high_threshold})"
        )

    # First cap extreme values at the requested quantile of the observed volatilities
    cap_value = volatility_df['price_volatility'].quantile(cap_quantile)
    volatility_df['price_volatility_capped'] = volatility_df['price_volatility'].clip(upper=cap_value)
    capped = volatility_df['price_volatility_capped']

    # Everything starts as 'Medium'; values on a threshold exactly remain 'Medium'
    # because both comparisons are strict.
    volatility_df['volatility_category'] = 'Medium'
    volatility_df.loc[capped < low_threshold, 'volatility_category'] = 'Low'
    volatility_df.loc[capped > high_threshold, 'volatility_category'] = 'High'

    return volatility_df

# classify_stock_volatility adds its columns to the frame it receives and returns that
# same frame, so volatility_df_2 and volatility_df refer to one and the same object here.
volatility_df_2 = classify_stock_volatility(volatility_df)



100%|████████████████████████████████████████████████████████████████| 2673/2673 [00:01<00:00, 1455.13it/s]


In [109]:
# Number of companies per volatility category in volatility_df_2
volatility_df_2["volatility_category"].value_counts()

volatility_category
High      1985
Medium     650
Low         38
Name: count, dtype: int64