# Baseline Regression Models

In [14]:
import numpy as np # Use version 1.x not 2.x
import pandas as pd

# Seed handed to the estimators below as `random_state` so repeated fits give the same results.
RANDOM_SEED = 42

In [20]:
import pickle

def load_data(file_path):
    """Load a pickled nested dataset and flatten it to one row per (company, period).

    The pickle is expected to contain a mapping of the form
    ``{company: {period: (features, rating, normed_rating)}}`` in which every
    element of ``features`` and both rating values expose ``.item()``.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        pandas.DataFrame with the columns ``company``, ``period``,
        ``feature_0`` ... ``feature_{n-1}``, ``rating`` and ``normed_rating``
        (in that order).

    Note:
        ``pickle.load`` can execute arbitrary code embedded in the file, so
        only call this on files you produced yourself.
    """
    with open(file_path, 'rb') as handle:
        nested = pickle.load(handle)

    def _to_row(company, period, sample):
        # Unpack one stored sample and convert every value to a plain Python scalar.
        features, rating, normed_rating = sample
        row = {'company': company, 'period': period}
        for idx, value in enumerate(features):
            row[f'feature_{idx}'] = value.item()
        row['rating'] = rating.item()
        row['normed_rating'] = normed_rating.item()
        return row

    rows = [
        _to_row(company, period, sample)
        for company, periods in nested.items()
        for period, sample in periods.items()
    ]
    return pd.DataFrame(rows)

### Ret

In [21]:
# Dataset tag for this section; it is interpolated into every input and output file name in the cells that follow.
suffix = "Ret"

In [22]:
import importlib
import CompustatExtractor
# Re-import the module so its current source is used.
importlib.reload(CompustatExtractor)
# NOTE: this rebinds the name `CompustatExtractor` from the module to the attribute of the same name inside it.
from CompustatExtractor import CompustatExtractor
import os
import Hypers

# Process the feature CSV for the current `suffix`.
# save=False is passed, so `filestem` is presumably unused here — confirm in CompustatExtractor.
features = CompustatExtractor.process_compustat_features2(
	os.path.join(Hypers.Config.data_path, "WRDS", f"features_{suffix}.csv"),
	save=False,
	filestem=f"features_{suffix}_1",
	add_cpi=False
)

# Process the matching ratings CSV for the same `suffix`.
ratings = CompustatExtractor.process_compustat_ratings(
	os.path.join(Hypers.Config.data_path, "WRDS", f"ratings_{suffix}.csv"),
	save=False,
	filestem=f"ratings_{suffix}_1"
)

# Combine features and ratings; save=True with filestem "dataset_<suffix>_1".
# The next cell reads "dataset_<suffix>_1.pkl" from Hypers.Config.data_path — presumably the file written here.
merged_dict = CompustatExtractor.merge_input_output_dicts(
	features,
	ratings,
	save=True,
	filestem=f"dataset_{suffix}_1"
)

input_dict: 49
output_dict: 37
merged_dict: 37


In [23]:
import utils
# Re-import utils so its current source is used.
importlib.reload(utils)

# Read the merged dataset pickle for this suffix from Hypers.Config.data_path.
merged_dict = utils.load_pickle(os.path.join(Hypers.Config.data_path, f"dataset_{suffix}_1.pkl"))
# Both return values are discarded, so this call is made for its save=True effect.
# NOTE(review): random_select=True is not given RANDOM_SEED here — confirm the split is seeded inside utils,
# otherwise the train/test files (and every metric below) change between runs.
_, _ = utils.spilt_train_valid(merged_dict, random_select=True, save=True, suffix=(suffix + "_1"))

In [24]:
from sklearn.preprocessing import StandardScaler
import importlib
import os
import Hypers
importlib.reload(Hypers)

# Read the train/test pickles for the current `suffix` into flat DataFrames.
train_df = load_data(f'./data/train_dict_{suffix}_1.pkl')
test_df = load_data(f'./data/test_dict_{suffix}_1.pkl')

# Regression target (normalised rating) and the raw rating kept as the class label.
y_train, labels_train = train_df['normed_rating'], train_df['rating']
y_test, labels_test = test_df['normed_rating'], test_df['rating']

# Model inputs: every column except the identifiers and the two rating columns.
X_train = train_df.drop(['company', 'period', 'rating', 'normed_rating'], axis='columns')
X_test = test_df.drop(['company', 'period', 'rating', 'normed_rating'], axis='columns')


In [25]:
# Summary statistics of the numeric training columns (unscaled features and both rating columns).
train_df.describe()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,rating,normed_rating
count,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0
mean,7974.957674,12537.306597,20512.264269,1651.452361,8293.912846,5055.16361,7302.773576,6156.93123,13459.169889,387.379591,10253.399661,11439.442905,7040.365607,2220.053211,3.142466,0.285679
std,11790.918996,26425.13554,37823.52781,2803.746359,16578.767081,8264.905577,13278.726847,10318.598069,23270.499122,804.159056,22138.886139,22460.420172,14816.228569,4380.39328,1.090666,0.099151
min,146.431,285.28299,763.768005,1.664,70.504997,83.723999,116.002998,297.992004,556.700012,-2640.0,130.647003,169.832993,-1895.224976,92.889,1.0,0.090909
25%,2076.648743,1606.512207,3669.390259,104.910503,1181.624023,1252.358032,1387.672485,897.888504,2380.476685,61.49525,1172.849213,1947.826477,1026.92926,375.636497,2.0,0.181818
50%,3497.637451,3694.300049,7932.032471,682.5,2373.17749,2529.595947,2737.0,2329.400024,5876.873047,156.984497,2301.135498,3371.449951,2287.150024,740.002502,3.0,0.272727
75%,10108.5,12058.100098,23234.75,1638.5,8654.75,5547.5,8978.0,5760.0,14299.0,372.031494,9009.75,13055.25,6391.75,2506.0,4.0,0.363636
max,67142.0,144353.0,209876.0,25981.0,96823.0,51501.0,77021.0,56605.0,129862.0,6056.0,118723.0,131565.0,85937.0,25993.0,6.0,0.545455


In [26]:
# Standardise the features: statistics are learned from the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, accuracy_score
import Hypers

# Baseline regressors, keyed by the display name used in the result tables.
# The same estimator instances are re-fitted for every dataset section further down.
# Estimators that accept a random_state receive RANDOM_SEED.
models = {
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_SEED),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED),
    'Linear Regression': LinearRegression(),
    'k-NN (k=1)': KNeighborsRegressor(n_neighbors=1),
    'SVR': SVR(),
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_SEED),
    'LightGBM': lgb.LGBMRegressor(random_state=RANDOM_SEED, verbose=-1)
}

# Largest rating index on the integer scale. It is both the factor that maps a
# normalised prediction back to a rating and the upper clipping bound, so the
# two can no longer drift apart (the bound used to be a hard-coded 23).
max_rating = len(Hypers.rating_to_category) - 1

print('Train-Test Split Regression Results')
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Rescale to the rating scale and round to the nearest rating. Clip BEFORE
    # the integer cast so an extreme or non-finite prediction cannot overflow
    # in astype(int).
    y_pred_rounded = np.clip(np.round(y_pred * max_rating), 0, max_rating).astype(int)

    # MSE is measured on the normalised target; accuracy on the integer ratings.
    mse = mean_squared_error(y_test, y_pred)
    accuracy = accuracy_score(labels_test, y_pred_rounded)

    print(f'{model_name:20}: Accuracy={accuracy:.4f}  MSE={mse:.4f}')

Train-Test Split Regression Results
Decision Tree       : Accuracy=0.3050  MSE=0.0140
Random Forest       : Accuracy=0.3546  MSE=0.0085
Linear Regression   : Accuracy=0.3830  MSE=0.0074
k-NN (k=1)          : Accuracy=0.2837  MSE=0.0100
SVR                 : Accuracy=0.4184  MSE=0.0060
XGBoost             : Accuracy=0.3404  MSE=0.0096
LightGBM            : Accuracy=0.4610  MSE=0.0056


### RetInd

In [30]:
# Dataset tag for this section; it is interpolated into every input and output file name in the cells that follow.
suffix = "RetInd"

In [35]:
# Relies on `CompustatExtractor`, `os` and `Hypers` already being imported by an earlier cell.
# Process the feature CSV for the current `suffix`.
# save=False is passed, so `filestem` is presumably unused here — confirm in CompustatExtractor.
features = CompustatExtractor.process_compustat_features2(
	os.path.join(Hypers.Config.data_path, "WRDS", f"features_{suffix}.csv"),
	save=False,
	filestem=f"features_{suffix}_1",
	add_cpi=False
)

# Process the matching ratings CSV for the same `suffix`.
ratings = CompustatExtractor.process_compustat_ratings(
	os.path.join(Hypers.Config.data_path, "WRDS", f"ratings_{suffix}.csv"),
	save=False,
	filestem=f"ratings_{suffix}_1"
)

# Combine features and ratings; save=True with filestem "dataset_<suffix>_1".
# The next cell reads "dataset_<suffix>_1.pkl" from Hypers.Config.data_path — presumably the file written here.
merged_dict = CompustatExtractor.merge_input_output_dicts(
	features,
	ratings,
	save=True,
	filestem=f"dataset_{suffix}_1"
)


input_dict: 924
output_dict: 392
merged_dict: 342


In [36]:
import utils
# Re-import utils so its current source is used.
importlib.reload(utils)

# Read the merged dataset pickle for this suffix from Hypers.Config.data_path.
merged_dict = utils.load_pickle(os.path.join(Hypers.Config.data_path, f"dataset_{suffix}_1.pkl"))
# Both return values are discarded, so this call is made for its save=True effect.
# NOTE(review): random_select=True is not given RANDOM_SEED here — confirm the split is seeded inside utils,
# otherwise the train/test files (and every metric below) change between runs.
_, _ = utils.spilt_train_valid(merged_dict, random_select=True, save=True, suffix=(suffix + "_1"))

In [37]:
from sklearn.preprocessing import StandardScaler
import importlib
import os
import Hypers
importlib.reload(Hypers)

# Read the train/test pickles for the current `suffix` into flat DataFrames.
train_df = load_data(f'./data/train_dict_{suffix}_1.pkl')
test_df = load_data(f'./data/test_dict_{suffix}_1.pkl')

# Regression target (normalised rating) and the raw rating kept as the class label.
y_train, labels_train = train_df['normed_rating'], train_df['rating']
y_test, labels_test = test_df['normed_rating'], test_df['rating']

# Model inputs: every column except the identifiers and the two rating columns.
X_train = train_df.drop(['company', 'period', 'rating', 'normed_rating'], axis='columns')
X_test = test_df.drop(['company', 'period', 'rating', 'normed_rating'], axis='columns')

# Standardise the features: statistics are learned from the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [38]:
# Largest rating index on the integer scale. It is both the factor that maps a
# normalised prediction back to a rating and the upper clipping bound, so the
# two can no longer drift apart (the bound used to be a hard-coded 23).
max_rating = len(Hypers.rating_to_category) - 1

print('Train-Test Split Regression Results')
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Rescale to the rating scale and round to the nearest rating. Clip BEFORE
    # the integer cast so an extreme or non-finite prediction cannot overflow
    # in astype(int).
    y_pred_rounded = np.clip(np.round(y_pred * max_rating), 0, max_rating).astype(int)

    # MSE is measured on the normalised target; accuracy on the integer ratings.
    mse = mean_squared_error(y_test, y_pred)
    accuracy = accuracy_score(labels_test, y_pred_rounded)

    print(f'{model_name:20}: Accuracy={accuracy:.4f}  MSE={mse:.4f}')

Train-Test Split Regression Results
Decision Tree       : Accuracy=0.3879  MSE=0.0073
Random Forest       : Accuracy=0.4327  MSE=0.0047
Linear Regression   : Accuracy=0.3730  MSE=0.0064
k-NN (k=1)          : Accuracy=0.4306  MSE=0.0087
SVR                 : Accuracy=0.4932  MSE=0.0042
XGBoost             : Accuracy=0.4192  MSE=0.0052
LightGBM            : Accuracy=0.4470  MSE=0.0044


### US

In [39]:
# Dataset tag for this section; it is interpolated into every input and output file name in the cells that follow.
suffix = "US"

In [16]:
import importlib
import CompustatExtractor
# Re-import the module so its current source is used.
importlib.reload(CompustatExtractor)
# NOTE: this rebinds the name `CompustatExtractor` from the module to the attribute of the same name inside it.
from CompustatExtractor import CompustatExtractor
import os
import Hypers

# Process the feature CSV for the current `suffix`.
# save=False is passed, so `filestem` is presumably unused here — confirm in CompustatExtractor.
features = CompustatExtractor.process_compustat_features2(
	os.path.join(Hypers.Config.data_path, "WRDS", f"features_{suffix}.csv"),
	save=False,
	filestem=f"features_{suffix}_1",
	add_cpi=False
)

# Extra step that only this section performs: the processed features go through concatenate_features with k=1.
# NOTE(review): the meaning of k is not visible here — confirm in CompustatExtractor, and whether the
# other dataset sections intentionally skip this step.
concatenate_features = CompustatExtractor.concatenate_features(features, k=1)

# Process the matching ratings CSV for the same `suffix`.
ratings = CompustatExtractor.process_compustat_ratings(
	os.path.join(Hypers.Config.data_path, "WRDS", f"ratings_{suffix}.csv"),
	save=False,
	filestem=f"ratings_{suffix}_1"
)

# Combine the concatenated features and the ratings; save=True with filestem "dataset_<suffix>_1".
# The next cell reads "dataset_<suffix>_1.pkl" from Hypers.Config.data_path — presumably the file written here.
merged_dict = CompustatExtractor.merge_input_output_dicts(
	concatenate_features,
	ratings,
	save=True,
	filestem=f"dataset_{suffix}_1"
)


input_dict: 2510
output_dict: 1151
merged_dict: 775


In [47]:
import utils
# Re-import utils so its current source is used.
importlib.reload(utils)

# Read the merged dataset pickle for this suffix from Hypers.Config.data_path.
merged_dict = utils.load_pickle(os.path.join(Hypers.Config.data_path, f"dataset_{suffix}_1.pkl"))
# Both return values are discarded, so this call is made for its save=True effect.
# NOTE(review): random_select=True is not given RANDOM_SEED here — confirm the split is seeded inside utils,
# otherwise the train/test files (and every metric below) change between runs.
_, _ = utils.spilt_train_valid(merged_dict, random_select=True, save=True, suffix=(suffix + "_1"))

In [48]:
from sklearn.preprocessing import StandardScaler
import importlib
import os
import Hypers
importlib.reload(Hypers)

# Read the train/test pickles for the current `suffix` into flat DataFrames.
train_df = load_data(f'./data/train_dict_{suffix}_1.pkl')
test_df = load_data(f'./data/test_dict_{suffix}_1.pkl')

# Regression target (normalised rating) and the raw rating kept as the class label.
y_train, labels_train = train_df['normed_rating'], train_df['rating']
y_test, labels_test = test_df['normed_rating'], test_df['rating']

# Model inputs: every column except the identifiers and the two rating columns.
X_train = train_df.drop(['company', 'period', 'rating', 'normed_rating'], axis='columns')
X_test = test_df.drop(['company', 'period', 'rating', 'normed_rating'], axis='columns')

# Standardise the features: statistics are learned from the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [49]:
# Largest rating index on the integer scale. It is both the factor that maps a
# normalised prediction back to a rating and the upper clipping bound, so the
# two can no longer drift apart (the bound used to be a hard-coded 23).
max_rating = len(Hypers.rating_to_category) - 1

print('Train-Test Split Regression Results')
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Rescale to the rating scale and round to the nearest rating. Clip BEFORE
    # the integer cast so an extreme or non-finite prediction cannot overflow
    # in astype(int).
    y_pred_rounded = np.clip(np.round(y_pred * max_rating), 0, max_rating).astype(int)

    # MSE is measured on the normalised target; accuracy on the integer ratings.
    mse = mean_squared_error(y_test, y_pred)
    accuracy = accuracy_score(labels_test, y_pred_rounded)

    print(f'{model_name:20}: Accuracy={accuracy:.4f}  MSE={mse:.4f}')

Train-Test Split Regression Results
Decision Tree       : Accuracy=0.4016  MSE=0.0138
Random Forest       : Accuracy=0.4768  MSE=0.0088
Linear Regression   : Accuracy=0.3499  MSE=0.0107
k-NN (k=1)          : Accuracy=0.4301  MSE=0.0136
SVR                 : Accuracy=0.4310  MSE=0.0090
XGBoost             : Accuracy=0.4630  MSE=0.0091
LightGBM            : Accuracy=0.4947  MSE=0.0090
