# Baseline Regression Models

In [1]:
import numpy as np # Use version 1.x not 2.x
import random
import pandas as pd

# Single seed for the notebook; it is also passed as `random_state` to the
# estimators that accept one.
RANDOM_SEED = 42

# Seed Python's built-in RNG and NumPy's legacy global RNG.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [2]:
import pickle

def load_data(file_path):
    """Flatten a pickled ``company -> period -> sample`` mapping into a DataFrame.

    The pickle is expected to hold a dict keyed by company whose values are
    dicts keyed by period; each leaf is a 3-tuple
    ``(features, rating, normed_rating)``. Every element of ``features`` as
    well as ``rating`` and ``normed_rating`` must provide ``.item()``, which
    is used to convert them to plain Python scalars.

    Args:
        file_path: Path to the pickle file to read.

    Returns:
        A ``pandas.DataFrame`` with one row per (company, period) pair and the
        columns ``company``, ``period``, ``feature_0 .. feature_{n-1}``,
        ``rating`` and ``normed_rating`` (in that order).
    """
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(file_path, 'rb') as handle:
        nested = pickle.load(handle)

    rows = []
    for company, periods in nested.items():
        for period, sample in periods.items():
            features, rating, normed_rating = sample
            row = {'company': company, 'period': period}
            # One column per feature position, named by its index.
            for idx, value in enumerate(features):
                row[f'feature_{idx}'] = value.item()
            row['rating'] = rating.item()
            row['normed_rating'] = normed_rating.item()
            rows.append(row)

    return pd.DataFrame(rows)

### Ret

In [3]:
# Dataset variant tag: interpolated into the `dataset_{suffix}.pkl` filename and
# passed to `utils.spilt_train_valid` in the next cell.
suffix = "Ret"

In [4]:
from sklearn.preprocessing import StandardScaler
import importlib
import os
import Hypers
importlib.reload(Hypers)
import utils
importlib.reload(utils)

# Load the full dataset for the current `suffix` and hand it to the project's
# split helper. NOTE(review): presumably `save=True` writes the train/test
# pickles read below under ./data — confirm against utils.spilt_train_valid.
merged_dict = utils.load_pickle(os.path.join(Hypers.Config.data_path, f"dataset_{suffix}.pkl"))
_, _ = utils.spilt_train_valid(merged_dict, random_select=True, save=True, suffix=suffix)

# Build the split file names from `suffix` so this cell always reads the files
# belonging to the dataset it has just split, instead of a hard-coded variant.
train_df = load_data(f'./data/train_dict_{suffix}.pkl')
test_df = load_data(f'./data/test_dict_{suffix}.pkl')

# Columns that identify a sample or hold a target; everything else is a feature.
non_feature_columns = ['company', 'period', 'rating', 'normed_rating']

X_train = train_df.drop(columns=non_feature_columns)
y_train = train_df['normed_rating']   # regression target
labels_train = train_df['rating']     # integer label used for accuracy
X_test = test_df.drop(columns=non_feature_columns)
y_test = test_df['normed_rating']
labels_test = test_df['rating']

# Scale features: statistics are fitted on the training split only and then
# re-used for the test split, so no test information leaks into the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
# Summary statistics of the training frame (feature columns plus `rating` and
# `normed_rating`), shown before any scaling is applied to the frame itself.
train_df.describe()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_144,feature_145,feature_146,feature_147,feature_148,feature_149,feature_150,feature_151,rating,normed_rating
count,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,...,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0
mean,0.594831,0.57702,0.591411,0.653195,0.774548,0.536094,0.621724,0.500771,0.481432,0.436503,...,0.554901,0.17035,0.458713,0.214132,0.106362,-0.048175,0.051097,231.042954,3.160458,0.287314
std,1.038298,1.113754,1.12472,1.759639,1.492708,0.873233,1.129207,1.005302,1.020033,1.389351,...,2.740273,1.290153,8.675437,2.164138,1.183531,2.015263,1.433077,7.451931,1.030405,0.093673
min,-0.451154,-0.33723,-0.44434,-0.447085,-0.444327,-0.467565,-0.39311,-0.345999,-0.4857,-11.514503,...,-6.829387,-4.55015,-86.946945,-8.108098,-2.701822,-15.393591,-3.806167,216.177002,1.0,0.090909
25%,-0.084466,-0.079193,-0.127887,-0.345208,-0.17848,-0.074121,-0.090574,-0.135876,-0.2017,-0.291585,...,-0.256378,-0.447852,-0.47064,-0.441947,-0.404765,-0.540801,-0.469965,226.421005,2.0,0.181818
50%,0.099123,0.035303,0.081214,0.016288,0.070168,0.146916,0.118784,0.082683,0.062017,0.0,...,-0.002702,-0.02723,0.0,-0.058788,0.001524,-0.057903,0.006696,233.546005,3.0,0.272727
75%,1.207945,0.784976,0.953786,0.81914,1.020126,1.045798,1.124965,0.601573,0.812106,0.683331,...,0.654815,0.571663,0.512019,0.461521,0.492112,0.449534,0.521937,237.432999,4.0,0.363636
max,6.246325,5.965914,5.264041,15.109673,7.858496,3.11911,7.442028,4.626092,5.416079,8.654183,...,26.717739,8.815295,130.894943,46.763088,7.141855,32.851486,10.174973,242.839005,6.0,0.545455


In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, accuracy_score
import Hypers

# Baseline regressors keyed by the display name used in the results table.
# `random_state` is set on every estimator constructed here that takes one.
models = {
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_SEED),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED),
    'Linear Regression': LinearRegression(),
    'k-NN (k=1)': KNeighborsRegressor(n_neighbors=1),
    'SVR': SVR(),
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_SEED),
    'LightGBM': lgb.LGBMRegressor(random_state=RANDOM_SEED, verbose=-1)
}

# Factor that maps a normalised prediction back onto the integer rating scale.
# It is also used as the clip bound so the two can never disagree.
# NOTE(review): assumes rating labels are indices 0..len(rating_to_category)-1
# and normed_rating = rating / max_label — confirm against Hypers / utils.
max_label = len(Hypers.rating_to_category) - 1

print('Train-Test Split Regression Results')
for model_name, model in models.items():
    # fit() retrains the estimator from scratch on the current split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Clip while still in floating point, then cast: casting an out-of-range
    # float to int first is not well defined and can wrap around.
    y_pred_rounded = np.clip(np.round(y_pred * max_label), 0, max_label).astype(int)

    # MSE is measured on the normalised target, accuracy on the integer labels.
    mse = mean_squared_error(y_test, y_pred)
    accuracy = accuracy_score(labels_test, y_pred_rounded)

    print(f'{model_name:20}: Accuracy={accuracy:.4f}  MSE={mse:.4f}')

Train-Test Split Regression Results
Decision Tree       : Accuracy=0.3757  MSE=0.0138
Random Forest       : Accuracy=0.1850  MSE=0.0114
Linear Regression   : Accuracy=0.4162  MSE=1556506.3367
k-NN (k=1)          : Accuracy=0.4682  MSE=0.0070
SVR                 : Accuracy=0.4451  MSE=0.0101
XGBoost             : Accuracy=0.1445  MSE=0.0124
LightGBM            : Accuracy=0.2312  MSE=0.0120


### RetInd

In [7]:
# Dataset variant tag: interpolated into the `dataset_{suffix}.pkl` filename and
# passed to `utils.spilt_train_valid` in the next cell.
suffix = "RetInd"

In [8]:
# Load the full dataset for the current `suffix` and hand it to the project's
# split helper. NOTE(review): presumably `save=True` writes the train/test
# pickles read below under ./data — confirm against utils.spilt_train_valid.
merged_dict = utils.load_pickle(os.path.join(Hypers.Config.data_path, f"dataset_{suffix}.pkl"))
_, _ = utils.spilt_train_valid(merged_dict, random_select=True, save=True, suffix=suffix)

# Build the split file names from `suffix` so this cell always reads the files
# belonging to the dataset it has just split, instead of a hard-coded variant.
train_df = load_data(f'./data/train_dict_{suffix}.pkl')
test_df = load_data(f'./data/test_dict_{suffix}.pkl')

# Columns that identify a sample or hold a target; everything else is a feature.
non_feature_columns = ['company', 'period', 'rating', 'normed_rating']

X_train = train_df.drop(columns=non_feature_columns)
y_train = train_df['normed_rating']   # regression target
labels_train = train_df['rating']     # integer label used for accuracy
X_test = test_df.drop(columns=non_feature_columns)
y_test = test_df['normed_rating']
labels_test = test_df['rating']

# Scale features: statistics are fitted on the training split only and then
# re-used for the test split, so no test information leaks into the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Factor that maps a normalised prediction back onto the integer rating scale.
# It is also used as the clip bound so the two can never disagree.
# NOTE(review): assumes rating labels are indices 0..len(rating_to_category)-1
# and normed_rating = rating / max_label — confirm against Hypers / utils.
max_label = len(Hypers.rating_to_category) - 1

print('Train-Test Split Regression Results')
for model_name, model in models.items():
    # The estimators in `models` are re-used; fit() retrains them on this split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Clip while still in floating point, then cast: casting an out-of-range
    # float to int first is not well defined and can wrap around.
    y_pred_rounded = np.clip(np.round(y_pred * max_label), 0, max_label).astype(int)

    # MSE is measured on the normalised target, accuracy on the integer labels.
    mse = mean_squared_error(y_test, y_pred)
    accuracy = accuracy_score(labels_test, y_pred_rounded)

    print(f'{model_name:20}: Accuracy={accuracy:.4f}  MSE={mse:.4f}')

Train-Test Split Regression Results
Decision Tree       : Accuracy=0.4315  MSE=0.0102
Random Forest       : Accuracy=0.5136  MSE=0.0054
Linear Regression   : Accuracy=0.4640  MSE=0.0161
k-NN (k=1)          : Accuracy=0.3777  MSE=0.0135
SVR                 : Accuracy=0.5013  MSE=0.0075
XGBoost             : Accuracy=0.5408  MSE=0.0052
LightGBM            : Accuracy=0.5583  MSE=0.0048


### US

In [10]:
# Dataset variant tag: interpolated into the `dataset_{suffix}.pkl` filename and
# passed to `utils.spilt_train_valid` in the next cell.
suffix = "US"

In [11]:
# Load the full dataset for the current `suffix` and hand it to the project's
# split helper. NOTE(review): presumably `save=True` writes the train/test
# pickles read below under ./data — confirm against utils.spilt_train_valid.
merged_dict = utils.load_pickle(os.path.join(Hypers.Config.data_path, f"dataset_{suffix}.pkl"))
_, _ = utils.spilt_train_valid(merged_dict, random_select=True, save=True, suffix=suffix)

# Build the split file names from `suffix` so this cell always reads the files
# belonging to the dataset it has just split, instead of a hard-coded variant.
train_df = load_data(f'./data/train_dict_{suffix}.pkl')
test_df = load_data(f'./data/test_dict_{suffix}.pkl')

# Columns that identify a sample or hold a target; everything else is a feature.
non_feature_columns = ['company', 'period', 'rating', 'normed_rating']

X_train = train_df.drop(columns=non_feature_columns)
y_train = train_df['normed_rating']   # regression target
labels_train = train_df['rating']     # integer label used for accuracy
X_test = test_df.drop(columns=non_feature_columns)
y_test = test_df['normed_rating']
labels_test = test_df['rating']

# Scale features: statistics are fitted on the training split only and then
# re-used for the test split, so no test information leaks into the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Factor that maps a normalised prediction back onto the integer rating scale.
# It is also used as the clip bound so the two can never disagree.
# NOTE(review): assumes rating labels are indices 0..len(rating_to_category)-1
# and normed_rating = rating / max_label — confirm against Hypers / utils.
max_label = len(Hypers.rating_to_category) - 1

print('Train-Test Split Regression Results')
for model_name, model in models.items():
    # The estimators in `models` are re-used; fit() retrains them on this split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Clip while still in floating point, then cast: casting an out-of-range
    # float to int first is not well defined and can wrap around.
    y_pred_rounded = np.clip(np.round(y_pred * max_label), 0, max_label).astype(int)

    # MSE is measured on the normalised target, accuracy on the integer labels.
    mse = mean_squared_error(y_test, y_pred)
    accuracy = accuracy_score(labels_test, y_pred_rounded)

    print(f'{model_name:20}: Accuracy={accuracy:.4f}  MSE={mse:.4f}')

Train-Test Split Regression Results
Decision Tree       : Accuracy=0.4627  MSE=0.0126
Random Forest       : Accuracy=0.5577  MSE=0.0080
Linear Regression   : Accuracy=0.3959  MSE=0.0146
k-NN (k=1)          : Accuracy=0.3701  MSE=0.0155
SVR                 : Accuracy=0.4492  MSE=0.0112
XGBoost             : Accuracy=0.5220  MSE=0.0076
LightGBM            : Accuracy=0.5565  MSE=0.0074
