# Baseline Regression Models

In [1]:
import numpy as np # Use version 1.x not 2.x
import random
import pandas as pd

# One seed shared by every stochastic component configured in this notebook.
RANDOM_SEED = 42

# Seed both global generators used here: Python's `random` module and
# NumPy's legacy global RNG.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [2]:
import pickle

def load_data(file_path):
    """Read a pickled nested mapping and flatten it into one row per sample.

    The pickle is expected to hold ``{company: {period: (features, rating,
    normed_rating)}}``. ``features`` is iterated element by element and every
    element, as well as ``rating`` and ``normed_rating``, is converted to a
    Python scalar through its ``.item()`` method.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``company``, ``period``,
        ``feature_0`` .. ``feature_{n-1}``, ``rating`` and ``normed_rating``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # files produced by this project.
    with open(file_path, 'rb') as handle:
        nested = pickle.load(handle)

    rows = []
    for company in nested:
        for period, sample in nested[company].items():
            feature_values, rating, normed_rating = sample

            # Insertion order fixes the column order of the resulting frame.
            row = {'company': company, 'period': period}
            for idx, value in enumerate(feature_values):
                row[f'feature_{idx}'] = value.item()
            row['rating'] = rating.item()
            row['normed_rating'] = normed_rating.item()

            rows.append(row)

    return pd.DataFrame(rows)

### Ret

In [3]:
# Dataset variant tag: interpolated into the dataset file name (dataset_{suffix}.pkl)
# and forwarded to utils.spilt_train_valid in the next cell.
suffix = "Ret"

In [4]:
from sklearn.preprocessing import StandardScaler
import importlib
import os
import Hypers
importlib.reload(Hypers)
import utils
importlib.reload(utils)

# Load the complete dataset for the selected variant and regenerate its split.
# The return values are discarded; with save=True the helper presumably writes
# the train/test pickles that are read back below (TODO: confirm in utils).
merged_dict = utils.load_pickle(os.path.join(Hypers.Config.data_path, f"dataset_{suffix}.pkl"))
_, _ = utils.spilt_train_valid(merged_dict, random_select=True, save=True, suffix=suffix)

# Derive the split file names from `suffix` instead of repeating the literal,
# so the files loaded here always match the variant that was just split.
train_df = load_data(f'./data/train_dict_{suffix}.pkl')
test_df = load_data(f'./data/test_dict_{suffix}.pkl')

# Identifier and target columns; everything else is treated as a model feature.
non_feature_cols = ['company', 'period', 'rating', 'normed_rating']

X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df['normed_rating']  # regression target
labels_train = train_df['rating']    # rating column, compared against rounded predictions
X_test = test_df.drop(columns=non_feature_cols)
y_test = test_df['normed_rating']
labels_test = test_df['rating']

# Scale features: statistics are fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
# Summary statistics of the training frame's numeric columns (features and both rating columns).
train_df.describe()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_144,feature_145,feature_146,feature_147,feature_148,feature_149,feature_150,feature_151,rating,normed_rating
count,6079.0,6079.0,6079.0,6079.0,6079.0,6079.0,6079.0,6079.0,6079.0,6079.0,...,6079.0,6079.0,6079.0,6079.0,6079.0,6079.0,6079.0,6079.0,6079.0,6079.0
mean,3.187263,3.672922,3.32855,3.678313,3.535753,3.178305,4.632567,3.20021,3.447581,3.510239,...,0.922653,0.28527,-0.128237,0.454483,0.217197,-0.19516,0.121779,231.538452,3.437243,0.312477
std,9.122091,10.115408,9.168132,11.502782,11.037309,8.630302,14.84191,7.309015,9.044514,9.708005,...,23.838283,3.215718,9.729554,9.677379,2.809457,47.856759,1.573591,7.286771,1.173064,0.106642
min,-0.301897,-0.266775,-0.288342,-0.29574,-0.291449,-0.222196,-0.281195,-0.187304,-0.22462,-48.201126,...,-14.857426,-8.556656,-210.371216,-21.627665,-6.615319,-2797.750244,-6.860989,215.350998,0.0,0.0
25%,0.045383,0.165266,0.138738,0.0,0.110582,0.0,0.117887,0.196239,0.193609,0.0,...,-0.321555,-0.264877,-0.455649,-0.27637,-0.341851,-0.652883,-0.388726,226.421005,3.0,0.272727
50%,0.881459,0.879826,0.908394,0.509373,0.769879,0.830885,0.898819,0.933667,0.941483,0.748759,...,-0.003799,-0.004444,0.019302,0.02862,-0.007522,-0.058273,-0.002089,233.546005,3.0,0.272727
75%,2.406673,2.694911,2.579666,2.902219,2.623788,2.706648,3.098006,2.705751,2.756163,2.922812,...,0.379047,0.358832,0.373275,0.471599,0.440151,0.489981,0.474075,237.432999,4.0,0.363636
max,124.559494,123.289658,119.041771,169.197495,154.96048,124.026894,197.495544,87.117142,114.921547,153.907394,...,1789.264038,192.869949,245.542419,618.762756,80.971764,1347.325928,30.387802,242.839005,10.0,0.909091


In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, accuracy_score
import Hypers

# Baseline regressors, keyed by the display name used in the results table.
# Seeded wherever the estimator accepts a random_state.
models = {
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_SEED),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED),
    'Linear Regression': LinearRegression(),
    'k-NN (k=1)': KNeighborsRegressor(n_neighbors=1),
    'SVR': SVR(),
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_SEED),
    'LightGBM': lgb.LGBMRegressor(random_state=RANDOM_SEED, verbose=-1)
}

# Highest valid category index. It is used both to rescale the normalised
# prediction and as the clipping bound, so the two can no longer disagree
# (the upper bound used to be a hard-coded 23).
max_label = len(Hypers.rating_to_category) - 1

print('Train-Test Split Regression Results')
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Rescale to category indices, round to the nearest one and clamp to the
    # valid range so out-of-range regressions map to the boundary categories.
    y_pred_rounded = np.round(y_pred * max_label).astype(int)
    y_pred_rounded = np.clip(y_pred_rounded, 0, max_label)

    # MSE is measured on the normalised target, accuracy on the rating labels.
    mse = mean_squared_error(y_test, y_pred)
    accuracy = accuracy_score(labels_test, y_pred_rounded)

    print(f'{model_name:20}: Accuracy={accuracy:.4f}  MSE={mse:.4f}')

Train-Test Split Regression Results
Decision Tree       : Accuracy=0.4015  MSE=0.0098
Random Forest       : Accuracy=0.4992  MSE=0.0059
Linear Regression   : Accuracy=0.4587  MSE=0.0066
k-NN (k=1)          : Accuracy=0.3484  MSE=0.0113
SVR                 : Accuracy=0.5244  MSE=0.0058
XGBoost             : Accuracy=0.4929  MSE=0.0065
LightGBM            : Accuracy=0.5102  MSE=0.0057


### RetInd

In [7]:
# Dataset variant tag: interpolated into the dataset file name (dataset_{suffix}.pkl)
# and forwarded to utils.spilt_train_valid in the next cell.
suffix = "RetInd"

In [8]:
# Load the complete dataset for the selected variant and regenerate its split.
# The return values are discarded; with save=True the helper presumably writes
# the train/test pickles that are read back below (TODO: confirm in utils).
merged_dict = utils.load_pickle(os.path.join(Hypers.Config.data_path, f"dataset_{suffix}.pkl"))
_, _ = utils.spilt_train_valid(merged_dict, random_select=True, save=True, suffix=suffix)

# Derive the split file names from `suffix` instead of repeating the literal,
# so the files loaded here always match the variant that was just split.
train_df = load_data(f'./data/train_dict_{suffix}.pkl')
test_df = load_data(f'./data/test_dict_{suffix}.pkl')

# Identifier and target columns; everything else is treated as a model feature.
non_feature_cols = ['company', 'period', 'rating', 'normed_rating']

X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df['normed_rating']  # regression target
labels_train = train_df['rating']    # rating column, compared against rounded predictions
X_test = test_df.drop(columns=non_feature_cols)
y_test = test_df['normed_rating']
labels_test = test_df['rating']

# Scale features: statistics are fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Highest valid category index. It is used both to rescale the normalised
# prediction and as the clipping bound, so the two can no longer disagree
# (the upper bound used to be a hard-coded 23).
max_label = len(Hypers.rating_to_category) - 1

print('Train-Test Split Regression Results')
for model_name, model in models.items():
    # Refit each baseline on the current variant's training split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Rescale to category indices, round to the nearest one and clamp to the
    # valid range so out-of-range regressions map to the boundary categories.
    y_pred_rounded = np.round(y_pred * max_label).astype(int)
    y_pred_rounded = np.clip(y_pred_rounded, 0, max_label)

    # MSE is measured on the normalised target, accuracy on the rating labels.
    mse = mean_squared_error(y_test, y_pred)
    accuracy = accuracy_score(labels_test, y_pred_rounded)

    print(f'{model_name:20}: Accuracy={accuracy:.4f}  MSE={mse:.4f}')

Train-Test Split Regression Results
Decision Tree       : Accuracy=0.4647  MSE=0.0092
Random Forest       : Accuracy=0.5684  MSE=0.0056
Linear Regression   : Accuracy=0.4593  MSE=0.0145
k-NN (k=1)          : Accuracy=0.3725  MSE=0.0114
SVR                 : Accuracy=0.4885  MSE=0.0066
XGBoost             : Accuracy=0.5169  MSE=0.0060
LightGBM            : Accuracy=0.5952  MSE=0.0053


### US

In [10]:
# Dataset variant tag: interpolated into the dataset file name (dataset_{suffix}.pkl)
# and forwarded to utils.spilt_train_valid in the next cell.
suffix = "US"

In [11]:
# Load the complete dataset for the selected variant and regenerate its split.
# The return values are discarded; with save=True the helper presumably writes
# the train/test pickles that are read back below (TODO: confirm in utils).
merged_dict = utils.load_pickle(os.path.join(Hypers.Config.data_path, f"dataset_{suffix}.pkl"))
_, _ = utils.spilt_train_valid(merged_dict, random_select=True, save=True, suffix=suffix)

# Derive the split file names from `suffix` instead of repeating the literal,
# so the files loaded here always match the variant that was just split.
train_df = load_data(f'./data/train_dict_{suffix}.pkl')
test_df = load_data(f'./data/test_dict_{suffix}.pkl')

# Identifier and target columns; everything else is treated as a model feature.
non_feature_cols = ['company', 'period', 'rating', 'normed_rating']

X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df['normed_rating']  # regression target
labels_train = train_df['rating']    # rating column, compared against rounded predictions
X_test = test_df.drop(columns=non_feature_cols)
y_test = test_df['normed_rating']
labels_test = test_df['rating']

# Scale features: statistics are fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Highest valid category index. It is used both to rescale the normalised
# prediction and as the clipping bound, so the two can no longer disagree
# (the upper bound used to be a hard-coded 23).
max_label = len(Hypers.rating_to_category) - 1

print('Train-Test Split Regression Results')
for model_name, model in models.items():
    # Refit each baseline on the current variant's training split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Rescale to category indices, round to the nearest one and clamp to the
    # valid range so out-of-range regressions map to the boundary categories.
    y_pred_rounded = np.round(y_pred * max_label).astype(int)
    y_pred_rounded = np.clip(y_pred_rounded, 0, max_label)

    # MSE is measured on the normalised target, accuracy on the rating labels.
    mse = mean_squared_error(y_test, y_pred)
    accuracy = accuracy_score(labels_test, y_pred_rounded)

    print(f'{model_name:20}: Accuracy={accuracy:.4f}  MSE={mse:.4f}')

Train-Test Split Regression Results
Decision Tree       : Accuracy=0.4646  MSE=0.0104
Random Forest       : Accuracy=0.5497  MSE=0.0046
Linear Regression   : Accuracy=0.4473  MSE=0.0070
k-NN (k=1)          : Accuracy=0.3757  MSE=0.0121
SVR                 : Accuracy=0.4803  MSE=0.0061
XGBoost             : Accuracy=0.5292  MSE=0.0053
LightGBM            : Accuracy=0.5573  MSE=0.0047
