# Baseline Regression Models

In [1]:
import numpy as np # Use version 1.x not 2.x
import random
import pandas as pd

# One seed for the whole notebook: it seeds NumPy's global RNG and the stdlib
# `random` module here, and is passed as `random_state` to the estimators and
# the shuffled KFold splitter in later cells.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [2]:
import pickle

def load_data(file_path):
    """Read a pickled nested mapping and flatten it into one row per observation.

    The pickle is expected to hold ``{company: {period: (features, rating)}}``.
    ``features`` is iterated element by element and ``rating`` is a single
    value; ``.item()`` is called on each of them to obtain a plain Python
    scalar.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        pandas.DataFrame with the columns ``company``, ``period``,
        ``feature_0`` ... ``feature_{n-1}`` and ``rating`` (in that order),
        one row per (company, period) pair, in the mapping's iteration order.
    """
    # NOTE(security): unpickling can execute arbitrary code embedded in the
    # file, so this must only ever be pointed at trusted data.
    with open(file_path, 'rb') as handle:
        nested = pickle.load(handle)

    rows = []
    for company, periods in nested.items():
        for period, (features, rating) in periods.items():
            # Identifier columns first, then the features, then the target,
            # so the resulting column order is stable.
            row = {'company': company, 'period': period}
            for idx, value in enumerate(features):
                row[f'feature_{idx}'] = value.item()
            row['rating'] = rating.item()
            rows.append(row)

    return pd.DataFrame(rows)

In [3]:
from sklearn.preprocessing import StandardScaler

# Flatten the two pickled splits into DataFrames.
train_df = load_data('./data/train_dict.pkl')
test_df = load_data('./data/test_dict.pkl')

# Target column, then the feature matrices (everything except the identifier
# columns and the target itself).
y_train, y_test = train_df['rating'], test_df['rating']
X_train, X_test = (
    frame.drop(columns=['company', 'period', 'rating'])
    for frame in (train_df, test_df)
)

# Standardise the features: statistics are estimated on the training rows
# only and then re-used, unchanged, for the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the training frame, i.e. the features and the `rating` target.
train_df.describe()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,rating
count,13419.0,13419.0,13419.0,13419.0,13419.0,13419.0,13419.0,13419.0,13419.0,13419.0,...,13419.0,13419.0,13419.0,13419.0,13419.0,13419.0,13419.0,13419.0,13419.0,13419.0
mean,3.810639,4.23991,3.958005,4.658519,4.193749,4.022636,5.49003,3.910604,4.253997,3.846538,...,-0.03814,-0.917355,-0.078312,-0.083236,0.055194,-0.029182,0.332568,-0.558261,1.797081,9.54706
std,11.431498,11.293457,10.779591,16.127461,13.029752,12.897926,17.574325,8.844879,10.974842,12.296666,...,4.323687,51.917468,0.594742,0.663512,0.680137,129.240364,1.876188,91.322609,7.384274,3.16827
min,-0.308304,-0.266087,-0.294602,-0.289814,-0.2954,-0.205454,-0.277362,-0.203393,-0.226853,-215.762192,...,-271.892731,-4882.29834,-1.141994,-1.142238,-0.532463,-12999.674805,-40.224262,-9938.629883,-0.649376,0.0
25%,0.227744,0.298246,0.302132,0.028129,0.251324,0.096226,0.246125,0.368439,0.365536,0.053331,...,-0.418189,-0.239244,-0.477208,-0.504659,-0.359399,0.006327,-0.201479,-0.250107,-0.154979,7.0
50%,0.946296,1.023283,1.014643,0.719754,0.949407,0.884547,1.06643,1.104624,1.105655,0.937856,...,0.020812,0.160447,-0.19957,-0.172257,-0.130685,0.410631,0.193717,0.177438,0.161311,10.0
75%,2.658684,3.832452,3.462913,2.845509,2.915207,2.80577,3.686272,3.565986,3.72719,3.505346,...,0.471509,0.61343,0.204816,0.191851,0.218999,1.221484,0.748354,0.701961,0.956126,12.0
max,160.789398,152.580627,150.960022,289.723022,199.850311,197.84256,233.93811,100.60276,137.974792,245.519623,...,204.916534,1640.114136,4.421368,6.690714,9.021321,4006.946289,30.412418,697.988953,569.232544,22.0


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, accuracy_score

# Baseline regressors, keyed by the label printed in the result tables.
# Estimators that accept `random_state` are seeded with RANDOM_SEED; the
# others (LinearRegression, k-NN, SVR) are built without one. Any
# hyper-parameter not spelled out here is left at the library default.
# The same estimator instances are re-fitted by every experiment below.
models = {
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_SEED),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED),
    'Linear Regression': LinearRegression(),
    'k-NN (k=1)': KNeighborsRegressor(n_neighbors=1),
    'SVR': SVR(),
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_SEED),
    # verbose=-1: presumably silences LightGBM's training log - confirm.
    'LightGBM': lgb.LGBMRegressor(random_state=RANDOM_SEED, verbose=-1)
}

print('Train-Test Split Regression Results')
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Regression view: squared error on the raw, continuous predictions.
    mse = mean_squared_error(y_test, y_pred)

    # Classification view: snap each prediction to the nearest integer and
    # keep it inside [0, 23] before comparing with the true rating.
    # NOTE(review): 23 is presumably the highest rating class - confirm.
    y_pred_rounded = np.clip(np.round(y_pred).astype(int), 0, 23)
    accuracy = accuracy_score(y_test, y_pred_rounded)

    print(f'{model_name:20}: Accuracy={accuracy:.4f}  MSE={mse:.4f}')

Train-Test Split Regression Results
Decision Tree       : Accuracy=0.1456  MSE=9.0457
Random Forest       : Accuracy=0.2339  MSE=4.8748
Linear Regression   : Accuracy=0.1514  MSE=5.7144
k-NN (k=1)          : Accuracy=0.1397  MSE=8.9096
SVR                 : Accuracy=0.1862  MSE=4.9309
XGBoost             : Accuracy=0.2084  MSE=4.7861
LightGBM            : Accuracy=0.2094  MSE=4.6182


### Predict newer data (20%) from older data (80%)

In [6]:
def split_by_period(df, train_fraction=0.8):
    """Split a frame positionally into a leading (train) and trailing (test) part.

    The split looks only at row position, not at the ``period`` values, so
    the caller must pass rows that are already sorted from oldest to newest
    for the result to be an "older vs. newer" split.

    Args:
        df: DataFrame to split (one company's rows in the notebook).
        train_fraction: Share of rows, between 0 and 1 inclusive, assigned to
            the train part. The row count is truncated towards zero, so very
            short frames may end up with an empty train part.

    Returns:
        ``(train_data, test_data)``: the first ``int(len(df) * train_fraction)``
        rows and the remaining rows, both with their original index.

    Raises:
        ValueError: If ``train_fraction`` lies outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(
            f'train_fraction must be between 0 and 1, got {train_fraction!r}'
        )

    split_index = int(len(df) * train_fraction)
    return df.iloc[:split_index], df.iloc[split_index:]

# Pool the original train/test rows and order them per company by period.
# `period` is sorted as a plain string; this is chronological as long as every
# value follows the 'YYYYQn' pattern seen in the data preview - TODO confirm.
merged_df = pd.concat([train_df, test_df], ignore_index=True)
merged_df = merged_df.sort_values(by=['company', 'period'])

# Split every company separately so each one contributes its oldest rows to
# train and its newest rows to test. The pieces are collected in lists and
# concatenated once: growing a DataFrame with pd.concat inside the loop copies
# all accumulated rows on every iteration, and concatenating onto an empty
# DataFrame is deprecated in recent pandas.
train_parts = []
test_parts = []
for company, group in merged_df.groupby('company'):
    train_data, test_data = split_by_period(group)
    train_parts.append(train_data)
    test_parts.append(test_data)

# ignore_index=True gives both frames a fresh 0..n-1 index.
train_df = pd.concat(train_parts, ignore_index=True)
test_df = pd.concat(test_parts, ignore_index=True)

In [7]:
# Spot-check the chronological split: rows 41-45 of the training frame. In the
# saved output this window covers the last training rows of one company (AAL)
# and the first row of the next one (AAP).
train_df.iloc[41:46]

Unnamed: 0,company,period,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,rating
41,AAL,2020Q2,9.358152,20.610624,16.032759,30.888725,6.761358,3.559145,21.061703,29.293434,...,-8.993368,-14.530337,-0.707366,-0.522562,0.225215,-14.396945,-2.791513,10.533374,0.934059,12.0
42,AAL,2020Q3,8.340997,20.478172,15.584459,26.19026,7.776797,3.513597,19.382196,30.482496,...,-4.068741,-8.808525,-0.722517,-0.550122,0.16598,-8.626437,-3.491277,6.862294,1.184092,12.0
43,AAL,2020Q4,7.467043,20.675615,15.390813,22.217646,7.848329,3.470326,19.364405,30.831976,...,-2.642344,-6.427341,-0.773885,-0.620862,0.061735,-7.137291,-3.521091,4.897902,1.222114,12.0
44,AAL,2021Q1,12.376627,20.530413,17.071869,44.42091,8.754915,3.570533,20.270578,34.958687,...,-3.286706,-3.899826,-0.515904,-0.258778,0.595875,-6.887997,-1.521122,2.209769,1.37526,12.0
45,AAP,2010Q1,1.098972,0.218212,0.505274,0.111622,1.057709,3.769933,1.725678,0.03129,...,0.855467,0.205086,-0.413703,-1.014619,-0.429032,0.493377,0.415742,1.222719,-0.361857,10.0


In [8]:
# First rows of the held-out frame; compare with the training rows shown in
# the previous cell to confirm that a company's test periods come after its
# training periods.
test_df.head()

Unnamed: 0,company,period,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,rating
0,AAL,2021Q2,15.576209,20.224787,18.037573,56.78149,10.118683,3.868877,25.237936,34.572418,...,-0.9266,-0.427103,-0.509899,-0.238318,0.627533,-7.407228,-0.421442,-0.477901,1.509752,12.0
1,AAL,2021Q3,13.153007,19.988266,17.018206,46.27018,11.286515,4.010077,22.174259,33.579163,...,-0.650341,-0.245818,-0.536629,-0.28974,0.542962,-7.24767,-0.295384,-0.820659,1.671445,12.0
2,AAL,2021Q4,11.848042,19.942608,16.519533,40.132053,12.151116,3.882541,22.254913,32.31237,...,-0.729365,-1.547125,-0.606601,-0.384312,0.397845,-7.153681,-1.279906,1.699241,1.922507,12.0
3,AAL,2022Q1,12.147082,20.151567,16.75596,40.177231,12.574086,4.353971,25.647139,32.118938,...,-1.072615,-2.48444,-0.675459,-0.477723,0.277068,-6.182828,-1.881923,2.639538,1.734946,12.0
4,AAL,2022Q2,12.682686,20.068888,16.89822,40.427216,15.150783,4.971155,26.719366,31.608664,...,-0.26714,-0.063466,-0.67459,-0.484475,0.249731,-6.52248,-0.030636,-1.38675,1.87114,12.0


In [9]:
# Rebuild targets and feature matrices from the per-company older/newer split.
y_train, y_test = train_df['rating'], test_df['rating']
X_train, X_test = (
    frame.drop(columns=['company', 'period', 'rating'])
    for frame in (train_df, test_df)
)

# A fresh scaler, fitted on the older (training) rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print('Predict newer data (20%) from older data (80%) \nRegression Results')
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Squared error is computed on the raw, continuous predictions.
    mse = mean_squared_error(y_test, y_pred)

    # Accuracy is computed after rounding to the nearest integer and clipping
    # to [0, 23].
    y_pred_rounded = np.clip(np.round(y_pred).astype(int), 0, 23)
    accuracy = accuracy_score(y_test, y_pred_rounded)

    print(f'{model_name:20}: Accuracy={accuracy:.4f}  MSE={mse:.4f}')

Predict newer data (20%) from older data (80%) 
Regression Results
Decision Tree       : Accuracy=0.4712  MSE=5.5491
Random Forest       : Accuracy=0.3851  MSE=2.3570
Linear Regression   : Accuracy=0.1456  MSE=8.2201
k-NN (k=1)          : Accuracy=0.5981  MSE=4.1042
SVR                 : Accuracy=0.2341  MSE=4.3063
XGBoost             : Accuracy=0.3246  MSE=2.3806
LightGBM            : Accuracy=0.2930  MSE=2.5335


### Predict the most recent ratings from all past data

In [10]:
# Pool all rows again and order them per company by period.
# `period` is sorted as a plain string; this is chronological as long as every
# value follows the 'YYYYQn' pattern seen in the data preview - TODO confirm.
merged_df = pd.concat([train_df, test_df], ignore_index=True)
merged_df = merged_df.sort_values(by=['company', 'period'])

# For every company, hold out its last row and train on everything before it.
# The pieces are collected in lists and concatenated once: growing a DataFrame
# with pd.concat inside the loop copies all accumulated rows on every
# iteration, and concatenating onto an empty DataFrame is deprecated in recent
# pandas.
train_parts = []
test_parts = []
for company, group in merged_df.groupby('company'):
    test_parts.append(group.iloc[-1:])
    train_parts.append(group.iloc[:-1])

# ignore_index=True gives both frames a fresh 0..n-1 index.
train_df = pd.concat(train_parts, ignore_index=True)
test_df = pd.concat(test_parts, ignore_index=True)

In [11]:
# Sanity check: the split above keeps exactly one row (the last one) per
# company in the test frame, so these two numbers should be equal.
print(len(test_df))
print(merged_df['company'].nunique())

346
346


In [12]:
# Rebuild targets and feature matrices from the "last row per company" split.
y_train, y_test = train_df['rating'], test_df['rating']
X_train, X_test = (
    frame.drop(columns=['company', 'period', 'rating'])
    for frame in (train_df, test_df)
)

# A fresh scaler, fitted on the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print('Predict the most recent ratings from all past data \nRegression Results')
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Squared error is computed on the raw, continuous predictions.
    mse = mean_squared_error(y_test, y_pred)

    # Accuracy is computed after rounding to the nearest integer and clipping
    # to [0, 23].
    y_pred_rounded = np.clip(np.round(y_pred).astype(int), 0, 23)
    accuracy = accuracy_score(y_test, y_pred_rounded)

    print(f'{model_name:20}: Accuracy={accuracy:.4f}  MSE={mse:.4f}')

Predict the most recent ratings from all past data 
Regression Results
Decision Tree       : Accuracy=0.6965  MSE=3.1792
Random Forest       : Accuracy=0.5723  MSE=1.2485
Linear Regression   : Accuracy=0.1474  MSE=8.1234
k-NN (k=1)          : Accuracy=0.8035  MSE=2.1387
SVR                 : Accuracy=0.2428  MSE=4.0762
XGBoost             : Accuracy=0.4827  MSE=1.3094
LightGBM            : Accuracy=0.3064  MSE=1.7631


### Appendix: Cross-Validation

#### split by companies (GroupKFold)

In [13]:
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline

# Pool every row again; the folds below are formed from the full data set.
merged_df = pd.concat([train_df, test_df], ignore_index=True)

# The company name is the grouping key, so a company's rows never appear in
# both the training and the validation part of a fold.
groups = merged_df['company']
y = merged_df['rating']
X = merged_df.drop(columns=['company', 'period', 'rating'])

gkf = GroupKFold(n_splits=5)
print('GroupKFold (5-Fold Cross-Validation) Regression Results')

for model_name, model in models.items():
    fold_accuracies, fold_mses = [], []

    for train_index, test_index in gkf.split(X, y, groups):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # The scaler lives inside the pipeline, so it is re-fitted on each
        # fold's training rows only.
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('model', model),
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # MSE on raw predictions; accuracy after rounding and clipping to
        # the [0, 23] range.
        fold_mses.append(mean_squared_error(y_test, y_pred))
        y_pred_rounded = np.clip(np.round(y_pred).astype(int), 0, 23)
        fold_accuracies.append(accuracy_score(y_test, y_pred_rounded))

    # Report the unweighted mean over the folds.
    print(f'{model_name:20}: Accuracy={np.mean(fold_accuracies):.4f}  MSE={np.mean(fold_mses):.4f} ')

GroupKFold (5-Fold Cross-Validation) Regression Results
Decision Tree       : Accuracy=0.1477  MSE=8.5529 
Random Forest       : Accuracy=0.1889  MSE=5.0330 
Linear Regression   : Accuracy=0.1435  MSE=8.5086 
k-NN (k=1)          : Accuracy=0.1464  MSE=8.7746 
SVR                 : Accuracy=0.1840  MSE=5.1296 
XGBoost             : Accuracy=0.1903  MSE=5.1971 
LightGBM            : Accuracy=0.1934  MSE=4.7071 


#### split by companies (StratifiedGroupKFold)

In [14]:
from sklearn.model_selection import StratifiedGroupKFold

# Pool every row again; the folds below are formed from the full data set.
merged_df = pd.concat([train_df, test_df], ignore_index=True)

# The company name is the grouping key, so a company's rows never appear in
# both the training and the validation part of a fold.
groups = merged_df['company']
y = merged_df['rating']
X = merged_df.drop(columns=['company', 'period', 'rating'])

# The fold count is stated explicitly so it is tied to the "5-Fold" label
# printed below (and to the GroupKFold cell) instead of relying on the
# library default.
# NOTE(review): stratification uses `rating` as a class label; this assumes
# the ratings are integer-valued - confirm.
sgkf = StratifiedGroupKFold(n_splits=5)
print('StratifiedGroupKFold (5-Fold Cross-Validation) Regression Results')

for model_name, model in models.items():
    fold_accuracies, fold_mses = [], []

    for train_index, test_index in sgkf.split(X, y, groups):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # The scaler lives inside the pipeline, so it is re-fitted on each
        # fold's training rows only.
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('model', model),
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # MSE on raw predictions; accuracy after rounding and clipping to
        # the [0, 23] range.
        fold_mses.append(mean_squared_error(y_test, y_pred))
        y_pred_rounded = np.clip(np.round(y_pred).astype(int), 0, 23)
        fold_accuracies.append(accuracy_score(y_test, y_pred_rounded))

    # Report the unweighted mean over the folds.
    print(f'{model_name:20}: Accuracy={np.mean(fold_accuracies):.4f}  MSE={np.mean(fold_mses):.4f} ')

StratifiedGroupKFold (5-Fold Cross-Validation) Regression Results




Decision Tree       : Accuracy=0.1604  MSE=8.1060 




Random Forest       : Accuracy=0.1923  MSE=4.8567 
Linear Regression   : Accuracy=0.1373  MSE=9.0528 




k-NN (k=1)          : Accuracy=0.1533  MSE=8.9978 




SVR                 : Accuracy=0.1858  MSE=5.0809 




XGBoost             : Accuracy=0.1875  MSE=5.0734 




LightGBM            : Accuracy=0.2001  MSE=4.5409 


#### Shuffled KFold over individual company-period rows (data leakage? rows of the same company can land in both the training and the test fold)

In [15]:
from sklearn.model_selection import KFold

# Pool every row again; the folds below are formed from the full data set.
merged_df = pd.concat([train_df, test_df], ignore_index=True)

y = merged_df['rating']
X = merged_df.drop(columns=['company', 'period', 'rating'])

# Plain row-level KFold with shuffling: no grouping key is passed, so rows are
# assigned to folds without regard to company or period.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
print('5-Fold Cross-Validation (with shuffle) Regression Results')

for model_name, model in models.items():
    fold_accuracies, fold_mses = [], []

    for train_index, test_index in kf.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # The scaler lives inside the pipeline, so it is re-fitted on each
        # fold's training rows only.
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('model', model),
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # MSE on raw predictions; accuracy after rounding and clipping to
        # the [0, 23] range.
        fold_mses.append(mean_squared_error(y_test, y_pred))
        y_pred_rounded = np.clip(np.round(y_pred).astype(int), 0, 23)
        fold_accuracies.append(accuracy_score(y_test, y_pred_rounded))

    # Report the unweighted mean over the folds.
    print(f'{model_name:20}: Accuracy={np.mean(fold_accuracies):.4f}  MSE={np.mean(fold_mses):.4f} ')

5-Fold Cross-Validation (with shuffle) Regression Results
Decision Tree       : Accuracy=0.7221  MSE=2.2105 
Random Forest       : Accuracy=0.6322  MSE=0.8888 
Linear Regression   : Accuracy=0.1474  MSE=7.1841 
k-NN (k=1)          : Accuracy=0.7385  MSE=2.1918 
SVR                 : Accuracy=0.2575  MSE=3.9049 
XGBoost             : Accuracy=0.5230  MSE=0.9278 
LightGBM            : Accuracy=0.4014  MSE=1.3866 
