In [1]:
import numpy as np # Use version 1.x not 2.x
import random
import pandas as pd

# One seed value for the whole notebook, so reruns are reproducible.
RANDOM_SEED = 42

# Seed both global generators: Python's stdlib RNG and NumPy's legacy
# global RNG. The two calls are independent of each other.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [2]:
import pickle

# Load the nested mapping from disk. pickle can execute arbitrary code while
# loading, so this path must only ever point at a trusted file.
with open('./data/merged_dict.pkl', 'rb') as f:
    data = pickle.load(f)

# Flatten {company: {period: (features, rating)}} into one flat row per
# (company, period) pair.
records = []
for company, periods in data.items():
    for period, (features, rating) in periods.items():
        # Identifier columns first, then feature_0..feature_N, then the label,
        # so the DataFrame column order follows insertion order.
        record = {'company': company, 'period': period}
        record.update(
            (f'feature_{idx}', value.item()) for idx, value in enumerate(features)
        )
        # .item() unwraps the stored scalar; int() normalizes it to a plain integer.
        record['rating'] = int(rating.item())
        records.append(record)

df = pd.DataFrame(records)

df.head()

Unnamed: 0,company,period,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,rating
0,AMZN,2018Q3,59885.0,83810.0,143695.0,30369.0,29225.0,15552.0,55324.0,49246.0,...,0.483438,0.050958,1.082442,0.801334,0.54893,3.672716,0.048142,0.073687,1.879179,4
1,AMZN,2018Q4,75101.0,87547.0,162648.0,41668.0,40524.0,16655.0,68391.0,50708.0,...,0.440145,0.041819,1.098112,0.854586,0.609261,3.734828,0.040306,0.069508,2.433143,4
2,AMZN,2019Q1,69431.0,108671.0,178102.0,37379.0,29066.0,16432.0,63695.0,65997.0,...,0.513132,0.059648,1.090054,0.832075,0.586844,3.679033,0.051288,0.073559,1.768866,4
3,AMZN,2019Q2,76790.0,114561.0,191351.0,41781.0,31135.0,18580.0,69678.0,68612.0,...,0.508943,0.041401,1.102069,0.835414,0.59963,3.606246,0.034184,0.049471,1.675727,4
4,AMZN,2019Q3,79054.0,120045.0,199099.0,43658.0,35739.0,18766.0,72136.0,70455.0,...,0.489312,0.030494,1.095902,0.835755,0.605218,3.523377,0.026994,0.037765,1.904455,4


In [3]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Preprocessing objects are kept at top level so later cells can reuse them
# (e.g. label_encoder.inverse_transform to recover the original rating values).
scaler = StandardScaler()
label_encoder = LabelEncoder()

# Feature matrix: every column except the identifiers and the target,
# standardized per column. Note that the scaler is fit on ALL rows of df.
X = scaler.fit_transform(df.drop(columns=['company', 'period', 'rating']))

# Target: ratings re-encoded as consecutive integer class ids (0..n_classes-1).
y = label_encoder.fit_transform(df['rating'])

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import LeaveOneOut, cross_val_score

# Candidate classifiers, keyed by the display name used in the results table.
# Every estimator that accepts a random_state gets the shared RANDOM_SEED.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_SEED),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED),
    'Logistic Regression': LogisticRegression(max_iter=2000, random_state=RANDOM_SEED),
    'k-NN (k=1)': KNeighborsClassifier(n_neighbors=1),
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(random_state=RANDOM_SEED),
    'LDA': LinearDiscriminantAnalysis(),
    'XGBoost': xgb.XGBClassifier(eval_metric='mlogloss', random_state=RANDOM_SEED),
    'LightGBM': lgb.LGBMClassifier(random_state=RANDOM_SEED, verbose = -1)
}

# NOTE(review): each row is one (company, period) observation, and the preview
# of df shows consecutive quarters of the same company sharing a rating.
# Leave-one-row-out therefore trains on other quarters of the held-out row's
# company, which may overstate accuracy on unseen companies. Consider
# LeaveOneGroupOut with groups=df['company'] if that is the target use case.

# Evaluate models using LOOCV
loo = LeaveOneOut()
print('LOOCV (Leave-One-Out Cross-Validation) Accuracy Results')
for model_name, model in models.items():
    # Fit the scaler inside each CV split rather than once on all rows, so the
    # held-out sample never contributes to the mean/std used to scale its own
    # training fold. Standardizing per fold cancels any earlier per-feature
    # affine scaling of X, so this holds even if X arrives pre-standardized.
    # cross_val_score clones the pipeline, so the entries of `models` stay unfitted.
    fold_scaled_model = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(fold_scaled_model, X, y, cv=loo, scoring='accuracy')
    print(f'{model_name:20}: {scores.mean():.4f}')


LOOCV (Leave-One-Out Cross-Validation) Accuracy Results
Decision Tree       : 0.8430
Random Forest       : 0.8891
Logistic Regression : 0.7389
k-NN (k=1)          : 0.7867
Naive Bayes         : 0.6212
SVM                 : 0.6928
LDA                 : 0.6928
XGBoost             : 0.8703
LightGBM            : 0.8976
