In [1]:
import numpy as np # Use version 1.x not 2.x
import random
import pandas as pd

# One seed shared by every stochastic step in the notebook so that a fresh
# "Restart Kernel -> Run All" reproduces the same numbers.
RANDOM_SEED = 42

# Seed both global generators: Python's stdlib RNG and NumPy's legacy global RNG.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [2]:
import pickle

# Read the pickled nested mapping: company -> period -> (features, rating).
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source.
with open('./data/merged_dict.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)


def _flatten_observation(company, period, features, rating):
    """Turn one (company, period) entry into a flat row dictionary.

    Key order is: 'company', 'period', one 'feature_<i>' per element of
    ``features`` (in iteration order), then 'rating'.

    ``features`` elements and ``rating`` must expose ``.item()``
    (presumably NumPy/torch scalars -- TODO confirm against the pickle).
    """
    row = {'company': company, 'period': period}
    for idx, value in enumerate(features):
        row[f'feature_{idx}'] = value.item()
    row['rating'] = int(rating.item())
    return row


# One row per (company, period) pair, in the mapping's own iteration order.
records = [
    _flatten_observation(company, period, features, rating)
    for company, periods in data.items()
    for period, (features, rating) in periods.items()
]

df = pd.DataFrame(records)

df.head()

Unnamed: 0,company,period,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,rating
0,ARKO,2019Q4,290.110992,1557.254028,1847.36499,50.799999,3504.822021,157.751999,340.480988,1311.994019,...,0.034109,-0.011999,0.852062,0.388741,0.149201,9.479013,-0.150077,-0.223403,22.217291,10
1,ARKO,2020Q4,609.406006,2130.403076,2739.809082,311.424988,1019.528992,163.686005,387.050995,2134.883057,...,0.059373,-0.007326,1.574485,1.151579,0.80461,8.61914,-0.013029,-0.024978,6.228566,11
2,ARKO,2021Q1,532.14801,2118.406006,2650.553955,223.003006,1195.98999,171.123001,367.614014,2103.61792,...,0.052198,-0.011678,1.447573,0.982076,0.606623,9.489242,-0.027692,-0.052756,6.989066,11
3,ARKO,2021Q2,586.747009,2182.637939,2769.38501,244.936005,1543.029053,183.113007,367.946991,2196.549072,...,0.063192,0.015493,1.594651,1.09699,0.665683,9.083257,0.043492,0.083699,8.426649,11
4,ARKO,2021Q3,638.828003,2169.602051,2808.429932,290.105011,1649.729004,189.026001,371.884003,2195.968018,...,0.062187,0.0202,1.717815,1.209522,0.780095,8.24607,0.055624,0.104334,8.727524,11


In [3]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Columns that identify a row or hold the target; everything else is a model input.
non_feature_cols = ['company', 'period', 'rating']
feature_frame = df[[col for col in df.columns if col not in non_feature_cols]]

# Standardize each feature column; X ends up as the transformed array.
# NOTE(review): the scaler is fit on every row, so statistics of samples that
# are later held out in cross-validation are baked into X unless scaling is
# redone inside each fold.
scaler = StandardScaler()
scaler.fit(feature_frame)
X = scaler.transform(feature_frame)

# Map the integer ratings onto contiguous class ids 0..n_classes-1.
label_encoder = LabelEncoder()
label_encoder.fit(df['rating'])
y = label_encoder.transform(df['rating'])

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import LeaveOneOut, cross_val_score

# Needed to refit the scaler inside every cross-validation fold (see below).
from sklearn.pipeline import make_pipeline

# Candidate classifiers, keyed by the label used in the printed report.
# Every estimator that accepts a seed is given RANDOM_SEED.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_SEED),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED),
    'Logistic Regression': LogisticRegression(max_iter=2000, random_state=RANDOM_SEED),
    'k-NN (k=1)': KNeighborsClassifier(n_neighbors=1),
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(random_state=RANDOM_SEED),
    'LDA': LinearDiscriminantAnalysis(),
    'XGBoost': xgb.XGBClassifier(eval_metric='mlogloss', random_state=RANDOM_SEED),
    'LightGBM': lgb.LGBMClassifier(random_state=RANDOM_SEED, verbose = -1)
}

# Evaluate models using LOOCV
#
# Each model is wrapped in a pipeline whose StandardScaler is fit on the
# training fold only, so the held-out sample never contributes to the scaling
# statistics. If X was already standardized on the full data set, this still
# gives the leak-free result: standardization is invariant to a prior
# per-feature affine transform, so the in-fold scaler reproduces what it would
# compute from the raw features.
#
# cross_val_score clones the estimator it is given, so the objects stored in
# `models` stay unfitted.
#
# NOTE(review): rows are (company, period) observations. Leave-one-out holds
# out a single row, so other periods of the same company remain in the training
# fold. If the goal is accuracy on unseen companies, consider
# LeaveOneGroupOut with groups=df['company'] -- confirm the intended protocol.
loo = LeaveOneOut()
print('LOOCV (Leave-One-Out Cross-Validation) Accuracy Results')
for model_name, model in models.items():
    fold_scaled_model = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(fold_scaled_model, X, y, cv=loo, scoring='accuracy')
    print(f'{model_name:20}: {scores.mean():.4f}')


LOOCV (Leave-One-Out Cross-Validation) Accuracy Results
Decision Tree       : 0.8606
Random Forest       : 0.8727
Logistic Regression : 0.7697
k-NN (k=1)          : 0.7848
Naive Bayes         : 0.7303
SVM                 : 0.7273
LDA                 : 0.7273
XGBoost             : 0.8545
LightGBM            : 0.8909
