In [76]:
import pandas as pd
from data import Database
from pymongo import MongoClient

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from dotenv import load_dotenv
import os
load_dotenv()

True

In [77]:
print("DB_URL:", os.getenv("DB_URL"))

DB_URL: mongodb+srv://kara_labs1:Labssprint1@cluster0.4mwhcea.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0


In [78]:
import sys, os
sys.path.append(os.path.abspath("app"))

from data import Database

In [79]:
db = Database()
monsters = list(db.collection.find())
df = pd.DataFrame(monsters)

#drop Mongo_id as it is not useful in this model training
df = df.drop(columns=["_id"])

df = df.head(1000)

print("Training on sample size:", len(df))
df.head()


Training on sample size: 1000


Unnamed: 0,health,energy,rarity
0,52,28,Rank2
1,46,33,Rank1
2,42,63,Rank2
3,27,21,Rank1
4,65,23,Rank2


In [80]:

#separate featrues(x) and target(y)
X = df[["health", "energy"]]
y = df["rarity"]

#encode target labels (Rank1, Rank2, etc.)
le = LabelEncoder()
y = le.fit_transform(y)

#scale feature
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

df.head(), X_train.shape, X_test.shape

(   health  energy rarity
 0      52      28  Rank2
 1      46      33  Rank1
 2      42      63  Rank2
 3      27      21  Rank1
 4      65      23  Rank2,
 (800, 2),
 (200, 2))

In [83]:
#initialize models
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

In [84]:
#Train and evaluate models
results = {}

for name, model in models.items():
    #train
    model.fit(X_train, y_train)

    #predict model
    y_pred = model.predict(X_test)

    #evaluate
    acc = accuracy_score(y_test, y_pred)
    print(f"\n/{name} Accuracy: {acc:.3f}")
    print(classification_report(y_test, y_pred))

    results[name] = acc

best_model = max(results, key=results.get)
print(f"\n Best model: {best_model} with accuracy{results[best_model]:.3f}")



/RandomForest Accuracy: 0.230
              precision    recall  f1-score   support

           0       0.28      0.35      0.31        52
           1       0.15      0.10      0.12        50
           2       0.25      0.34      0.29        44
           3       0.19      0.15      0.17        54

    accuracy                           0.23       200
   macro avg       0.22      0.23      0.22       200
weighted avg       0.22      0.23      0.22       200


/LogisticRegression Accuracy: 0.210
              precision    recall  f1-score   support

           0       0.15      0.04      0.06        52
           1       0.00      0.00      0.00        50
           2       0.22      0.91      0.35        44
           3       0.00      0.00      0.00        54

    accuracy                           0.21       200
   macro avg       0.09      0.24      0.10       200
weighted avg       0.09      0.21      0.09       200


/XGBoost Accuracy: 0.225
              precision    recall  f

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best model summary. After training and evaluating 3 models, RandomForestClassifier, LogisticRegression and XGBoostClassifier, on a 1000 monster dataset the best model was the RandomForestClassifier model with the accuracy of 23%. The dataset has four rarity classes (Rank1-4) and the features (health and energy). It is difficult for the models to separate the features clearly because they overlap across the classes.

LogisticRegression performed poorly due to the overlapping features. It assumed a linear relationship to the features and the target. This misclassified most of the monsters creating a low precision and recall for many. Some weren't predicted at all with a recall of 0.

XGBoostClassifier performed a bit better than the LogisticRegression model but not by much. The XGBoostClassifier uses gradient boost to correct errors from previous predictions which would have helped a bit with the captureing some of the non-linear relationships that the LogisticRegression didn't. The XGBoostClassification model's gradiant boost feature was not significant to out perform the RandomForestClassifier due to the small dataset.

RandomForestClassifier created a non-linear relationship between the features and target which allowed for a better balanced predictions. This makes the RandomForestClassifier a more reliable predictions model for this dataset. Therefore; the RandomForestClassifier is the model of choice to continue training and evaluating the dataset.