## Wczytanie bibliotek

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.base import clone
import xgboost as xgb
import numpy as np
import mlflow

mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

## Wczytanie i przetworzenie danych
Poniższy kod nie będzie jakoś bardzo analizowany, ze względu na to, że w poprzednich notebookach ten kod był tworzony. W tym notebooku jedynie kod jest skopiowany, aby przygotowac dane do treningu różny modeli

In [3]:
df = pd.read_csv("https://pwozniak.kia.prz.edu.pl/files/uczeniemaszynowe/train_data.csv")
df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,59,Arcanine,Fire,,555,82,110,88,105,72,98,1,False
1,107,Hitmonchan,Fighting,,455,47,97,84,27,121,79,1,False
2,123,Scyther,Bug,Flying,500,63,113,86,56,73,109,1,False
3,35,Clefairy,Fairy,,323,67,54,46,58,62,36,1,False
4,150,MewtwoMega Mewtwo X,Psychic,Fighting,780,97,193,102,157,98,133,1,True


In [4]:
df_no_nans = df.fillna('Other')
df_no_nans.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,59,Arcanine,Fire,Other,555,82,110,88,105,72,98,1,False
1,107,Hitmonchan,Fighting,Other,455,47,97,84,27,121,79,1,False
2,123,Scyther,Bug,Flying,500,63,113,86,56,73,109,1,False
3,35,Clefairy,Fairy,Other,323,67,54,46,58,62,36,1,False
4,150,MewtwoMega Mewtwo X,Psychic,Fighting,780,97,193,102,157,98,133,1,True


In [5]:
df_clean = df_no_nans[(df_no_nans.HP > 0) & (df_no_nans.Attack > 0) & (df_no_nans.Defense > 0)].reset_index(drop=True)

columns = ['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
for column in columns:
    print(f"Wiersze w {column} które mają wartość poniżej lub równe 0: {len(df_clean[df_clean[column] <= 0])}")

Wiersze w Total które mają wartość poniżej lub równe 0: 0
Wiersze w HP które mają wartość poniżej lub równe 0: 0
Wiersze w Attack które mają wartość poniżej lub równe 0: 0
Wiersze w Defense które mają wartość poniżej lub równe 0: 0
Wiersze w Sp. Atk które mają wartość poniżej lub równe 0: 0
Wiersze w Sp. Def które mają wartość poniżej lub równe 0: 0
Wiersze w Speed które mają wartość poniżej lub równe 0: 0


In [6]:
label_enc = LabelEncoder()
df_cat = pd.DataFrame()
for column in ['Name', 'Type 1', 'Type 2']:
    df_cat[column] = label_enc.fit_transform(df_clean[column])

In [7]:
df_num = df_clean[['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']]
minmax_scaler = MinMaxScaler()
columns = df_num.columns
np_num = minmax_scaler.fit_transform(df_num)
df_num_norm = pd.DataFrame(np_num, columns=columns)
df_prepared = pd.concat([df_num_norm, df_cat, df_clean['Generation']], axis=1)

## Testowanie różnych modeli
W tej części będzie testowane 7 różnych modeli:
 - Adaboosting
 - Drzewa decyzyjne
 - Gradient boosting
 - Regresja logistycznya
 - KNN
 - SVM
 - Las losowy
Modele będą testowane na wykrywaniu imion pokemonów oraz typu I. Drugi typ zostanie pominięty, ponieważ nie jest on aż tak istotny.

In [8]:
def train(clf, X, y, model_name):
    with mlflow.start_run():
        mlflow.set_tags({"Split type": "random", "Model name" : model_name})
        for test_split in [0.1, 0.2, 0.3, 0.4]:
            train_size = (1 - test_split) * 100
            print(f"Train size: {train_size}%, Test size: {test_split * 100}%")
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split, random_state=42)
            clf_t = clone(clf)
            clf_t.fit(X_train, y_train)
            y_pred = clf_t.predict(X_test)
            
            accuracy = np.round(accuracy_score(y_pred=y_pred, y_true=y_test), 3)
            f1 = np.round(f1_score(y_pred=y_pred, y_true=y_test, average='macro'), 3)
            
            mlflow.log_metric(f"Accuracy_{int(test_split * 100)}", accuracy)
            mlflow.log_metric(f"F1_score_{int(test_split * 100)}", f1)
            
            print(f"Accuracy: {accuracy}")
            print(f"F1-score: {f1}")


def train_with_strat(clf, X, y, model_name):
    with mlflow.start_run():
        mlflow.set_tags({"Split type": "stratified", "Model name" : model_name})
        for test_split in [0.1, 0.2, 0.3, 0.4]:
            train_size = (1 - test_split) * 100
            print(f"Train size: {train_size}%, Test size: {test_split * 100}%")
            X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=test_split, random_state=42)
            clf_t = clone(clf)
            clf_t.fit(X_train, y_train)
            y_pred = clf_t.predict(X_test)
            
            accuracy = np.round(accuracy_score(y_pred=y_pred, y_true=y_test), 3)
            f1 = np.round(f1_score(y_pred=y_pred, y_true=y_test, average='micro'), 3)
            
            mlflow.log_metric(f"Accuracy_{int(test_split * 100)}", accuracy)
            mlflow.log_metric(f"F1_score_{int(test_split * 100)}", f1)
            
            print(f"Accuracy: {accuracy}")
            print(f"F1-score: {f1}")


### Podział danych
Od razu stworze dwa zbiory treningowe oraz testowe, aby móc testować każdy model dla danego zadania (przewidywanie pokemonów lub typu I)

In [9]:
X = df_prepared[['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']]
y_pok = df_prepared.Name
y_typ = df_prepared['Type 1']

### KNN

In [34]:
mlflow.set_experiment("Predict Pokemons Name")
knn = KNeighborsClassifier()
train(knn, X, y_pok, "KNN")
knn = KNeighborsClassifier()
train_with_strat(knn, X, y_pok, "KNN")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.897
F1-score: 0.875
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.897
F1-score: 0.899
Train size: 70.0%, Test size: 30.0%


2024/12/10 16:49:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run valuable-snail-107 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/dfc4576f7740460ea56081d60807fc67.
2024/12/10 16:49:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.904
F1-score: 0.899
Train size: 60.0%, Test size: 40.0%
Accuracy: 0.902
F1-score: 0.899
Train size: 90.0%, Test size: 10.0%
Accuracy: 0.943
F1-score: 0.943
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.929
F1-score: 0.929
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.917
F1-score: 0.917
Train size: 60.0%, Test size: 40.0%


2024/12/10 16:49:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run powerful-robin-78 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/1293ef1b277b4777a78d649b6405ddf4.
2024/12/10 16:49:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.91
F1-score: 0.91


In [35]:
mlflow.set_experiment("Predict Pokemons Type 1")
knn = KNeighborsClassifier()
train(knn, X, y_typ, "KNN")
knn = KNeighborsClassifier()
train_with_strat(knn, X, y_typ, "KNN")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.934
F1-score: 0.924
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.931
F1-score: 0.935
Train size: 70.0%, Test size: 30.0%


2024/12/10 16:49:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run shivering-mole-218 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/b2157f2ca7e946e9a5f8de6d90dcd958.
2024/12/10 16:49:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.931
F1-score: 0.932
Train size: 60.0%, Test size: 40.0%
Accuracy: 0.925
F1-score: 0.928
Train size: 90.0%, Test size: 10.0%
Accuracy: 0.921
F1-score: 0.921
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.935
F1-score: 0.935
Train size: 70.0%, Test size: 30.0%


2024/12/10 16:49:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run bright-mink-978 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/0e7b04eb568448efbacdebda1b812dfd.
2024/12/10 16:49:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.931
F1-score: 0.931
Train size: 60.0%, Test size: 40.0%
Accuracy: 0.912
F1-score: 0.912


Model KNN działa bardzo dobrze na tym zestawie danych. nalepsze metryki otrzymuje dla podziału danych 90-10. Mozna również zauważyć, że przy predykcji nazw pokemonów wykorzystanie warstwowego podziału danych daje o wiele lepsze wyniki. Jest to spowodowane dużą ilościa grup (każda grupa jest dośc mała), dlateo dobrze aby próbki z każdej grupy były w zestawie treningowym i testowym.

### Logistyczna regresja

In [37]:
mlflow.set_experiment("Predict Pokemons Name")
clf_lr = LogisticRegression()
train(clf_lr, X, y_pok, "Logisitic regression")
clf_lr = LogisticRegression()
train_with_strat(clf_lr, X, y_pok, "Logisitic regression")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.275
F1-score: 0.209
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.245
F1-score: 0.186
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.268
F1-score: 0.204
Train size: 60.0%, Test size: 40.0%


2024/12/10 16:53:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run gentle-pig-518 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/cae24e2115644dc48916dddb6bebef88.
2024/12/10 16:53:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.232
F1-score: 0.168
Train size: 90.0%, Test size: 10.0%
Accuracy: 0.405
F1-score: 0.405
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.382
F1-score: 0.382
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.343
F1-score: 0.343
Train size: 60.0%, Test size: 40.0%


2024/12/10 16:53:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run bald-cow-183 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/849d7e8daf85441c9d9276fcdfcfceaa.
2024/12/10 16:53:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.314
F1-score: 0.314


In [38]:
mlflow.set_experiment("Predict Pokemons Type 1")
clf_lr = LogisticRegression()
train(clf_lr, X, y_typ, "Logisitic regression")
clf_lr = LogisticRegression()
train_with_strat(clf_lr, X, y_typ, "Logisitic regression")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.35
F1-score: 0.252
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.355
F1-score: 0.253
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.359
F1-score: 0.256
Train size: 60.0%, Test size: 40.0%


2024/12/10 16:53:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run unequaled-snake-472 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/6e2a886aef294e42931ee1563f0b158b.
2024/12/10 16:53:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.353
F1-score: 0.246
Train size: 90.0%, Test size: 10.0%
Accuracy: 0.353
F1-score: 0.353
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.358
F1-score: 0.358
Train size: 70.0%, Test size: 30.0%


2024/12/10 16:53:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run victorious-gnu-675 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/8c6c2b0b5c2a4b9a812c531ed1ab5a1d.
2024/12/10 16:53:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.334
F1-score: 0.334
Train size: 60.0%, Test size: 40.0%
Accuracy: 0.338
F1-score: 0.338


Logisyczna regresja najsłabiej radzi sobie z tymi zadaniami. Natomiast tak jak poprzednio widać, że dla pierwszego zadania warstwowy podział jest lepszy oraz najlepsze wyniki otrzymujemy dla podziału 90/10.

### Las losowy

In [40]:
mlflow.set_experiment("Predict Pokemons Name")
clf_rf = RandomForestClassifier(random_state=42)
train(clf_rf, X, y_pok, "Random forest")
clf_rf = RandomForestClassifier(random_state=42)
train_with_strat(clf_rf, X, y_pok, "Random forest")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.97
F1-score: 0.968
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.961
F1-score: 0.958
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.968
F1-score: 0.968
Train size: 60.0%, Test size: 40.0%


2024/12/10 16:57:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run powerful-sloth-532 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/54d80c3f80a84e13bb745144be38074c.
2024/12/10 16:57:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.966
F1-score: 0.967
Train size: 90.0%, Test size: 10.0%
Accuracy: 0.985
F1-score: 0.985
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.977
F1-score: 0.977
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.975
F1-score: 0.975
Train size: 60.0%, Test size: 40.0%


2024/12/10 16:57:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run gregarious-roo-412 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/8ce8fb63fccd48a480185e7fac854a8f.
2024/12/10 16:57:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.974
F1-score: 0.974


In [41]:
mlflow.set_experiment("Predict Pokemons Type 1")
clf_rf = RandomForestClassifier(random_state=42)
train(clf_rf, X, y_typ, "Random forest")
clf_rf = RandomForestClassifier(random_state=42)
train_with_strat(clf_rf, X, y_typ, "Random forest")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.976
F1-score: 0.984
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.968
F1-score: 0.97
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.957
F1-score: 0.96
Train size: 60.0%, Test size: 40.0%


2024/12/10 16:57:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run abundant-smelt-468 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/5fef19822c8c4fdf9a8785a7066fc1bc.
2024/12/10 16:57:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.964
F1-score: 0.969
Train size: 90.0%, Test size: 10.0%
Accuracy: 0.973
F1-score: 0.973
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.968
F1-score: 0.968
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.964
F1-score: 0.964
Train size: 60.0%, Test size: 40.0%


2024/12/10 16:58:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run youthful-quail-622 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/907dee6bf93d411fb43b3f32ef589d8e.
2024/12/10 16:58:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.965
F1-score: 0.965


Jak dotąd las losowy okazał się najlepszym modelem. Również dla podziału 90/10 oraz dla podziału warstowego zostały wygenerowane najlepsze wyniki.

### Drzewo decyzyjne

In [42]:
mlflow.set_experiment("Predict Pokemons Name")
clf_dt = DecisionTreeClassifier(random_state=42)
train(clf_dt, X, y_pok, "Decision tree")
clf_rf = DecisionTreeClassifier(random_state=42)
train_with_strat(clf_dt, X, y_pok, "Decision tree")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.964
F1-score: 0.964
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.952
F1-score: 0.956
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.948
F1-score: 0.952
Train size: 60.0%, Test size: 40.0%
Accuracy: 0.937
F1-score: 0.94


2024/12/10 16:58:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run spiffy-hound-8 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/4fa9f68d18bf4181af0f6bdf63af02bb.
2024/12/10 16:58:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Train size: 90.0%, Test size: 10.0%
Accuracy: 0.967
F1-score: 0.967
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.964
F1-score: 0.964
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.957
F1-score: 0.957
Train size: 60.0%, Test size: 40.0%


2024/12/10 16:58:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run burly-fly-430 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/3048933166c1420886e21f71d25a170a.
2024/12/10 16:58:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.951
F1-score: 0.951


In [43]:
mlflow.set_experiment("Predict Pokemons Type 1")
clf_dt = DecisionTreeClassifier(random_state=42)
train(clf_dt, X, y_typ, "Decision tree")
clf_dt = DecisionTreeClassifier(random_state=42)
train_with_strat(clf_dt, X, y_typ, "Decision tree")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.888
F1-score: 0.881
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.875
F1-score: 0.882
Train size: 70.0%, Test size: 30.0%


2024/12/10 16:58:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run whimsical-shoat-646 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/1e4fea3bf5604880ab57dc8ff59405c5.
2024/12/10 16:58:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.877
F1-score: 0.881
Train size: 60.0%, Test size: 40.0%
Accuracy: 0.85
F1-score: 0.852
Train size: 90.0%, Test size: 10.0%
Accuracy: 0.879
F1-score: 0.879
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.887
F1-score: 0.887
Train size: 70.0%, Test size: 30.0%


2024/12/10 16:58:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run bouncy-finch-336 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/3d304340860342278f281f5e8eb1caef.
2024/12/10 16:58:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.824
F1-score: 0.824
Train size: 60.0%, Test size: 40.0%
Accuracy: 0.829
F1-score: 0.829


Drzewo decyzyjne dla zadania klasyfikacji nazw pokemonów wypada bardzo dobrze i osiąda podobne wyniki jak las losowy. Widać znaczny spadek dokładności dla klasyfikacji typu I. Również otrzymujemy najlepsze wyniki dla podziału 90/10, natomiast co jest ciekawe to, przy podziale warstwowym otrzymujemy gorsze wyniki.

### SVM

In [44]:
mlflow.set_experiment("Predict Pokemons Name")
clf_svm = SVC()
train(clf_svm, X, y_pok, "SVM")
clf_svm = SVC()
train_with_strat(clf_svm, X, y_pok, "SVM")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.891
F1-score: 0.868
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.891
F1-score: 0.893
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.862
F1-score: 0.848
Train size: 60.0%, Test size: 40.0%


2024/12/10 16:58:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run colorful-grouse-956 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/43b35be2d11a45008cb0632cc0726c57.
2024/12/10 16:58:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.848
F1-score: 0.839
Train size: 90.0%, Test size: 10.0%
Accuracy: 0.937
F1-score: 0.937
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.934
F1-score: 0.934
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.927
F1-score: 0.927
Train size: 60.0%, Test size: 40.0%


2024/12/10 16:58:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run chill-hen-30 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/60da7f87852344d8be05055290162ee6.
2024/12/10 16:58:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.922
F1-score: 0.922


In [45]:
mlflow.set_experiment("Predict Pokemons Type 1")
clf_svc = SVC()
train(clf_svc, X, y_typ, "SVM")
clf_svc = SVC()
train_with_strat(clf_svc, X, y_typ, "SVM")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.785
F1-score: 0.747
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.766
F1-score: 0.763
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.752
F1-score: 0.679
Train size: 60.0%, Test size: 40.0%


2024/12/10 16:58:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run treasured-perch-470 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/0df9c3b4d1bb4f1eb16724f364681de8.
2024/12/10 16:58:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.729
F1-score: 0.671
Train size: 90.0%, Test size: 10.0%
Accuracy: 0.785
F1-score: 0.785
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.766
F1-score: 0.766
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.743
F1-score: 0.743
Train size: 60.0%, Test size: 40.0%


2024/12/10 16:58:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run secretive-asp-511 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/c6ea63144a8a419eb9b5bfec6a1e0666.
2024/12/10 16:58:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.72
F1-score: 0.72


SVM działa dośc dobrze, natomiast lekko odbiega od Lasu losowego i Drzewa decyzyjnego. Otrzymujemy lepsze wyniki dla przewidywania nazw pokemonów niż dla typu I. Widać również znaczny spadek przy uzyciu losowego podziału zbioru na testowy i treningowy.

### Gradient boosting

In [46]:
mlflow.set_experiment("Predict Pokemons Name")
clf_gb = GradientBoostingClassifier(random_state=42)
train(clf_gb, X, y_pok, "Gradient boosting")
clf_gb = GradientBoostingClassifier(random_state=42)
train_with_strat(clf_gb, X, y_pok, "Gradient boosting")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.888
F1-score: 0.869
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.728
F1-score: 0.743
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.826
F1-score: 0.821
Train size: 60.0%, Test size: 40.0%


2024/12/10 17:03:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run persistent-ant-968 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/2f80053320674b5d954d27a4a70c2173.
2024/12/10 17:03:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.084
F1-score: 0.083
Train size: 90.0%, Test size: 10.0%
Accuracy: 0.082
F1-score: 0.082
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.9
F1-score: 0.9
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.868
F1-score: 0.868
Train size: 60.0%, Test size: 40.0%


2024/12/10 17:08:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run charming-bird-576 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/9e05b4eaadc04d39b1673993c861981e.
2024/12/10 17:08:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.116
F1-score: 0.116


In [48]:
mlflow.set_experiment("Predict Pokemons Type 1")
clf_gb = GradientBoostingClassifier(random_state=42)
train(clf_gb, X, y_typ, "Gradient boosting")
clf_gb = GradientBoostingClassifier(random_state=42)
train_with_strat(clf_gb, X, y_typ, "Gradient boosting")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.967
F1-score: 0.968
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.967
F1-score: 0.965
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.956
F1-score: 0.962
Train size: 60.0%, Test size: 40.0%


2024/12/10 17:10:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run victorious-snake-840 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/18a27d20271f4e47aa9a73514e969df2.
2024/12/10 17:10:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.937
F1-score: 0.939
Train size: 90.0%, Test size: 10.0%
Accuracy: 0.961
F1-score: 0.961
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.946
F1-score: 0.946
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.94
F1-score: 0.94
Train size: 60.0%, Test size: 40.0%


2024/12/10 17:10:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run sedate-worm-402 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/e580d405ec17468c82446dcea7bced64.
2024/12/10 17:10:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.937
F1-score: 0.937


Metoda gradient boostingu, działa również bardzo dobrze. Wyniki są porównywalne do lasów losowych.

### Adaboost

In [49]:
mlflow.set_experiment("Predict Pokemons Name")
clf_ab = AdaBoostClassifier(random_state=42)
train(clf_ab, X, y_pok, "Ada boosting")
clf_ab = AdaBoostClassifier(random_state=42)
train_with_strat(clf_ab, X, y_pok, "Ada boosting")

Train size: 90.0%, Test size: 10.0%




Accuracy: 0.009
F1-score: 0.007
Train size: 80.0%, Test size: 20.0%




Accuracy: 0.03
F1-score: 0.024
Train size: 70.0%, Test size: 30.0%




Accuracy: 0.03
F1-score: 0.025
Train size: 60.0%, Test size: 40.0%


2024/12/10 17:13:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run valuable-vole-414 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/e4b5c030733b4a2d9de781aa3587c435.
2024/12/10 17:13:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.02
F1-score: 0.016
Train size: 90.0%, Test size: 10.0%




Accuracy: 0.106
F1-score: 0.106
Train size: 80.0%, Test size: 20.0%




Accuracy: 0.116
F1-score: 0.116
Train size: 70.0%, Test size: 30.0%




Accuracy: 0.123
F1-score: 0.123
Train size: 60.0%, Test size: 40.0%


2024/12/10 17:13:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run trusting-shrew-340 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/bdad1be5e44f477abbf09a6eabd12ce5.
2024/12/10 17:13:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.105
F1-score: 0.105


In [50]:
mlflow.set_experiment("Predict Pokemons Type 1")
clf_ab = AdaBoostClassifier(random_state=42)
train(clf_ab, X, y_typ, "Ada boosting")
clf_ab = AdaBoostClassifier(random_state=42)
train_with_strat(clf_ab, X, y_typ, "Ada boosting")

Train size: 90.0%, Test size: 10.0%




Accuracy: 0.13
F1-score: 0.043
Train size: 80.0%, Test size: 20.0%




Accuracy: 0.159
F1-score: 0.049
Train size: 70.0%, Test size: 30.0%




Accuracy: 0.221
F1-score: 0.097
Train size: 60.0%, Test size: 40.0%


2024/12/10 17:13:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run legendary-toad-89 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/fe33392ca6ce4d4ca0a1e835eae30429.
2024/12/10 17:13:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.196
F1-score: 0.065
Train size: 90.0%, Test size: 10.0%




Accuracy: 0.215
F1-score: 0.215
Train size: 80.0%, Test size: 20.0%




Accuracy: 0.213
F1-score: 0.213
Train size: 70.0%, Test size: 30.0%




Accuracy: 0.205
F1-score: 0.205
Train size: 60.0%, Test size: 40.0%


2024/12/10 17:13:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run merciful-squid-976 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/fd407d12ed934d5b973559bc63a990cb.
2024/12/10 17:13:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.23
F1-score: 0.23


## Xgboost

In [13]:
mlflow.set_experiment("Predict Pokemons Name")
clf_xgb = xgb.XGBClassifier(random_state=42)
train(clf_xgb, X, y_pok, "Xgboost")
clf_xgb = xgb.XGBClassifier(random_state=42)
train_with_strat(clf_xgb, X, y_pok, "Xgboost")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.937
F1-score: 0.927
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.938
F1-score: 0.931
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.943
F1-score: 0.942
Train size: 60.0%, Test size: 40.0%


2024/12/10 20:22:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run unleashed-gull-720 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/8478b5e613fd42adbdaa835b9f70f419.
2024/12/10 20:22:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.94
F1-score: 0.94
Train size: 90.0%, Test size: 10.0%
Accuracy: 0.964
F1-score: 0.964
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.946
F1-score: 0.946
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.944
F1-score: 0.944
Train size: 60.0%, Test size: 40.0%


2024/12/10 20:22:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run merciful-pug-381 at: http://127.0.0.1:8080/#/experiments/479905616626895318/runs/06ef16e8aa994c6186b436a6c70d08fa.
2024/12/10 20:22:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/479905616626895318.


Accuracy: 0.937
F1-score: 0.937


In [15]:
mlflow.set_experiment("Predict Pokemons Type 1")
clf_xgb = xgb.XGBClassifier(random_state=42)
train(clf_xgb, X, y_typ, "Xgboost")
clf_xgb = xgb.XGBClassifier(random_state=42)
train_with_strat(clf_xgb, X, y_typ, "Xgboost")

Train size: 90.0%, Test size: 10.0%
Accuracy: 0.979
F1-score: 0.976
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.962
F1-score: 0.961
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.963
F1-score: 0.96
Train size: 60.0%, Test size: 40.0%


2024/12/10 20:24:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run awesome-koi-950 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/a65e889a9ac54817b51836c29a7ac96b.
2024/12/10 20:24:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.957
F1-score: 0.953
Train size: 90.0%, Test size: 10.0%
Accuracy: 0.976
F1-score: 0.976
Train size: 80.0%, Test size: 20.0%
Accuracy: 0.955
F1-score: 0.955
Train size: 70.0%, Test size: 30.0%
Accuracy: 0.954
F1-score: 0.954
Train size: 60.0%, Test size: 40.0%


2024/12/10 20:24:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run unequaled-ray-669 at: http://127.0.0.1:8080/#/experiments/509757812240063095/runs/2214aa2bce7b40e08d08eb1ee7c8b51c.
2024/12/10 20:24:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/509757812240063095.


Accuracy: 0.952
F1-score: 0.952


Model xgboost jest porównywalny do lasów losowych. Podczas eksperymentów wynik wskazał na to, że las losowy jest najlepszym modelem. Natomiast xgboost jest drugim najlepszym modelem. Średnie wyniki metryk obydwu modeli oscylowały w okolicy 95/96%. Trzecim najlepszym modelem były drzewa decyzyjne, natomiast one podczas kolejnych testów nie będą brane pod uwagę. Jeśli celem by było otrzymanie jak najszybszych predykcji przy jak najlepszej jakości to stosunek między czasem, a wynikiem mógłby wskazywać na metodę KNN. Działa ona najszybciej ze wszystkich metod, a przy tym otrzymujemy również bardzo wysokie metryki dokładności (około 92%)