In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

# Wczytanie pliku CSV
df = pd.read_csv('drug200.csv')

# Podział zbioru na część uczącą i testową (80% - ucząca, 20% - testowa)
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Podzielenie zbioru na zmienne niezależne (X) i zmienną zależną (y)
X_train = train_set.drop('Drug', axis=1)
y_train = train_set['Drug']

X_test = test_set.drop('Drug', axis=1)
y_test = test_set['Drug']

# One-hot encoding dla zmiennych kategorycznych
X_train_encoded = pd.get_dummies(X_train, columns=["Sex", "BP", "Cholesterol"], drop_first=True)
X_test_encoded = pd.get_dummies(X_test, columns=["Sex", "BP", "Cholesterol"], drop_first=True)

# Inicjalizacja i trenowanie modeli
models = {
    'GLM': LogisticRegression(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC()
}

for name, model in models.items():
    model.fit(X_train_encoded, y_train)

    # Predykcja na zbiorze testowym
    y_pred = model.predict(X_test_encoded)

    # Macierz pomyłek i dokładność
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    print(f'Model: {name}')
    print(f'Confusion Matrix:\n{cm}')
    print(f'Accuracy: {acc}\n')

    # Wypisanie istotnych zmiennych dla modelu GLM
    if name == 'GLM':
        coefficients = pd.DataFrame({
            'Variable': X_train_encoded.columns,
            'Coefficient': model.coef_[0]
        })
        coefficients = coefficients.sort_values(by='Coefficient', ascending=False)
        print(f'Most important variables for {name}:\n{coefficients.head()}\n')

    # Wypisanie istotnych zmiennych dla modelu DT
    if name == 'DT':
        feature_importances = pd.DataFrame({
            'Variable': X_train_encoded.columns,
            'Importance': model.feature_importances_
        })
        feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
        print(f'Most important variables for {name}:\n{feature_importances.head()}\n')

    # Wypisanie istotnych zmiennych dla modelu RF
    if name == 'RF':
        feature_importances = pd.DataFrame({
            'Variable': X_train_encoded.columns,
            'Importance': model.feature_importances_
        })
        feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
        print(f'Most important variables for {name}:\n{feature_importances.head()}\n')

    # Wypisanie istotnych zmiennych dla modelu SVM (uwaga: w przypadku SVM wartości współczynników są trudne do interpretacji)
    if name == 'SVM':
        print(f'Most important variables for {name}:\nNot applicable for SVM\n')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: GLM
Confusion Matrix:
[[15  0  0  0  0]
 [ 3  3  0  0  0]
 [ 0  0  3  0  0]
 [ 1  0  0  4  0]
 [ 0  0  0  0 11]]
Accuracy: 0.9

Most important variables for GLM:
             Variable  Coefficient
1             Na_to_K     0.510230
4           BP_NORMAL     0.106472
2               Sex_M     0.105781
0                 Age    -0.035601
5  Cholesterol_NORMAL    -0.329614

Model: DT
Confusion Matrix:
[[15  0  0  0  0]
 [ 0  6  0  0  0]
 [ 0  0  3  0  0]
 [ 0  0  0  5  0]
 [ 0  0  0  0 11]]
Accuracy: 1.0

Most important variables for DT:
             Variable  Importance
1             Na_to_K    0.493261
4           BP_NORMAL    0.144739
0                 Age    0.135510
3              BP_LOW    0.120919
5  Cholesterol_NORMAL    0.105571

Model: RF
Confusion Matrix:
[[15  0  0  0  0]
 [ 0  6  0  0  0]
 [ 0  0  3  0  0]
 [ 0  0  0  5  0]
 [ 0  0  0  0 11]]
Accuracy: 1.0

Most important variables for RF:
             Variable  Importance
1             Na_to_K    0.555392
0            