In [None]:
import warnings
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
import data_preparation.utils.data_loader as dl
# Silence every warning category for the rest of the session. This also hides
# scikit-learn's own warnings, so comment it out while debugging.
warnings.filterwarnings('ignore')

# The project data loader returns six objects for the CSV below; they are
# unpacked in the order the loader yields them.
# NOTE(review): what the x_2023 / y_2023 pair holds is not visible from this
# notebook — confirm against data_preparation.utils.data_loader.
dataset_csv = 'data_preparation/db/out/output_std.csv'
(x_2023, y_2023,
 X_train, X_test,
 y_train, y_test) = dl.data_loader(dataset_csv)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: k-nearest-neighbours with scikit-learn's default hyper-parameters.
model = KNeighborsClassifier()

# fit() returns the estimator itself, so `result` and `model` are one object.
result = model.fit(X_train, y_train)
# Predict on X_test directly (not `.values`) so the input has the same form
# as the data passed to fit() and keeps its column names.
y_pred = model.predict(X_test)

basic_report = classification_report(y_test, y_pred, output_dict=True)

# ROC/AUC need a continuous score. Use the probability of the second class
# returned by predict_proba instead of hard labels, which would reduce the
# curve to a single operating point.
# Assumes a binary target, as metrics.roc_curve itself requires.
y_pred_proba = result.predict_proba(X_test)[:, 1]
fpr_basic, tpr_basic, _ = metrics.roc_curve(y_test, y_pred_proba)
auc_basic = metrics.roc_auc_score(y_test, y_pred_proba)

In [None]:
# Print the current contents of `basic_report` as plain text.
print(basic_report)

In [None]:
from sklearn.naive_bayes import ComplementNB

# Baseline: Complement Naive Bayes with default hyper-parameters.
# NOTE(review): ComplementNB rejects negative feature values; if the loaded
# features are standardised (zero-mean) this fit will raise — confirm.
model = ComplementNB()

# fit() returns the estimator itself, so `result` and `model` are one object.
result = model.fit(X_train, y_train)
# Predict on X_test directly (not `.values`), consistent with the other
# model cells and with the data passed to fit().
y_pred = model.predict(X_test)

# NOTE: this overwrites any `basic_report` produced by an earlier cell.
basic_report = classification_report(y_test, y_pred, output_dict=True)

basic_report

In [None]:
from sklearn.svm import NuSVC

# Baseline: Nu-Support Vector Classifier with default hyper-parameters.
# fit() hands back the estimator itself, so both names refer to the same
# fitted object.
model = NuSVC()
result = model.fit(X=X_train, y=y_train)
y_pred = result.predict(X_test)

# Per-class precision/recall/F1 as a nested dict.
# This replaces any `basic_report` left by an earlier cell.
basic_report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    output_dict=True,
)

# The bare last expression makes the notebook display the dict.
basic_report

In [None]:
from sklearn.tree import ExtraTreeClassifier

# Baseline: a single extremely-randomised tree. Its splits are drawn at random,
# so a fixed seed is needed for the report below to be reproducible on
# Restart & Run All. 45 matches the seed used by the later tree cells.
model = ExtraTreeClassifier(random_state=45)

# fit() returns the estimator itself, so `result` and `model` are one object.
result = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# NOTE: this overwrites any `basic_report` produced by an earlier cell.
basic_report = classification_report(y_test, y_pred, output_dict=True)

basic_report

In [None]:
from sklearn.tree import DecisionTreeClassifier

# max_depth given here is only an initial value; every candidate drawn by the
# search below overrides it with a value from param_grid.
model = DecisionTreeClassifier(max_depth=10, random_state=75)

# 'auto' is intentionally not listed for max_features: for decision trees it
# was an alias of 'sqrt' and was removed in scikit-learn 1.3, where it makes
# the candidate fit fail (silently here, since warnings are suppressed).
param_grid = {
    'max_depth': list(range(5, 20)),
    'max_features': ['sqrt', 'log2'],
}

# Randomised search: 15 sampled combinations, 10-fold CV, scored on accuracy.
search = RandomizedSearchCV(model, param_grid, n_iter=15,
                            cv=10, scoring='accuracy', n_jobs=-1, random_state=1, verbose=2)

result = search.fit(X_train, y_train)
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Decision tree with hand-picked hyper-parameters.
# max_features='log2' samples a random feature subset at every split, so the
# seed is fixed to keep the report and the importances reproducible.
model = DecisionTreeClassifier(
    max_depth=7, max_features='log2', criterion='entropy', random_state=45)

# fit() returns the estimator itself, so `result` and `model` are one object.
result = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

# Importance of each feature, most important first. The sort is done in the
# same expression (no inplace=True) so re-running the cell is idempotent.
feat_importances = pd.DataFrame(
    model.feature_importances_, index=X_test.columns, columns=["Importance"]
).sort_values(by='Importance', ascending=False)
feat_importances.plot(kind='bar', figsize=(8, 6))
plt.show()

In [None]:
# Seed the estimator as well as the search: max_features < n_features makes
# each tree sample features at random, so an unseeded estimator would give a
# different CV score on every run.
model = DecisionTreeClassifier(random_state=45)

# 'auto' is intentionally not listed for max_features: for decision trees it
# was an alias of 'sqrt' and was removed in scikit-learn 1.3, where it makes
# the candidate fit fail (silently here, since warnings are suppressed).
param_grid = {
    'max_depth': list(range(5, 20)),
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy', 'log_loss']
}

# Randomised search: 25 sampled combinations, 5-fold CV, scored on accuracy.
search = RandomizedSearchCV(model, param_grid, n_iter=25,
                            cv=5, scoring='accuracy', n_jobs=-1, random_state=45, verbose=2)

result = search.fit(X_train, y_train)
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)


# Evaluate the refitted best estimator on the held-out test set.
best_random = result.best_estimator_
y_pred_test = best_random.predict(X_test)
print(classification_report(y_test, y_pred_test))

# ROC/AUC must be computed from a continuous score (probability of the second
# class), not from hard class labels.
# Assumes a binary target, as metrics.roc_curve itself requires.
y_pred_proba_test = best_random.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba_test)
auc = metrics.roc_auc_score(y_test, y_pred_proba_test)

In [None]:
# Decision tree with RFECV
from sklearn.tree import export_graphviz

model = DecisionTreeClassifier(
    max_depth=7, max_features='log2', criterion='entropy', random_state=45)

# 3-fold stratified CV drives the recursive feature elimination.
cv = StratifiedKFold(3)

# Drop one feature per iteration and keep the subset with the best CV accuracy.
rfecv = RFECV(model, cv=cv, scoring='accuracy', step=1)

rfecv.fit(X_train, y_train)

y_pred = rfecv.predict(X_test)

print('Optimal number of features : %d' % rfecv.n_features_)

# One rank per input column; rank 1 marks the features RFECV kept.
# Indexed by the training columns (the original referenced an undefined `X`).
ranks = pd.DataFrame(
    rfecv.ranking_, index=X_train.columns, columns=['Rank'])

print(ranks.sort_values(by='Rank', ascending=True))

ranks_1 = ranks[ranks['Rank'] == 1]
selected_features = ranks_1.index.tolist()

print(classification_report(y_test, y_pred))

# Score with the RFECV model evaluated above — not `result`, which still
# points at the search object from the previous cell — so the report, the
# AUC and the "Optimized" curve all describe the same model.
# Assumes a binary target, as metrics.roc_curve itself requires.
y_pred_proba = rfecv.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Compare the optimised model's ROC curve with the baseline curve computed in
# the KNeighborsClassifier cell (fpr_basic / tpr_basic / auc_basic).
plt.grid(True)
plt.plot(fpr, tpr, label="Optimized, auc="+str(auc))
plt.plot(fpr_basic, tpr_basic, label="Basic, auc=" +
         str(auc_basic), color='blue')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc=4)
plt.show()

In [None]:
import json

filename = 'results.json'

# Collect the metrics of this notebook's models into one JSON-serialisable dict.
# NOTE(review): `basic_report` is reassigned by several baseline cells, so it
# holds the report of whichever ran last, while `auc_basic` is computed only in
# the KNeighborsClassifier cell — confirm both are meant to describe one model.
final_results = {
    "report_basic": basic_report,
    # "fpr_basic": fpr_basic.tolist(),
    # "tpr_basic": tpr_basic.tolist(),
    "auc_basic": float(auc_basic),
    "report_optimized": classification_report(y_test, y_pred, output_dict=True),
    # "fpr_optimized": fpr.tolist(),
    # "tpr_optimized": tpr.tolist(),
    # Key spelling kept as-is: renaming it would change the JSON schema that
    # other readers of results.json may rely on.
    "auc_optimied": float(auc),
    "selected_features": selected_features,
}

# Load results saved by earlier runs and start from an empty mapping when the
# file does not exist yet. A malformed file is deliberately allowed to raise
# json.JSONDecodeError, so existing content is never silently overwritten.
try:
    with open(filename, 'r') as file:
        data = json.load(file)
except FileNotFoundError:
    data = {}

data['DecisionTree'] = final_results

# Serialise before opening the file for writing: if a value is not JSON
# serialisable the error is raised here, before the existing file is truncated.
payload = json.dumps(data, indent=4)

with open(filename, 'w') as file:
    file.write(payload)