In [None]:
import warnings
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
import data_preparation.utils.data_loader as dl

import time

# Wall-clock reference point; the final cell subtracts it to report the
# notebook's total run time.
start_time = time.time()

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Load the prepared dataset through the project loader. It returns six
# objects, unpacked positionally below.
# NOTE(review): the meaning of x_2023 / y_2023 is not visible from this
# notebook — presumably a separate 2023 hold-out set; confirm in data_loader.
(
    x_2023, y_2023,
    X_train, X_test,
    y_train, y_test,
) = dl.data_loader(
    'data_preparation/db/out/output_std.csv',
    normalize_flag=True,
    std_scaler=True,
)


In [None]:
from sklearn import svm

# Baseline: a LinearSVC with default hyperparameters, fitted on the training
# split and evaluated on the test split.
model = svm.LinearSVC()

# `fit` returns the estimator itself, so `result` and `model` are the same
# fitted object.
result = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision/recall/F1 as a dict, so it can be serialised later.
basic_report = classification_report(y_test, y_pred, output_dict=True)

# ROC/AUC need a continuous score per sample. Hard class labels from
# `predict` collapse the curve to a single operating point, so use the signed
# distance to the separating hyperplane instead. The variable keeps its
# historical name even though LinearSVC produces margins, not probabilities.
# NOTE(review): roc_curve assumes a binary target — confirm for this dataset.
y_pred_proba = result.decision_function(X_test)
fpr_basic, tpr_basic, _ = metrics.roc_curve(y_test, y_pred_proba)
auc_basic = metrics.roc_auc_score(y_test, y_pred_proba)

print(basic_report)


In [None]:
# Hyperparameter tuning: randomized search over C and the loss function of a
# LinearSVC, scored by accuracy with 10-fold cross-validation.
model = svm.LinearSVC()

param_grid = {'C': range(1, 100, 5),
              'loss': ['hinge', 'squared_hinge'],
              }


search = RandomizedSearchCV(model, param_grid, n_iter=5,
                            cv=10, scoring='accuracy', n_jobs=-1, random_state=1)

# execute search
result = search.fit(X_train, y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)


# Evaluate the best configuration (refitted on the full training split) on
# the held-out test split.
best_random = result.best_estimator_
y_pred_test = best_random.predict(X_test)
print(classification_report(y_test, y_pred_test))

# ROC/AUC are computed from continuous decision scores; hard labels would
# reduce the curve to a single threshold.
# NOTE(review): roc_curve assumes a binary target — confirm for this dataset.
y_score_test = best_random.decision_function(X_test)
fpr, tpr, _ = metrics.roc_curve(y_test, y_score_test)
auc = metrics.roc_auc_score(y_test, y_score_test)
plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

# Confusion matrix of the tuned model. This must use `y_pred_test`; `y_pred`
# still holds the predictions of the untuned baseline from the previous cell.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred_test)

disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix)
disp.plot()
# Disable the grid after plotting so it applies to the confusion-matrix axes
# that `disp.plot()` just created.
plt.grid(False)
plt.show()

In [None]:
# Linear SVC with RFECV: recursive feature elimination, dropping one feature
# per step and scoring each subset by accuracy under 3-fold stratified CV.
model = svm.LinearSVC(C=6, loss='hinge', random_state=45)

cv = StratifiedKFold(3)

rfecv = RFECV(model, cv=cv, scoring='accuracy', step=1)

rfecv.fit(X_train, y_train)

# Predictions come from the estimator refitted on the selected features only.
y_pred = rfecv.predict(X_test)

print('Optimal number of features : %d' % rfecv.n_features_)

# Label each rank with the feature it belongs to. The ranking has one entry
# per column of the data RFECV was fitted on, i.e. X_train (the notebook never
# defines a bare `X`). If X_train carries no column labels (e.g. a NumPy
# array), fall back to positional indices.
feature_names = getattr(X_train, 'columns', None)
ranks = pd.DataFrame(
    rfecv.ranking_, index=feature_names, columns=['Rank'])

print(ranks.sort_values(by='Rank', ascending=True))

# Rank 1 marks the features RFECV kept.
selected_features = ranks[ranks['Rank'] == 1].index.values.tolist()

# Mean CV score for each candidate number of features.
plt.style.use('ggplot')
plt.figure(figsize=(12, 6))
plt.xlabel('Number of features selected')
plt.ylabel('Cross validation score (nb of correct classifications)')
plt.plot(range(1, len(rfecv.cv_results_[
         "mean_test_score"]) + 1), rfecv.cv_results_["mean_test_score"])
plt.title("Recursive Feature Elimination \nwith correlated features")
plt.show()


print(classification_report(y_test, y_pred))


cnf_matrix = metrics.confusion_matrix(y_test, y_pred)


disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix)
disp.plot()
plt.grid(False)
plt.show()

# ROC/AUC of the RFECV model itself. `result` still refers to the randomized
# search from the previous cell, so it must not be used here. Continuous
# decision scores are used because hard labels give a single-point curve.
# NOTE(review): roc_curve assumes a binary target — confirm for this dataset.
y_pred_proba = rfecv.decision_function(X_test)
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Overlay the feature-selected model and the baseline, each labelled with its
# own AUC.
plt.grid(True)
plt.plot(fpr, tpr, label="data 1, auc="+str(auc), color='blue')
plt.plot(fpr_basic, tpr_basic, label="Basic, auc="+str(auc_basic), color='red')
plt.legend(loc=4)
plt.show()

In [None]:
import json

# Persist this notebook's metrics under the 'SVM' key of a shared JSON file,
# keeping whatever other top-level keys the file already holds.
filename = 'results.json'

final_results = {
    "report_basic": basic_report,
    "fpr_basic": fpr_basic.tolist(),
    "tpr_basic": tpr_basic.tolist(),
    "auc_basic": float(auc_basic),
    "report_optimized": classification_report(y_test, y_pred, output_dict=True),
    "fpr_optimized": fpr.tolist(),
    "tpr_optimized": tpr.tolist(),
    # NOTE(review): the key is misspelled ("optimied"), but it is kept as-is
    # because other readers of results.json may depend on it — confirm before
    # renaming.
    "auc_optimied": float(auc),
    "selected_features": selected_features,
    # Seconds elapsed since `start_time` was captured in the first cell.
    "time": time.time() - start_time
}


# Start from an empty document when the results file does not exist yet
# (first run); otherwise extend the existing content.
try:
    with open(filename, 'r') as file:
        data = json.load(file)
except FileNotFoundError:
    data = {}


data['SVM'] = final_results


# Write back to the same path that was read.
with open(filename, 'w') as file:
    json.dump(data, file, indent=4)