In [None]:
# Import common libraries
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler 
from sklearn.model_selection import RandomizedSearchCV

# For Machine learning Model
from sklearn.linear_model import LogisticRegression

In [None]:
import os 
# Display the current working directory (it is changed to the project folder in the next cell).
os.getcwd()

In [None]:
# Switch to the project folder; the import of the local `Own_Library` package
# that follows presumably relies on this being the working directory.
# Forward slashes are used throughout: the original path mixed in a backslash
# ("SEM\MSC..."), which is an invalid escape sequence and raises a SyntaxWarning
# on recent Python versions. Windows accepts forward slashes in paths.
os.chdir("C:/Users/91629/OneDrive/MSC 4TH SEM/MSC_THESIS_of_Kumarjit_Gupta_(Roll-573)")

# Own Library
# from Own_Library.Classification_Reports import classification_reports
from Own_Library.ClassifierAnalysis import find_reports, best_find, Splitings, Feature_Selections, Cross_Validations

In [None]:
# Load the dataset into memory.
# The path uses a forward slash: the original 'Dataset\data-...' depended on the
# invalid escape sequence "\d" (SyntaxWarning on recent Python) and only
# resolved on Windows.
data = pd.read_csv('Dataset/data-ori_clear_PTC.csv')

print(data.shape)
data.head(3)

In [None]:
# Split the frame column-wise: every column except the last is an independent
# feature (X); the last column is the dependent/target feature (y).
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [None]:
# Scaler applied to the feature matrix in the next cell.
scaler = RobustScaler()

In [None]:
# Scale the raw feature columns taken directly from `data` (all columns but the
# last) instead of re-assigning from `X` itself: the original `X = f(X)` form
# scaled already-scaled values again whenever this cell was re-run.
# The result is a NumPy array, as before.
X = scaler.fit_transform(data.iloc[:, :-1])

In [None]:
# Candidate test-set fractions: 10 %, 20 % and 25 % of the data held out.
splits = [
    0.1,
    0.2,
    0.25,
]

# Model 1: LOGISTIC REGRESSION

In [None]:
# Display name passed to the reporting helpers, and the estimator under study
# (created with scikit-learn's default settings).
algoname1 = 'Logistic Regression'
model1 = LogisticRegression()

## 1. All Split Checking
- 1.1. Train 90%, Test 10%
- 1.2. Train 80%, Test 20%
- 1.3. Train 75%, Test 25%

In [None]:
# Metric names passed to the split comparison.
metrics1 = [
    'F1_Score',
    'Diagnostic_Odds_Ratio',
    'Critical_Success_Index',
]

In [None]:
# Run the project helper `Splitings` over every candidate split.
# It returns two values: `best_split1` (presumably the chosen split — confirm
# against Own_Library) and a metrics DataFrame, which is displayed below.
best_split1, plotdf_split1 = Splitings(
    model1,
    X,
    y,
    splits,
    metrics1,
    algoname1,
    dim=False,
)
print('Spliting Classification Metrics DataFrame:')
plotdf_split1

So the best split is the (90, 10) train/test split, since it gives the better metrics.

## 2. All Feature Selection Checking
- 2.1. Without Feature Selection 
- 2.2. Pearson Correlation
- 2.3. Mutual info classif
- 2.4. Chi-square

In [None]:
# Metric names passed to the feature-selection comparison.
metrics2 = [
    'F1_Measure',
    'Specificity',
    'Negative_Predictive_Value',
]

# `Feature_Selections` is a project helper; it receives the full DataFrame
# (not X / y) and returns three values: the selected feature matrix, a metrics
# DataFrame and the dropped columns.
featX1, plotdf_fs2, del_cols1 = Feature_Selections(
    model1,
    data,
    best_split1,
    metrics2,
    algoname1,
    dim=False,
)

print("Best Independent vector's Shape by feature Selection:", featX1.shape)
print("Deleted Columns:", del_cols1)
print("Feature Selections Classification Metrics DataFrame:")
plotdf_fs2

## 3. All CV CHECKING
- 3.1.  Kfold
- 3.2.  Stratified Kfold
- 3.3.  Shuffle Split

In [None]:
# Scorer names passed to the cross-validation comparison.
# NOTE: this re-binds `metrics2`, which earlier held the feature-selection metrics.
metrics2 = [
    'balanced_accuracy',
    'accuracy',
    'roc_auc',
]

In [None]:
# Run the project helper `Cross_Validations` on the feature-selected matrix.
# It returns two values: `best_cv1` (later passed as `cv=` to RandomizedSearchCV)
# and a metrics DataFrame, which is displayed below.
best_cv1, plotdf_cv1 = Cross_Validations(model1, featX1, y, best_split1, metrics2, algoname1, dim=False)
# The label previously read "Spliting ...", copied from the split-comparison cell.
print('Cross Validation Classification Metrics DataFrame:')
plotdf_cv1

## 4. ALL Hyperparameter tuning checking
- 4.1. Randomized Search CV (Normal Optimization)
- 4.2. HyperOpt (Bayesian Optimization)
- 4.3. Optuna (Asynchronous Distributed Optimization)
- 4.4. Particle Swarm Optimization (NIOA); Cuckoo Search is imported as an alternative

### 4.1. Using Randomized Search CV

In [None]:
# scikit-learn scorer names evaluated together by the randomized search.
metrics3 = [
    'precision_macro',
    'recall_macro',
    'neg_log_loss',
]

In [None]:
from scipy.stats import uniform, randint
from sklearn.model_selection import StratifiedKFold
# Hyperparameter distributions sampled by the random search.
# scipy's uniform(loc, scale) covers [loc, loc + scale], so C is drawn from
# [0.001, 100.001]; randint(100, 300) draws integers from [100, 300).
param_grid = {
    'C': uniform(0.001, 100),       # formerly the grid [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    'fit_intercept': [True, False],
    'max_iter': randint(100, 300),  # formerly the grid [100, 200, 300]
}

# Random search for the best hyperparameters.
# - `scoring` is a list of metrics and `refit=False`, so no final model is
#   refit and `best_params_` / `best_estimator_` are not available afterwards;
#   inspect `search.cv_results_` instead.
# - `random_state` fixes the sampled candidates so the search is reproducible
#   across kernel restarts (it was previously unset).
search = RandomizedSearchCV(estimator=model1,
                            param_distributions=param_grid,
                            cv=best_cv1,
                            n_iter=10,
                            scoring=metrics3,
                            refit=False,
                            n_jobs=2,
                            random_state=42,
                            verbose=4)

# NOTE(review): the search is fitted on the full scaled matrix `X`, whereas
# `best_cv1` was selected using the feature-selected `featX1` — confirm intended.
search.fit(X, y)

In [None]:
# Tabulate the search results: one row per sampled candidate (n_iter=10 above).
pd.DataFrame(search.cv_results_)

In [None]:
# Show the mean cross-validated macro precision of each sampled candidate.
# (Nothing is retrained here: the search was run with refit=False.)
pd.DataFrame(search.cv_results_)['mean_test_precision_macro']

### 4.2. Using HyperOpt

In [None]:
from hyperopt import tpe, hp, fmin, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

# Search space for the HyperOpt run.
space = {
    # LogisticRegression requires C > 0, so the range starts at 0.001 (the same
    # lower bound as the randomized search) instead of the previous 0.0.
    "C": hp.uniform("C", 0.001, 100.0),
    "fit_intercept": hp.choice("fit_intercept", [True, False]),
    "max_iter": hp.randint("max_iter", 100, 300),
    # Only 'balanced' is searched for now; explicit weight dicts such as
    # {0: 1, 1: 2}, {0: 1, 1: 4} or {0: 1, 1: 5} can be appended to this list.
    "class_weight": hp.choice("class_weight", ['balanced']),
}
# from sklearn.ensemble import RandomForestClassifier
# space = {
#     "n_estimators": hp.choice("n_estimators", [100, 200, 300, 400,500,600]),
#     "max_depth": hp.randint("max_depth", 1, 15),
#     "criterion": hp.choice("criterion", ["gini", "entropy"]),
# }


def hyperparameter_tuning(params):
    """Objective function handed to HyperOpt's ``fmin``.

    Builds a ``LogisticRegression`` from the sampled ``params`` and scores it
    with ``cross_val_score`` (accuracy) on the notebook-level ``X`` and ``y``.

    Returns a dict with:
      - ``loss``: the negated mean accuracy (``fmin`` minimises the loss),
      - ``model``: the estimator instance that was scored,
      - ``status``: ``STATUS_OK``.
    """
    candidate = LogisticRegression(n_jobs=-1, random_state=42, **params)
    fold_scores = cross_val_score(candidate, X, y, scoring="accuracy")
    mean_accuracy = fold_scores.mean()
    return {"loss": -mean_accuracy, "model": candidate, "status": STATUS_OK}


# Imported here so this cell is self-contained.
from hyperopt import space_eval

# Fine tune the model: 10 TPE-guided evaluations, recorded in `trials`.
trials = Trials()
best = fmin(
    fn=hyperparameter_tuning,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials
)

# `fmin` reports hp.choice parameters as indices into their option lists
# (e.g. fit_intercept: 0 stands for True here), so printing `best` directly is
# misleading. `space_eval` maps the indices back to the actual values.
print("Best: {}".format(space_eval(space, best)))

In [None]:
# Pick the estimator stored by the trial with the lowest loss
# (i.e. the highest mean cross-validated accuracy).
trial_losses = [outcome['loss'] for outcome in trials.results]
best_hyperopt = trials.results[np.argmin(trial_losses)]['model']

In [None]:
# Fit the selected estimator on the full data, then predict on that same data.
# (`fit` returns the estimator itself, so the two calls can be chained.)
# Note that `y_pred` therefore holds training-set predictions.
y_pred = best_hyperopt.fit(X, y).predict(X)

### 4.3. Using Optuna

In [None]:
# import joblib 
import optuna 
from optuna.samplers import TPESampler

# Define the search space and the objective function
def objective(trial):
    """Optuna objective: score one sampled LogisticRegression configuration.

    Samples ``C``, ``fit_intercept``, ``max_iter`` and ``solver`` from ``trial``,
    builds the classifier with ``random_state=42`` and evaluates it with
    ``cross_val_score`` (accuracy) on the notebook-level ``X`` and ``y``.

    Returns the mean cross-validated accuracy (the study maximises it).
    """
    params = {
        'C': trial.suggest_float("C", 1e-2, 1),
        'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
        # `step` is passed by keyword; passing it positionally is deprecated
        # in recent Optuna releases.
        "max_iter": trial.suggest_int("max_iter", 100, 300, step=1),
        # class_weight is not searched at present.
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear']),
    }

    clf = LogisticRegression(**params, random_state=42)
    score = cross_val_score(clf, X, y, scoring="accuracy").mean()

    return score

# Create a study object and pass the objective function to optimize().
# The TPE sampler is seeded so that repeated runs propose the same sequence of
# trials (it was previously unseeded, making results vary between runs).
study = optuna.create_study(study_name="LogiticReg_optimization",
                            direction="maximize",
                            sampler=TPESampler(seed=42))

study.optimize(objective, n_trials=10)

In [None]:
# Print the parameter values of the best trial (highest mean CV accuracy).
print(study.best_params)

In [None]:
# Refit on the full data with the best parameters found by Optuna.
# `random_state=42` mirrors the value used inside `objective`, so the refit
# model is configured exactly like the one that was scored (the 'liblinear'
# solver, one of the searched options, is randomised).
best_optuna = LogisticRegression(**study.best_params, random_state=42)
best_optuna.fit(X, y)

### 4.4. Using Particle Swarm Optimization (PSO)

In [None]:
from sklearn.svm import SVC
from sklearn_nature_inspired_algorithms.model_selection import NatureInspiredSearchCV
from sklearn.ensemble import RandomForestClassifier
from niapy.algorithms.basic import ParticleSwarmAlgorithm, CuckooSearch

# Discrete grid explored by the nature-inspired search.
# NOTE: this re-binds `param_grid`, which earlier held the randomized-search
# distributions. `penalty` and `class_weight` are not searched at present.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'fit_intercept': [True, False],
    'max_iter': range(100, 300, 20),
}

clf = LogisticRegression(random_state=42)

# When a custom algorithm object is provided, random_state is ignored.
algorithm = ParticleSwarmAlgorithm()
# The previous call passed NP=50, Ts=5, Mr=0.25. Those are old-style
# genetic-algorithm settings (presumably tournament size / mutation rate) and
# are not PSO parameters; niapy names the swarm size `population_size`.
algorithm.set_parameters(population_size=50)

nia_search = NatureInspiredSearchCV(
    clf,
    param_grid,
    algorithm=algorithm,
    population_size=50,
    max_n_gen=30,
    max_stagnating_gen=20,
    scoring='accuracy',
    verbose=2,
    runs=3,
)

nia_search.fit(X, y)

# The best parameters found are stored in:
# nia_search.best_params_

In [None]:
# Best score found by the nature-inspired search (scoring='accuracy' above).
nia_search.best_score_

In [None]:
# Rebuild the classifier from the best parameters found by the nature-inspired
# search (same fixed seed as during the search) and refit it on all of X, y.
clf = LogisticRegression(random_state=42, **nia_search.best_params_)
clf.fit(X, y)

In [None]:
# from importlib import reload
# from sklearn.metrics import auc, roc_curve

# plt=reload(plt)
# fpr = {}
# tpr = {}
# thresh = {}
# roc_auc = {}
# color = ['orange', 'green', 'blue', 'yellow', 'red', 'violet']
# category = ['outcare', 'incare']

# plt.figure(figsize=(6, 6))
# for i in range(len(category)):
#     fpr[i], tpr[i], thresh[i] = roc_curve(
#         y_test, y_pred, pos_label=i)
#     roc_auc[i] = np.round(auc(fpr[i], tpr[i]), 4)
#     plt.plot(fpr[i], tpr[i], linestyle='-', color=color[i],
#              label=("{},(area={})".format(category[i], roc_auc[i])))
    
# plt.title('Multiclass ROC curve')
# plt.xlabel('False_Positive_Rate')
# plt.ylabel('True Positive rate')
# plt.legend(loc='best')