In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump
import xgboost as xgb
import matplotlib.pyplot as plt
from scipy.stats import randint
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, r2_score, precision_score, recall_score, accuracy_score, make_scorer

In [None]:
# Colab-only step: mount Google Drive so that files under /content/drive
# (where the dataset is read from) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the complete n2v dataset (an Excel workbook) from the mounted Drive folder.
df_n2v = pd.read_excel(
    "/content/drive/MyDrive/Colab Notebooks/n2v_whole_data.xlsx"
)

In [None]:
# Column labels to discard, written as explicit ranges:
# 7-9, 11-14, 16-18, 27-32, 35 and 39-56 (35 columns in total).
# NOTE(review): the reason these particular columns are removed is not recorded
# in this notebook; document the selection criterion if it is known.
columns_to_drop = [
    *range(7, 10),
    *range(11, 15),
    *range(16, 19),
    *range(27, 33),
    35,
    *range(39, 57),
]

# drop() returns a new frame, so df_n2v itself is left untouched.
df_n2v_extracted = df_n2v.drop(columns=columns_to_drop)
df_n2v_extracted.head()

Unnamed: 0,0,1,2,3,4,5,6,10,15,19,...,37,38,57,58,59,60,61,62,63,label
0,0.185664,0.038381,0.088199,0.060264,0.059804,0.02993,0.024279,0.005061,-0.03097,-0.018356,...,-0.016635,-0.021995,-0.003927,-0.007916,-0.004296,-0.011041,-0.011489,-0.010767,-0.012662,1
1,0.240407,0.083226,0.092082,0.072697,0.083744,0.053693,0.046664,0.017592,0.002981,-0.011767,...,-0.003653,-0.006539,0.002449,-0.000369,0.0023,-0.000759,0.001286,0.000648,-0.002569,1
2,0.255905,-0.056426,-0.041874,-0.068825,-0.024977,-0.02638,-0.002083,0.022709,-0.016479,-0.005985,...,-0.017954,-0.023319,-0.026269,-0.029155,-0.026701,-0.031623,-0.03191,-0.032459,-0.029607,1
3,0.136404,-0.072341,0.013134,0.007694,0.058358,0.035262,0.058339,-0.015095,-0.052562,-0.01702,...,-0.018606,-0.015528,0.003666,0.004405,0.006,-0.000561,0.003274,0.002714,0.002069,1
4,0.326345,-0.069603,0.09667,-0.029981,-0.057323,-0.149889,-0.123073,-0.080844,-0.073645,-0.07976,...,-0.013255,-0.025324,-0.015458,-0.010203,-0.004443,-0.008781,-0.008274,-0.009391,-0.006729,1


In [None]:
# Count how many rows carry each value of the "label" column (class balance check).
df_n2v_extracted["label"].value_counts()

1    388
0    296
Name: label, dtype: int64

In [None]:
# Separate the features from the target by column NAME rather than by position,
# so a change in column order cannot silently turn a feature into the target
# (or leak the target into the feature matrix).
X = df_n2v_extracted.drop(columns="label")
y = df_n2v_extracted["label"]

In [None]:
# Show the dimensions of the feature matrix and of the target, one per line.
print(X.shape, y.shape, sep="\n")

(684, 29)
(684,)


In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# XGBoost classifier with default settings; used as the estimator for the grid search.
xgb_model = xgb.XGBClassifier()


In [None]:
def custom_scorer(y_true, y_pred):
    """Return the unweighted mean of precision, recall and accuracy.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        ``(precision + recall + accuracy) / 3``.
    """
    metric_functions = (precision_score, recall_score, accuracy_score)
    metric_total = sum(metric(y_true, y_pred) for metric in metric_functions)
    return metric_total / 3


# Wrap the function as a scikit-learn scorer object (higher values are better).
# NOTE(review): both hyperparameter searches in this notebook pass
# scoring='accuracy', so this scorer is defined but not actually used by them.
custom_score = make_scorer(custom_scorer, greater_is_better=True)

In [None]:
# Parameter ranges for GridSearchCV: 3 values for each of 6 parameters,
# i.e. 3**6 = 729 candidate configurations.
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Hyperparameter optimisation with GridSearchCV.
# With cv=3 the search performs 729 * 3 = 2187 fits; n_jobs=-1 distributes them
# over all available cores instead of running them one after another. It does
# not change which candidates are evaluated or how they are scored.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Retrieve the best parameters, the estimator refitted with them, and its
# mean cross-validated accuracy.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_

In [None]:
# Report the winning parameter set, the corresponding estimator and its CV score.
print(f"Best grid search hyperparameters are: {best_params!s}")
print(f"Best grid search model is: {best_model!s}")
print(f"Best grid search score is: {best_score!s}")

Best grid search hyperparameters are: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1.0}
Best grid search model is: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
Best grid search score is: 0.784293120358694


In [None]:
# Predict labels for the held-out test rows with the best grid-search estimator.
y_pred_grid = best_model.predict(X_test)

In [None]:
# Evaluate the grid-search model on the held-out test set.
acc_grid = accuracy_score(y_test, y_pred_grid)
print('Accuracy score is: ', acc_grid)

# classification_report expects (y_true, y_pred). Passing the predictions first
# swaps precision with recall and reports support for the predicted labels
# instead of the true ones, so the ground truth must come first.
cl_report = classification_report(y_test, y_pred_grid)
print(cl_report)

Accuracy score is:  0.7737226277372263
              precision    recall  f1-score   support

           0       0.70      0.76      0.73        55
           1       0.83      0.78      0.81        82

    accuracy                           0.77       137
   macro avg       0.77      0.77      0.77       137
weighted avg       0.78      0.77      0.78       137



In [None]:
# Give the output file name and the model variable appropriate values
model_adı = "xgb_model_gridsearchCV"
model_değişkeni = best_model

# Save the model (joblib dump) as "<model name>.sav" in the current working directory
dump(model_değişkeni, f"{model_adı}.sav")

['xgb_model_gridsearchCV.sav']

In [None]:
# Candidate values that RandomizedSearchCV samples from (one list per parameter).
param_dist = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

In [None]:
# Hyperparameter optimisation with RandomizedSearchCV:
# 10 sampled candidates, 3-fold cross-validation, accuracy as the selection metric.
xgb2 = xgb.XGBClassifier()
random_search = RandomizedSearchCV(
    estimator=xgb2,
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    random_state=42,
)

# fit() returns the fitted search object itself, so model_random and
# random_search refer to the same instance.
model_random = random_search.fit(X_train, y_train)

In [None]:
# XGBoost randomized-search results.
# The middle label previously read "Best grid search model is:" (copied from the
# grid-search cell); it now names the search that actually produced the model.
print('Best random search hyperparameters are: ' + str(model_random.best_params_))
print('Best random search model is: ' + str(model_random.best_estimator_))
print('Best random search score is: ' + str(model_random.best_score_))

Best random search hyperparameters are: {'subsample': 0.8, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
Best grid search model is: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
Best random search score is: 0.7751456194079145


In [None]:
# Predict labels for the held-out test rows through the fitted randomized search object.
y_pred_randomized = model_random.predict(X_test)

In [None]:
# Evaluate the randomized-search model on the held-out test set.
acc_randomized = accuracy_score(y_test, y_pred_randomized)
print('Accuracy score is: ', acc_randomized)

# classification_report expects (y_true, y_pred); with the arguments reversed,
# precision and recall are exchanged and support counts predictions rather
# than true labels.
print(classification_report(y_test, y_pred_randomized))

Accuracy score is:  0.7591240875912408
              precision    recall  f1-score   support

           0       0.68      0.75      0.71        55
           1       0.82      0.77      0.79        82

    accuracy                           0.76       137
   macro avg       0.75      0.76      0.75       137
weighted avg       0.76      0.76      0.76       137



In [None]:
# Give the output file name and the model variable appropriate values
model_adı = "xgb_model_rs"
# NOTE(review): model_random is the fitted RandomizedSearchCV object, whereas the
# grid-search cell saves grid_search.best_estimator_. Confirm whether the whole
# search object or only model_random.best_estimator_ should be persisted here.
model_değişkeni = model_random

# Save the model (joblib dump) as "<model name>.sav" in the current working directory
dump(model_değişkeni, f"{model_adı}.sav")

['xgb_model_rs.sav']