In [22]:
import pandas as pd
import numpy as np
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_validate, StratifiedKFold
from sklearn.metrics import (
    roc_auc_score, average_precision_score, precision_recall_curve, roc_curve,
    confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, make_scorer
)

In [23]:
TARGET = "label" # 1=attack, 0=normal
RANDOM_STATE = 42

In [24]:
clearData = pd.read_csv("C:\\Users\\Gökhan\\Desktop\\nids-adversarial\\data\\clear_data_full.csv", low_memory=False)

In [25]:
clearData.shape

(2540027, 43)

In [26]:
clearData.head(10)

Unnamed: 0,sport,dsport,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,33661,1024,udp,CON,0.036133,528,304,31,29,0,...,0,2,4,2,3,1,1,2,Normal,0
1,1464,53,udp,CON,0.001119,146,178,31,29,0,...,0,12,8,1,2,2,1,1,Normal,0
2,3593,53,udp,CON,0.001209,132,164,31,29,0,...,0,6,9,1,1,1,1,1,Normal,0
3,49664,53,udp,CON,0.001169,146,178,31,29,0,...,0,7,9,1,1,1,1,1,Normal,0
4,32119,111,udp,CON,0.078339,568,312,31,29,0,...,0,2,4,2,3,1,1,2,Normal,0
5,2142,53,udp,CON,0.001134,132,164,31,29,0,...,0,12,7,1,2,2,1,1,Normal,0
6,0,0,arp,INT,0.0,46,0,0,0,0,...,0,2,2,2,2,2,2,2,Normal,0
7,40726,53,udp,CON,0.001126,146,178,31,29,0,...,0,6,7,3,1,1,1,1,Normal,0
8,12660,53,udp,CON,0.001167,132,164,31,29,0,...,0,6,7,2,1,1,1,1,Normal,0
9,0,0,arp,INT,0.0,46,0,0,0,0,...,0,2,2,2,2,2,2,2,Normal,0


In [27]:
TARGET_COL = 'attack_cat'

# X'e hedefi ve varsa 'label'ı ALMA (label -> leakage)
drop_cols = [TARGET_COL]
if 'Label' in clearData.columns:
    drop_cols.append('Label')

X = clearData.drop(columns=drop_cols)
y = clearData[TARGET_COL]

# %80 / %20 split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42, shuffle=True
)

print("Train class dist:\n", y_train.value_counts(normalize=True).round(3))
print("\nTest class dist:\n", y_test.value_counts(normalize=True).round(3))


Train class dist:
 attack_cat
Normal              0.874
Generic             0.085
Exploits            0.018
 Fuzzers            0.008
DoS                 0.006
 Reconnaissance     0.005
 Fuzzers            0.002
Analysis            0.001
Backdoor            0.001
Reconnaissance      0.001
 Shellcode          0.001
Backdoors           0.000
Shellcode           0.000
Worms               0.000
Name: proportion, dtype: float64

Test class dist:
 attack_cat
Normal              0.874
Generic             0.085
Exploits            0.018
 Fuzzers            0.008
DoS                 0.006
 Reconnaissance     0.005
 Fuzzers            0.002
Analysis            0.001
Backdoor            0.001
Reconnaissance      0.001
 Shellcode          0.001
Backdoors           0.000
Shellcode           0.000
Worms               0.000
Name: proportion, dtype: float64


In [28]:
X_train.shape

(2032021, 41)

In [29]:
y_train.shape

(2032021,)

In [30]:
X_test.shape

(508006, 41)

In [31]:
y_test.shape

(508006,)

In [32]:
# 2) Kolon tiplerini belirleyelim
# Sayısal/kategorik ayrımı: object ve category -> kategorik; geri kalan -> sayısal varsayımı
def split_columns(X_train, target):
    cols = [c for c in X_train.columns if c != target]
    cat_cols = []
    num_cols = []

    for c in cols:
        if X_train[c].dtype.name in ["object", "category"]:
            cat_cols.append(c)
        else:
            # Çok-unique ve sayısal görünümlü object'ler varsa dönüştürmeyi düşünebilirsiniz.
            num_cols.append(c)

    return num_cols, cat_cols

num_cols, cat_cols = split_columns(X_train, TARGET)
print("Numeric:", len(num_cols), "\nCategorical:", len(cat_cols))

Numeric: 36 
Categorical: 5


Category features hangileri 

In [33]:
print("Categorical features:", cat_cols)


Categorical features: ['sport', 'dsport', 'proto', 'state', 'ct_ftp_cmd']


In [34]:
other_cat = [c for c in cat_cols if c != ['sport', 'dsport']]

sport_ohe = OneHotEncoder(
    handle_unknown='infrequent_if_exist',  # görülmeyen + nadirleri tek kovaya at
    max_categories=20,                     # en sık 20 kategori ayrı
    dtype=np.float32,
    sparse_output=False,
)
dsport_ohe = OneHotEncoder(
    handle_unknown='infrequent_if_exist',
    max_categories=101,
    sparse_output=False,
    dtype=np.float32,
)

other_ohe = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
    dtype=np.float32

)

preprocessor = ColumnTransformer(
    transformers=[
        ('sport', sport_ohe, ['sport']),
        ('cat',   other_ohe, other_cat),
        ('num',   'passthrough', num_cols),
    ],
    remainder='drop',
    verbose_feature_names_out=False
)


OneHotEncode yaptıktan sonra veri setinin info'na bak, neler yapıyor bak 17 öncesi ve sonrası için , features isimlerini bastır

In [35]:
#All DATA
clearData["Label"].value_counts()                            # adet
clearData["Label"].value_counts(normalize=True).mul(100)     # %

Label
0    87.351197
1    12.648803
Name: proportion, dtype: float64

In [36]:
# TRAIN
clearData.loc[X_train.index, "Label"].value_counts()
clearData.loc[X_train.index, "Label"].value_counts(normalize=True).mul(100)


Label
0    87.351213
1    12.648787
Name: proportion, dtype: float64

In [37]:
# TEST
clearData.loc[X_test.index, "Label"].value_counts()
clearData.loc[X_test.index, "Label"].value_counts(normalize=True).mul(100)

Label
0    87.351134
1    12.648866
Name: proportion, dtype: float64

In [38]:
model = Pipeline([
    ('prep', preprocessor),
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

In [None]:
# --- 2) CV tanımı
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [41]:

param_grid = {
    'rf__n_estimators': [100, 300, 500],
    'rf__max_depth': [10, 20, 30, 50, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__class_weight': ['balanced', None]
}

param_grid_small = {
    'rf__n_estimators': [200, 400],
    'rf__max_depth': [20, None],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
    'rf__class_weight': ['balanced', None]
}

In [42]:
gs = GridSearchCV(
    estimator=model,
    param_grid=param_grid_small,
    scoring={"f1_macro":"f1_macro", "accuracy":"accuracy"},
    refit="f1_macro",          # en iyi modeli f1_macro'ya göre seç & yeniden eğit
    cv=cv,
    n_jobs=-1,
    verbose=0                  # <-- ilerleme çıktısı yok
)

In [43]:
# 3) Fit (sadece TRAIN)
gs.fit(X_train, y_train)

ValueError: 
All the 96 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
32 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 653, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 587, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 1539, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\compose\_column_transformer.py", line 996, in fit_transform
    result = self._call_func_on_transformers(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\compose\_column_transformer.py", line 897, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\parallel.py", line 82, in __call__
    return super().__call__(iterable_with_config_and_warning_filters)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\parallel.py", line 1986, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\parallel.py", line 1914, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\parallel.py", line 147, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 1539, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 895, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\preprocessing\_encoders.py", line 1083, in transform
    return out.toarray()
           ^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\sparse\_compressed.py", line 996, in toarray
    out = self._process_toarray_args(order, out)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\sparse\_base.py", line 1527, in _process_toarray_args
    return np.zeros(self.shape, dtype=self.dtype, order=order)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
numpy._core._exceptions._ArrayMemoryError: Unable to allocate 651. GiB for an array with shape (1354680, 129054) and data type float32

--------------------------------------------------------------------------------
32 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 653, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 587, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 1539, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\compose\_column_transformer.py", line 996, in fit_transform
    result = self._call_func_on_transformers(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\compose\_column_transformer.py", line 897, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\parallel.py", line 82, in __call__
    return super().__call__(iterable_with_config_and_warning_filters)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\parallel.py", line 1986, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\parallel.py", line 1914, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\parallel.py", line 147, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 1539, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 895, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\preprocessing\_encoders.py", line 1083, in transform
    return out.toarray()
           ^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\sparse\_compressed.py", line 996, in toarray
    out = self._process_toarray_args(order, out)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\sparse\_base.py", line 1527, in _process_toarray_args
    return np.zeros(self.shape, dtype=self.dtype, order=order)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
numpy._core._exceptions._ArrayMemoryError: Unable to allocate 651. GiB for an array with shape (1354681, 129061) and data type float32

--------------------------------------------------------------------------------
32 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 653, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 587, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 1539, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\compose\_column_transformer.py", line 996, in fit_transform
    result = self._call_func_on_transformers(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\compose\_column_transformer.py", line 897, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\parallel.py", line 82, in __call__
    return super().__call__(iterable_with_config_and_warning_filters)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\parallel.py", line 1986, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\parallel.py", line 1914, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\parallel.py", line 147, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 1539, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 895, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\preprocessing\_encoders.py", line 1083, in transform
    return out.toarray()
           ^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\sparse\_compressed.py", line 996, in toarray
    out = self._process_toarray_args(order, out)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Gökhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\sparse\_base.py", line 1527, in _process_toarray_args
    return np.zeros(self.shape, dtype=self.dtype, order=order)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
numpy._core._exceptions._ArrayMemoryError: Unable to allocate 651. GiB for an array with shape (1354681, 129072) and data type float32


In [None]:
best_params = gs.best_params_              # örn: {'clf__n_estimators': 150, ...}
best_model  = gs.best_estimator_

In [None]:
print("Seçilen hiperparametreler:")
for k, v in best_params.items():
    print(f" - {k}: {v}")

In [None]:
# 4) Sadece sonuç özeti yazdır
res = pd.DataFrame(gs.cv_results_)
best_idx = gs.best_index_

best_params = gs.best_params_
best_f1_mean = res.loc[best_idx, "mean_test_f1_macro"]
best_f1_std  = res.loc[best_idx, "std_test_f1_macro"]
best_acc_mean = res.loc[best_idx, "mean_test_accuracy"]
best_acc_std  = res.loc[best_idx, "std_test_accuracy"]

print("=== GridSearch En İyi Değerler (CV, train) ===")
print("Best params:", best_params)
print(f"f1_macro: {best_f1_mean:.4f} ± {best_f1_std:.4f}")
print(f"accuracy: {best_acc_mean:.4f} ± {best_acc_std:.4f}")

# (İstersen) en iyi model:
best_model = gs.best_estimator_

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

Dağılıma bak, bu kadar 1'in fazla çıkması normal değil. 

# Define the pipeline
rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1))
])

# Define parameter grid for Grid Search
param_grid = {
    'classifier__n_estimators': [100, 300, 500],
    'classifier__max_depth': [10, 20, 30, 50, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__class_weight': ['balanced', None]
}

# For faster testing, you can use a smaller grid:
param_grid_small = {
    'classifier__n_estimators': [300, 500],
    'classifier__max_depth': [20, 30],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
    'classifier__class_weight': ['balanced']
}

print("\n" + "="*50)
print("STARTING GRID SEARCH WITH CROSS-VALIDATION")
print("="*50)

# Create stratified k-fold for cross-validation
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Create GridSearchCV object
grid_search = GridSearchCV(
    rf_pipeline,
    param_grid_small, # Use param_grid for full search
    cv=cv_strategy,
    scoring='f1',  # Primary metric for optimization
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

# Fit grid search
print("\nPerforming grid search... This may take a while...")
grid_search.fit(X_train, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV F1 Score: {grid_search.best_score_:.4f}")


 #9. CROSS-VALIDATION WITH BEST MODEL
# ============================================

print("\n" + "="*50)
print("CROSS-VALIDATION RESULTS WITH BEST MODEL")
print("="*50)

# Get the best model
best_model = grid_search.best_estimator_

# Define multiple scoring metrics
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0),
    'roc_auc': make_scorer(roc_auc_score)
}

# Perform cross-validation with multiple metrics
cv_results = cross_validate(
    best_model,
    X_train,
    y_train,
    cv=cv_strategy,
    scoring=scoring,
    return_train_score=True,
    n_jobs=-1
)

# Print CV results
print("\nCross-Validation Results (Mean ± Std):")
print("-" * 40)
for metric in scoring.keys():
    train_scores = cv_results[f'train_{metric}']
    val_scores = cv_results[f'test_{metric}']
    print(f"{metric.upper():12} | Train: {train_scores.mean():.4f} ± {train_scores.std():.4f} | "
          f"Val: {val_scores.mean():.4f} ± {val_scores.std():.4f}")


cross-validation , eğitimden hemen önce olsun, 5e böldükten sonra ortalamasını al.

print("\n" + "="*50)
print("EVALUATION ON TEST SET")
print("="*50)

# Make predictions
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate all metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
roc_auc = roc_auc_score(y_test, y_pred_proba)
pr_auc = average_precision_score(y_test, y_pred_proba)

print("\nTest Set Performance Metrics:")
print("-" * 40)
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
print(f"ROC-AUC:   {roc_auc:.4f}")
print(f"PR-AUC:    {pr_auc:.4f}")

# Confusion Matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)


# 5) PR ve ROC eğrileri
prec, rec, thr_pr = precision_recall_curve(y_test, probs)
fpr, tpr, thr_roc = roc_curve(y_test, probs)

plt.figure()
plt.plot(rec, prec)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.show()

plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.show()
