In [5]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   -------------- ------------------------- 54.3/150.0 MB 28.1 MB/s eta 0:00:04
   ------------------- -------------------- 71.3/150.0 MB 28.8 MB/s eta 0:00:03
   ----------------------- ---------------- 87.8/150.0 MB 31.5 MB/s eta 0:00:02
   ----------------------------- --------- 113.5/150.0 MB 36.8 MB/s eta 0:00:01
   -------------------------------- ------ 124.5/150.0 MB 37.7 MB/s eta 0:00:01
   ------------------------------------ -- 141.0/150.0 MB 39.0 MB/s eta 0:00:01
   --------------------------------------  149.9/150.0 MB 39.6 MB/s eta 0:00:01
   --------------------------------------  149.9/150.0 MB 39.6 MB/s eta 0:00:01
   --------------------------------------- 150.0/150.0 MB 33.0 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed

In [6]:
#ADASYN
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

folder_path = os.path.join(os.getcwd(), 'CIC')
sampled_file = 'adasyn_data.csv'
chunksize = 900_000  # Tune based on your RAM

original_sampled_chunks = []
fraction_to_keep = 0.05  # Just keep 5% from each original chunk

for fname in os.listdir(folder_path):
    if fname.endswith('.csv'):
        fpath = os.path.join(folder_path, fname)
        try:
            print("working on " + fpath)
            for chunk in pd.read_csv(fpath, chunksize=chunksize):
                sampled_chunk = chunk.sample(frac=fraction_to_keep, random_state=42)
                sampled_chunk['is_synthetic'] = 0
                if 'label' in sampled_chunk.columns:
                    sampled_chunk = sampled_chunk.drop(columns=['label'])
                original_sampled_chunks.append(sampled_chunk)
        except Exception as e:
            print(f" Error reading {fname}: {e}")

original_df = pd.concat(original_sampled_chunks, ignore_index=True)

# Load synthetic/sampled data
print("Loading synthetic data...")
sampled_df = pd.read_csv(sampled_file)
sampled_df['is_synthetic'] = 1
if 'label' in sampled_df.columns:
    sampled_df = sampled_df.drop(columns=['label'])

# Align columns and merge
common_cols = list(set(original_df.columns) & set(sampled_df.columns))
original_df = original_df[common_cols]
sampled_df = sampled_df[common_cols]

combined_df = pd.concat([original_df, sampled_df], ignore_index=True)
combined_df = combined_df.dropna(axis=1, how='any')  # safe drop NaNs

#  Prepare for adversarial model
X = combined_df.drop(columns=['is_synthetic'])
y = combined_df['is_synthetic']

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, stratify=y, test_size=0.3, random_state=42
)

#  Train adversarial classifier
clf = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
clf.fit(X_train, y_train)
y_pred_proba = clf.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)


print(f"\nAdversarial AUC (Synthetic vs Original): {auc_score:.4f}")
if auc_score > 0.9:
    print(" Synthetic data is very easy to detect.")
elif auc_score > 0.7:
    print(" Some detectable differences exist. ")
else:
    print("Synthetic data appears realistic. ")


working on C:\Users\vlad.serban\Desktop\CIC\part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Adversarial AUC (Synthetic vs Original): 1.0000
 Synthetic data is very easy to detect.


In [1]:
#Undersampled 
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

folder_path = os.path.join(os.getcwd(), 'CIC')
sampled_file = 'undersampled_data.csv'
chunksize = 900_000  # Tune based on your RAM

original_sampled_chunks = []
fraction_to_keep = 0.05  # Just keep 2% from each original chunk

for fname in os.listdir(folder_path):
    if fname.endswith('.csv'):
        fpath = os.path.join(folder_path, fname)
        try:
            print("working on " + fpath)
            for chunk in pd.read_csv(fpath, chunksize=chunksize):
                sampled_chunk = chunk.sample(frac=fraction_to_keep, random_state=42)
                sampled_chunk['is_synthetic'] = 0
                if 'label' in sampled_chunk.columns:
                    sampled_chunk = sampled_chunk.drop(columns=['label'])
                original_sampled_chunks.append(sampled_chunk)
        except Exception as e:
            print(f" Error reading {fname}: {e}")

original_df = pd.concat(original_sampled_chunks, ignore_index=True)

# Load synthetic/sampled data
print("Loading synthetic data...")
sampled_df = pd.read_csv(sampled_file)
sampled_df['is_synthetic'] = 1
if 'label' in sampled_df.columns:
    sampled_df = sampled_df.drop(columns=['label'])

# Align columns and merge
common_cols = list(set(original_df.columns) & set(sampled_df.columns))
original_df = original_df[common_cols]
sampled_df = sampled_df[common_cols]

combined_df = pd.concat([original_df, sampled_df], ignore_index=True)
combined_df = combined_df.dropna(axis=1, how='any')  # safe drop NaNs

#  Prepare for adversarial model
X = combined_df.drop(columns=['is_synthetic'])
y = combined_df['is_synthetic']

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, stratify=y, test_size=0.3, random_state=42
)

#  Train adversarial classifier
clf = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
clf.fit(X_train, y_train)
y_pred_proba = clf.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)


print(f"\nAdversarial AUC (Synthetic vs Original): {auc_score:.4f}")
if auc_score > 0.9:
    print(" Synthetic data is very easy to detect.")
elif auc_score > 0.7:
    print(" Some detectable differences exist. ")
else:
    print("Synthetic data appears realistic. ")


working on C:\Users\vlad.serban\Desktop\CIC\part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Adversarial AUC (Synthetic vs Original): 0.8240
 Some detectable differences exist. 


In [2]:
#stratified
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

folder_path = os.path.join(os.getcwd(), 'CIC')
sampled_file = 'stratified_sample.csv'
chunksize = 900_000  # Tune based on your RAM

original_sampled_chunks = []
fraction_to_keep = 0.05  # Just keep 2% from each original chunk

for fname in os.listdir(folder_path):
    if fname.endswith('.csv'):
        fpath = os.path.join(folder_path, fname)
        try:
            print("working on " + fpath)
            for chunk in pd.read_csv(fpath, chunksize=chunksize):
                sampled_chunk = chunk.sample(frac=fraction_to_keep, random_state=42)
                sampled_chunk['is_synthetic'] = 0
                if 'label' in sampled_chunk.columns:
                    sampled_chunk = sampled_chunk.drop(columns=['label'])
                original_sampled_chunks.append(sampled_chunk)
        except Exception as e:
            print(f" Error reading {fname}: {e}")

original_df = pd.concat(original_sampled_chunks, ignore_index=True)

# Load synthetic/sampled data
print("Loading synthetic data...")
sampled_df = pd.read_csv(sampled_file)
sampled_df['is_synthetic'] = 1
if 'label' in sampled_df.columns:
    sampled_df = sampled_df.drop(columns=['label'])

# Align columns and merge
common_cols = list(set(original_df.columns) & set(sampled_df.columns))
original_df = original_df[common_cols]
sampled_df = sampled_df[common_cols]

combined_df = pd.concat([original_df, sampled_df], ignore_index=True)
combined_df = combined_df.dropna(axis=1, how='any')  # safe drop NaNs

#  Prepare for adversarial model
X = combined_df.drop(columns=['is_synthetic'])
y = combined_df['is_synthetic']

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, stratify=y, test_size=0.3, random_state=42
)

#  Train adversarial classifier
clf = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
clf.fit(X_train, y_train)
y_pred_proba = clf.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)


print(f"\nAdversarial AUC (Synthetic vs Original): {auc_score:.4f}")
if auc_score > 0.9:
    print(" Synthetic data is very easy to detect.")
elif auc_score > 0.7:
    print(" Some detectable differences exist. ")
else:
    print("Synthetic data appears realistic. ")


working on C:\Users\vlad.serban\Desktop\CIC\part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Adversarial AUC (Synthetic vs Original): 0.4869
Synthetic data appears realistic. 


In [3]:
#data diffusion
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

folder_path = os.path.join(os.getcwd(), 'CIC')
sampled_file = 'data_diffusion_data.csv'
chunksize = 900_000  # Tune based on your RAM

original_sampled_chunks = []
fraction_to_keep = 0.05  # Just keep 2% from each original chunk

for fname in os.listdir(folder_path):
    if fname.endswith('.csv'):
        fpath = os.path.join(folder_path, fname)
        try:
            print("working on " + fpath)
            for chunk in pd.read_csv(fpath, chunksize=chunksize):
                sampled_chunk = chunk.sample(frac=fraction_to_keep, random_state=42)
                sampled_chunk['is_synthetic'] = 0
                if 'label' in sampled_chunk.columns:
                    sampled_chunk = sampled_chunk.drop(columns=['label'])
                original_sampled_chunks.append(sampled_chunk)
        except Exception as e:
            print(f" Error reading {fname}: {e}")

original_df = pd.concat(original_sampled_chunks, ignore_index=True)

# Load synthetic/sampled data
print("Loading synthetic data...")
sampled_df = pd.read_csv(sampled_file)
sampled_df['is_synthetic'] = 1
if 'label' in sampled_df.columns:
    sampled_df = sampled_df.drop(columns=['label'])

# Align columns and merge
common_cols = list(set(original_df.columns) & set(sampled_df.columns))
original_df = original_df[common_cols]
sampled_df = sampled_df[common_cols]

combined_df = pd.concat([original_df, sampled_df], ignore_index=True)
combined_df = combined_df.dropna(axis=1, how='any')  # safe drop NaNs

#  Prepare for adversarial model
X = combined_df.drop(columns=['is_synthetic'])
y = combined_df['is_synthetic']

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, stratify=y, test_size=0.3, random_state=42
)

#  Train adversarial classifier
clf = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
clf.fit(X_train, y_train)
y_pred_proba = clf.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)


print(f"\nAdversarial AUC (Synthetic vs Original): {auc_score:.4f}")
if auc_score > 0.9:
    print(" Synthetic data is very easy to detect.")
elif auc_score > 0.7:
    print(" Some detectable differences exist. ")
else:
    print("Synthetic data appears realistic. ")


working on C:\Users\vlad.serban\Desktop\CIC\part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Adversarial AUC (Synthetic vs Original): 1.0000
 Synthetic data is very easy to detect.


In [4]:
#smote enn
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

folder_path = os.path.join(os.getcwd(), 'CIC')
sampled_file = 'smote_enn_data.csv'
chunksize = 900_000  # Tune based on your RAM

original_sampled_chunks = []
fraction_to_keep = 0.05  # Just keep 2% from each original chunk

for fname in os.listdir(folder_path):
    if fname.endswith('.csv'):
        fpath = os.path.join(folder_path, fname)
        try:
            print("working on " + fpath)
            for chunk in pd.read_csv(fpath, chunksize=chunksize):
                sampled_chunk = chunk.sample(frac=fraction_to_keep, random_state=42)
                sampled_chunk['is_synthetic'] = 0
                if 'label' in sampled_chunk.columns:
                    sampled_chunk = sampled_chunk.drop(columns=['label'])
                original_sampled_chunks.append(sampled_chunk)
        except Exception as e:
            print(f" Error reading {fname}: {e}")

original_df = pd.concat(original_sampled_chunks, ignore_index=True)

# Load synthetic/sampled data
print("Loading synthetic data...")
sampled_df = pd.read_csv(sampled_file)
sampled_df['is_synthetic'] = 1
if 'label' in sampled_df.columns:
    sampled_df = sampled_df.drop(columns=['label'])

# Align columns and merge
common_cols = list(set(original_df.columns) & set(sampled_df.columns))
original_df = original_df[common_cols]
sampled_df = sampled_df[common_cols]

combined_df = pd.concat([original_df, sampled_df], ignore_index=True)
combined_df = combined_df.dropna(axis=1, how='any')  # safe drop NaNs

#  Prepare for adversarial model
X = combined_df.drop(columns=['is_synthetic'])
y = combined_df['is_synthetic']

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, stratify=y, test_size=0.3, random_state=42
)

#  Train adversarial classifier
clf = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
clf.fit(X_train, y_train)
y_pred_proba = clf.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)


print(f"\nAdversarial AUC (Synthetic vs Original): {auc_score:.4f}")
if auc_score > 0.9:
    print(" Synthetic data is very easy to detect.")
elif auc_score > 0.7:
    print(" Some detectable differences exist. ")
else:
    print("Synthetic data appears realistic. ")


working on C:\Users\vlad.serban\Desktop\CIC\part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Adversarial AUC (Synthetic vs Original): 1.0000
 Synthetic data is very easy to detect.


In [1]:
#smote tomek
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

folder_path = os.path.join(os.getcwd(), 'CIC')
sampled_file = 'smote_tomek_data.csv'
chunksize = 900_000  # Tune based on your RAM

original_sampled_chunks = []
fraction_to_keep = 0.05  # Just keep 2% from each original chunk

for fname in os.listdir(folder_path):
    if fname.endswith('.csv'):
        fpath = os.path.join(folder_path, fname)
        try:
            print("working on " + fpath)
            for chunk in pd.read_csv(fpath, chunksize=chunksize):
                sampled_chunk = chunk.sample(frac=fraction_to_keep, random_state=42)
                sampled_chunk['is_synthetic'] = 0
                if 'label' in sampled_chunk.columns:
                    sampled_chunk = sampled_chunk.drop(columns=['label'])
                original_sampled_chunks.append(sampled_chunk)
        except Exception as e:
            print(f" Error reading {fname}: {e}")

original_df = pd.concat(original_sampled_chunks, ignore_index=True)

# Load synthetic/sampled data
print("Loading synthetic data...")
sampled_df = pd.read_csv(sampled_file)
sampled_df['is_synthetic'] = 1
if 'label' in sampled_df.columns:
    sampled_df = sampled_df.drop(columns=['label'])

# Align columns and merge
common_cols = list(set(original_df.columns) & set(sampled_df.columns))
original_df = original_df[common_cols]
sampled_df = sampled_df[common_cols]

combined_df = pd.concat([original_df, sampled_df], ignore_index=True)
combined_df = combined_df.dropna(axis=1, how='any')  # safe drop NaNs

#  Prepare for adversarial model
X = combined_df.drop(columns=['is_synthetic'])
y = combined_df['is_synthetic']

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, stratify=y, test_size=0.3, random_state=42
)

#  Train adversarial classifier
clf = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
clf.fit(X_train, y_train)
y_pred_proba = clf.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)


print(f"\nAdversarial AUC (Synthetic vs Original): {auc_score:.4f}")
if auc_score > 0.9:
    print(" Synthetic data is very easy to detect.")
elif auc_score > 0.7:
    print(" Some detectable differences exist. ")
else:
    print("Synthetic data appears realistic. ")


working on C:\Users\vlad.serban\Desktop\CIC\part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Adversarial AUC (Synthetic vs Original): 1.0000
 Synthetic data is very easy to detect.


In [1]:
#smote 
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

folder_path = os.path.join(os.getcwd(), 'CIC')
sampled_file = 'smote_data.csv'
chunksize = 900_000  # Tune based on your RAM

original_sampled_chunks = []
fraction_to_keep = 0.05  # Just keep 2% from each original chunk

for fname in os.listdir(folder_path):
    if fname.endswith('.csv'):
        fpath = os.path.join(folder_path, fname)
        try:
            print("working on " + fpath)
            for chunk in pd.read_csv(fpath, chunksize=chunksize):
                sampled_chunk = chunk.sample(frac=fraction_to_keep, random_state=42)
                sampled_chunk['is_synthetic'] = 0
                if 'label' in sampled_chunk.columns:
                    sampled_chunk = sampled_chunk.drop(columns=['label'])
                original_sampled_chunks.append(sampled_chunk)
        except Exception as e:
            print(f" Error reading {fname}: {e}")

original_df = pd.concat(original_sampled_chunks, ignore_index=True)

# Load synthetic/sampled data
print("Loading synthetic data...")
sampled_df = pd.read_csv(sampled_file)
sampled_df['is_synthetic'] = 1
if 'label' in sampled_df.columns:
    sampled_df = sampled_df.drop(columns=['label'])

# Align columns and merge
common_cols = list(set(original_df.columns) & set(sampled_df.columns))
original_df = original_df[common_cols]
sampled_df = sampled_df[common_cols]

combined_df = pd.concat([original_df, sampled_df], ignore_index=True)
combined_df = combined_df.dropna(axis=1, how='any')  # safe drop NaNs

#  Prepare for adversarial model
X = combined_df.drop(columns=['is_synthetic'])
y = combined_df['is_synthetic']

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, stratify=y, test_size=0.3, random_state=42
)

#  Train adversarial classifier
clf = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
clf.fit(X_train, y_train)
y_pred_proba = clf.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)


print(f"\nAdversarial AUC (Synthetic vs Original): {auc_score:.4f}")
if auc_score > 0.9:
    print(" Synthetic data is very easy to detect.")
elif auc_score > 0.7:
    print(" Some detectable differences exist. ")
else:
    print("Synthetic data appears realistic. ")


working on C:\Users\vlad.serban\Desktop\CIC\part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Desktop\CIC\part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Adversarial AUC (Synthetic vs Original): 1.0000
 Synthetic data is very easy to detect.
