In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def load_and_preprocess_data(file_paths, chunk_size=100000):
    dataframes = []

    for file in file_paths:
        # Read CSV file in chunks
        chunk_iter = pd.read_csv(file, chunksize=chunk_size)

        for chunk in chunk_iter:
            # Handle missing values for numeric columns
            numeric_cols = chunk.select_dtypes(include=[np.number]).columns
            imputer = SimpleImputer(strategy='mean')
            chunk[numeric_cols] = imputer.fit_transform(chunk[numeric_cols])

            # Handle categorical columns 
            non_numeric_cols = chunk.select_dtypes(exclude=[np.number]).columns
            for col in non_numeric_cols:
                encoder = LabelEncoder()
                chunk[col] = encoder.fit_transform(chunk[col].astype(str))

            
            dataframes.append(chunk)

    
    data = pd.concat(dataframes, ignore_index=True)

    target_column = 'Label' if 'Label' in data.columns else 'Class'
    X = data.drop(target_column, axis=1)
    y = data[target_column]

    # Downsample the data if it's too large
    max_samples = 100000
    if len(X) > max_samples:
        sampled_indices = np.random.choice(len(X), max_samples, replace=False)
        X = X.iloc[sampled_indices]
        y = y.iloc[sampled_indices]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    
    y_pred = rf_model.predict(X_test)

    
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


file_paths = [
    'C:\\Users\\ascom\\CTU-13\\1\\capture20110810.binetflow',
    'C:\\Users\\ascom\\CTU-13\\2\\capture20110811.binetflow',
    'C:\\Users\\ascom\\CTU-13\\3\\capture20110812.binetflow',
    'C:\\Users\\ascom\\CTU-13\\4\\capture20110815.binetflow',
    'C:\\Users\\ascom\\CTU-13\\5\\capture20110815-2.binetflow',
    'C:\\Users\\ascom\\CTU-13\\6\\capture20110816.binetflow',
    'C:\\Users\\ascom\\CTU-13\\7\\capture20110816-2.binetflow',
    'C:\\Users\\ascom\\CTU-13\\8\\capture20110816-3.binetflow',
    'C:\\Users\\ascom\\CTU-13\\9\\capture20110817.binetflow',
    'C:\\Users\\ascom\\CTU-13\\10\\capture20110818.binetflow',
    'C:\\Users\\ascom\\CTU-13\\11\\capture20110818-2.binetflow',
    'C:\\Users\\ascom\\CTU-13\\12\\capture20110819.binetflow',
    'C:\\Users\\ascom\\CTU-13\\13\\capture20110815-3.binetflow'
]


X_train, X_test, y_train, y_test = load_and_preprocess_data(file_paths)

train_and_evaluate_models(X_train, X_test, y_train, y_test)


Accuracy: 0.92385
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.97      0.94       469
           1       0.95      0.45      0.61       117
           2       0.65      0.48      0.55       868
           3       0.91      0.97      0.94       721
           4       0.78      0.94      0.85      2266
           5       0.93      0.95      0.94       668
           6       0.97      1.00      0.98      5434
           7       1.00      0.80      0.89       158
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         6
          10       0.00      0.00      0.00         4
          11       0.00      0.00      0.00         4
          12       0.00      0.00      0.00         5
          13       0.00      0.00      0.00         6
          14       0.00      0.00      0.00         5
          15       0.00      0.00      0.00         7
          16       0.00      0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Dense


def load_and_preprocess_data(file_paths):
    dataframes = []
    for file_path in file_paths:
        for chunk in pd.read_csv(file_path, chunksize=10000):
            columns_to_keep = ['Dur', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']
            chunk = chunk[columns_to_keep]
            chunk.dropna(inplace=True)  
            
            # Convert categorical labels to binary
            chunk['Label'] = chunk['Label'].apply(lambda x: 1 if 'Botnet' in x else 0)
            dataframes.append(chunk)


    data = pd.concat(dataframes, ignore_index=True)
    
   
    X = data.drop('Label', axis=1)
    y = data['Label']
    
    # Normalize features
    X = (X - X.min()) / (X.max() - X.min())
    
    return train_test_split(X, y, test_size=0.3, random_state=42)


def build_autoencoder(input_dim):
    encoder = Sequential([
        Dense(64, activation='relu', input_dim=input_dim),
        Dense(32, activation='relu'),
        Dense(16, activation='relu')
    ])
    decoder = Sequential([
        Dense(32, activation='relu', input_dim=16),
        Dense(64, activation='relu'),
        Dense(input_dim, activation='sigmoid')
    ])
    autoencoder = Sequential([encoder, decoder])
    return autoencoder


def extract_features_with_dnn(X_train, X_test):
    input_dim = X_train.shape[1]
    model = build_autoencoder(input_dim)
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train, X_train, epochs=10, batch_size=32, verbose=0)  
    encoder = model.layers[0]  
    return encoder.predict(X_train), encoder.predict(X_test)


def train_and_evaluate_model(X_train, X_test, y_train, y_test):
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    y_pred = gbc.predict(X_test)
    print(classification_report(y_test, y_pred)) 


if __name__ == "__main__":
    dataset_dir = r"C:\Users\ascom\CTU-13"
    file_paths = [
        os.path.join(dataset_dir, subdir, file)
        for subdir in os.listdir(dataset_dir)
        for file in os.listdir(os.path.join(dataset_dir, subdir))
        if file.endswith('.binetflow')
    ]
    X_train, X_test, y_train, y_test = load_and_preprocess_data(file_paths)
    X_train_features, X_test_features = extract_features_with_dnn(X_train, X_test)
    train_and_evaluate_model(X_train_features, X_test_features, y_train, y_test)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m436991/436991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 419us/step
[1m187282/187282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 810us/step
              precision    recall  f1-score   support

           0       0.99      1.00      0.99   5860113
           1       0.95      0.35      0.52    132897

    accuracy                           0.99   5993010
   macro avg       0.97      0.68      0.75   5993010
weighted avg       0.98      0.99      0.98   5993010

