In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from preprocessing import normalize


# Load the train and test data
train_data = pd.read_csv('data/ISCX_Botnet-Training.pcap_Flow_labelled.csv')
test_data = pd.read_csv('data/ISCX_Botnet-Testing.pcap_Flow_labelled.csv')

# Separate the features and the labels
X_train = normalize(train_data.drop('Label', axis=1).iloc[:, 7:])
y_train = train_data['Label']
X_test = test_data.drop('Label', axis=1).iloc[:, 7:]
y_test = test_data['Label']

# Encode the labels (if they are not already encoded)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Split the train data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define the preprocessing pipeline with an imputer and scaler
preprocessor = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

# Initialize and fit the classification models
models = [
    DecisionTreeClassifier(),
    LogisticRegression(),
    RandomForestClassifier()
    #,SVC()
]

for model in models:
    # Create a pipeline with the preprocessor and the model
    pipeline = make_pipeline(preprocessor, model)

    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Predict on the validation data
    y_pred = pipeline.predict(X_val)

    # Evaluate the model
    accuracy = accuracy_score(y_val, y_pred)

    # Print the evaluation metrics 
    print(f"Model: {type(model).__name__}")
    print(f"Accuracy: {accuracy:.4f}")


Model: DecisionTreeClassifier
Accuracy: 0.9343


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: LogisticRegression
Accuracy: 0.7571
Model: RandomForestClassifier
Accuracy: 0.9493


In [3]:
from sklearn.metrics import balanced_accuracy_score
bal_accuracy = balanced_accuracy_score(y_val, y_pred)


In [4]:
bal_accuracy

0.9488287781841918