In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Read the benign and DGA samples. low_memory=False is passed to BOTH reads so
# pandas infers each column's dtype from the whole file instead of chunk by
# chunk (which can yield mixed-dtype columns and a DtypeWarning).
benign = pd.read_csv("benign2lakh.csv", low_memory=False)
benign['class'] = 'benign'

dga = pd.read_csv("dga2lakh.csv", low_memory=False)
dga['class'] = 'dga'

# Combine the dataframes. ignore_index=True gives the combined frame a fresh
# 0..n-1 index; otherwise each row label would appear twice (once per source
# file), which makes any later label-based selection ambiguous.
data = pd.concat([benign, dga], ignore_index=True)

# Set random seed (reused for the train/test split and both forests)
seed = 1234


def _presence_flag(value):
    """Return '1' if *value* is a non-missing, non-empty entry, else '0'."""
    # pandas reads empty CSV fields as NaN by default, and NaN != '' evaluates
    # to True, so the missing-value check has to be explicit.
    return '1' if pd.notna(value) and value != '' else '0'


# Collapse 'matched_word' and 'feedback_warning' into categorical
# present ('1') / absent ('0') indicators. Both columns use the same rule so
# that missing values are never counted as "present".
for flag_column in ('matched_word', 'feedback_warning'):
    data[flag_column] = data[flag_column].apply(_presence_flag).astype('category')

# Convert the 'class' column to a categorical variable
data['class'] = data['class'].astype('category')

# Remove the 4th column (index 3 in zero-based indexing).
# NOTE(review): this is a positional drop - it silently removes a different
# column if the CSV layout changes; confirm which column is meant.
data = data.drop(data.columns[3], axis=1)

# Separate features and target: drop the label and the first column.
X = data.drop(columns=['class', data.columns[0]])
y = data['class']

# Split BEFORE fitting any preprocessing step. Fitting the imputer/scaler on
# the full dataset would let test-set statistics (column means and standard
# deviations) leak into training and bias the reported test metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=seed)

# Handle NaN values by imputing with the column mean learned from the
# training data only; the test set is transformed with those same means.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Standardise using the training-set mean/variance only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Baseline: a Random Forest with default hyperparameters (only the seed is
# fixed). fit() returns the estimator itself, so construction and training
# are chained.
clf_before_tuning = RandomForestClassifier(random_state=seed).fit(X_train, y_train)

# Baseline predictions on the held-out test set
y_pred_before_tuning = clf_before_tuning.predict(X_test)

# Baseline evaluation: confusion matrix plus per-class precision/recall/F1
cm_before_tuning = confusion_matrix(y_test, y_pred_before_tuning)
class_report_before_tuning = classification_report(
    y_test,
    y_pred_before_tuning,
    target_names=['benign', 'dga'],
)

# Create and train the Random Forest model with hyperparameter tuning using GridSearchCV.
# 'auto' is deliberately not listed under max_features: for classifiers it was
# only an alias of 'sqrt' (so it duplicated a third of the grid), it was
# deprecated in scikit-learn 1.1 and removed in 1.3, where passing it raises.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'criterion': ['gini', 'entropy'],
}

# 5-fold cross-validated search over the grid on the training split only,
# scored by accuracy and run on all available cores.
clf_after_tuning = RandomForestClassifier(random_state=seed)
grid_search = GridSearchCV(estimator=clf_after_tuning, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Get the best model from the grid search (refit on the full training split)
best_rf = grid_search.best_estimator_

# Predict on the test set after tuning
y_pred_after_tuning = best_rf.predict(X_test)

# Confusion Matrix after tuning
cm_after_tuning = confusion_matrix(y_test, y_pred_after_tuning)

# Classification Report after tuning
class_report_after_tuning = classification_report(y_test, y_pred_after_tuning, target_names=['benign', 'dga'])

# Report the baseline metrics, one item per output line.
print(
    "# Results Before Hyperparameter Tuning",
    "#",
    "Confusion Matrix:",
    cm_before_tuning,
    "Classification Report:",
    class_report_before_tuning,
    "#",
    sep="\n",
)

# Report the tuned model: chosen hyperparameters followed by its metrics.
print(
    "# Results After Hyperparameter Tuning",
    "#",
    "Best Parameters:",
    grid_search.best_params_,
    "Confusion Matrix:",
    cm_after_tuning,
    "Classification Report:",
    class_report_after_tuning,
    sep="\n",
)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
# Results Before Hyperparameter Tuning
#
Confusion Matrix:
[[78263  1738]
 [ 2527 77472]]
Classification Report:
              precision    recall  f1-score   support

      benign       0.97      0.98      0.97     80001
         dga       0.98      0.97      0.97     79999

    accuracy                           0.97    160000
   macro avg       0.97      0.97      0.97    160000
weighted avg       0.97      0.97      0.97    160000

#
# Results After Hyperparameter Tuning
#
Best Parameters:
{'criterion': 'entropy', 'max_depth': 20, 'max_features': 'log2', 'n_estimators': 100}
Confusion Matrix:
[[78584  1417]
 [ 2693 77306]]
Classification Report:
              precision    recall  f1-score   support

      benign       0.97      0.98      0.97     80001
         dga       0.98      0.97      0.97     79999

    accuracy                           0.97    160000
   macro avg       0.97      0.97      0.97    160000
weighted