In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

FILE_PATH = '' # Path to the .binetflow file. The dataset can be downloaded via the link in the README

def load_and_process_data(filepath, random_state=42, max_bots=20000):
    """Load a .binetflow capture and turn it into a feature matrix and labels.

    The file is read in chunks of 500000 rows. From every chunk all rows whose
    ``Label`` contains ``'Botnet'`` are kept together with a 10% random sample
    of the remaining rows. Reading stops after the chunk in which the running
    count of botnet rows exceeds ``max_bots``.

    Args:
        filepath: Path passed to ``pd.read_csv`` (lines starting with ``#``
            are treated as comments).
        random_state: Seed for the down-sampling of non-botnet rows, so that
            repeated runs produce the same dataset.
        max_bots: Stop reading once more than this many botnet rows have
            been collected.

    Returns:
        Tuple ``(X, y, feature_names)``: ``X`` is the encoded feature frame
        with missing and infinite values replaced by 0, ``y`` is the 0/1
        ``is_botnet`` series and ``feature_names`` is ``X.columns`` as a list.

    Raises:
        ValueError: If the file yields no rows at all.
    """
    container = []
    found_bots = 0

    # The context manager closes the underlying file even though the loop
    # may stop before the reader is exhausted.
    with pd.read_csv(filepath, comment='#', chunksize=500000) as chunks:
        for chunk in chunks:
            chunk = chunk.dropna(subset=['Label'])
            # Compute the mask once; astype(str) keeps the .str accessor
            # usable when a chunk ends up empty after dropna.
            bot_mask = chunk['Label'].astype(str).str.contains('Botnet', regex=False)
            bots = chunk[bot_mask]
            normal = chunk[~bot_mask].sample(frac=0.1, random_state=random_state)

            container.append(bots)
            container.append(normal)
            found_bots += len(bots)

            if found_bots > max_bots:
                break

    if not container:
        raise ValueError(f"No rows could be read from {filepath!r}")

    df = pd.concat(container)
    df['is_botnet'] = df['Label'].astype(str).str.contains('Botnet', regex=False).astype(int)

    # Avoid division by zero for flows with zero duration.
    df['Dur'] = df['Dur'].replace(0, 0.000001)
    df['Pkts_per_Sec'] = df['TotPkts'] / df['Dur']
    df['Bytes_per_Sec'] = df['TotBytes'] / df['Dur']
    df['Bytes_per_Pkt'] = df['TotBytes'] / df['TotPkts']

    cols_to_encode = ['Proto', 'State', 'Dir']
    df_encoded = pd.get_dummies(df, columns=cols_to_encode, dummy_na=False)

    drop_cols = ['StartTime', 'SrcAddr', 'DstAddr', 'Sport', 'Dport', 'Label', 'is_botnet']
    X = df_encoded.drop(columns=[c for c in drop_cols if c in df_encoded.columns])
    y = df['is_botnet']
    # A zero TotPkts makes Bytes_per_Pkt infinite; fillna alone would leave
    # that value in place, so map +/-inf to NaN first.
    X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

    return X, y, X.columns.tolist()

# Build the feature matrix and labels from the capture referenced by FILE_PATH.
X, y, feature_names = load_and_process_data(FILE_PATH)

# Stratify on the label so the train and test parts keep the same
# botnet/normal ratio (normal traffic is down-sampled, botnet rows are not).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred, target_names=['Normal', 'Botnet']))

# Save the model together with the column order it was trained on.
os.makedirs('models', exist_ok=True)

joblib.dump(clf, 'models/bad_botnet_detector.pkl')
joblib.dump(feature_names, 'models/bad_features.pkl')

# Plot the seven most important features of the fitted forest.
feature_imp = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False).head(7)
plt.figure(figsize=(12, 8))
sns.barplot(x=feature_imp, y=feature_imp.index)
plt.title("анализ признаков")
plt.tight_layout()
plt.show()

In [2]:
# Keep the CPU from running at 100% (which, per the original note, was
# breaking Hyprland) by capping each native thread pool at two threads.
# NOTE(review): these variables are typically read when the numeric libraries
# initialise, and numpy/scikit-learn are imported in an earlier cell - confirm
# that the limits actually take effect when set here.
os.environ.update({
    'OMP_NUM_THREADS': '2',         # limit for OpenMP
    'MKL_NUM_THREADS': '2',         # limit for MKL
    'OPENBLAS_NUM_THREADS': '2',    # limit for OpenBLAS
    'VECLIB_MAXIMUM_THREADS': '2',  # for macOS
})

In [None]:
files = [
#    '../data/capture20110810.binetflow', |
#    '../data/capture20110811.binetflow', |  List the paths to your own dataset files here
#    '../data/capture20110812.binetflow'  |
]

def build_dataset(d_files):
    """Combine several .binetflow captures into one down-sampled frame.

    Every existing file is read in chunks of 400000 rows. From each chunk all
    rows whose ``Label`` contains ``'Botnet'`` are kept together with a 2%
    random sample (``random_state=42``) of the remaining rows.

    Args:
        d_files: Iterable of file paths. Paths that do not exist are skipped
            with a warning.

    Returns:
        A single DataFrame with the selected rows of all files.

    Raises:
        ValueError: If no rows could be collected (empty list, or none of the
            files exist).
    """
    import warnings

    all_data = []
    for f in d_files:
        if not os.path.exists(f):
            # Make the skip visible: a typo in a path would otherwise
            # silently shrink the training set.
            warnings.warn(f"Skipping missing file: {f}")
            continue
        with pd.read_csv(f, comment='#', chunksize=400000) as chunks:
            for chunk in chunks:
                chunk = chunk.dropna(subset=['Label'])
                # Compute the mask once; astype(str) keeps the .str accessor
                # usable when a chunk ends up empty after dropna.
                bot_mask = chunk['Label'].astype(str).str.contains('Botnet', regex=False)
                all_data.append(chunk[bot_mask])
                all_data.append(chunk[~bot_mask].sample(frac=0.02, random_state=42))

    if not all_data:
        raise ValueError("No data loaded: the file list is empty or none of the files exist")
    return pd.concat(all_data)

# Assemble the training frame from every capture listed in `files`.
# (The helper is named build_dataset; `all_data` is only a local inside it.)
df = build_dataset(files)

# Avoid division by zero for flows with zero duration.
df['Dur'] = df['Dur'].replace(0, 0.000001)
df['Pkts_per_Sec'] = df['TotPkts'] / df['Dur']
df['Bytes_per_Sec'] = df['TotBytes'] / df['Dur']
df['is_botnet'] = df['Label'].apply(lambda x: 1 if 'Botnet' in str(x) else 0)

df_encoded = pd.get_dummies(df, columns=['Proto', 'State', 'Dir'])

X = df_encoded.drop(columns=['StartTime', 'SrcAddr', 'DstAddr', 'Sport', 'Dport', 'Label', 'is_botnet'])
# Zero-fill missing values before fitting, as the single-capture training
# cell does, so both models are trained on identically prepared features.
X = X.fillna(0)
y = df['is_botnet']

model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
model.fit(X, y)

# Save the model together with the column order it was trained on.
os.makedirs('models', exist_ok=True)

joblib.dump(model, 'models/universal_botnet_detector.pkl')
joblib.dump(X.columns.tolist(), 'models/universal_features.pkl')

In [None]:
def test_model(file, model_p, feat_p, max_chunks=3):
    """Evaluate a saved model on (a prefix of) a .binetflow capture.

    The capture is read in chunks of 500000 rows. From each chunk all rows
    whose ``Label`` contains ``'Botnet'`` are kept together with a 5% random
    sample (``random_state=42``) of the remaining rows. The engineered
    features use the same column names as the training cells, so that
    aligning to the saved feature list keeps their values.

    Args:
        file: Path of the capture to evaluate on.
        model_p: Path of the model saved with ``joblib.dump``.
        feat_p: Path of the saved list of training column names.
        max_chunks: Number of chunks to read from the start of the file;
            ``None`` reads the whole file.

    Returns:
        Tuple ``(accuracy, f1, precision)``; F1 and precision refer to the
        botnet class and are 0 when that class is absent from the report.

    Raises:
        ValueError: If the capture yields no rows.
    """
    # SECURITY: joblib.load unpickles arbitrary objects - only pass model and
    # feature files that come from a trusted source.
    model = joblib.load(model_p)
    f_names = joblib.load(feat_p)

    data_list = []
    # The context manager closes the file even though reading stops early.
    with pd.read_csv(file, comment='#', chunksize=500000) as chunks:
        for i, chunk in enumerate(chunks):
            chunk = chunk.dropna(subset=['Label'])
            bot_mask = chunk['Label'].astype(str).str.contains('Botnet', regex=False)

            data_list.append(chunk[bot_mask])
            data_list.append(chunk[~bot_mask].sample(frac=0.05, random_state=42))
            if max_chunks is not None and i + 1 >= max_chunks:
                break

    if not data_list:
        raise ValueError(f"No rows could be read from {file!r}")

    df = pd.concat(data_list)

    # Avoid division by zero for flows with zero duration.
    df['Dur'] = df['Dur'].replace(0, 0.000001)
    # These names must match the training cells: reindex() below fills any
    # column it cannot find with 0, which would silently blank the feature.
    df['Pkts_per_Sec'] = df['TotPkts'] / df['Dur']
    df['Bytes_per_Sec'] = df['TotBytes'] / df['Dur']
    df['Bytes_per_Pkt'] = df['TotBytes'] / df['TotPkts']

    df_enc = pd.get_dummies(df, columns=['Proto', 'State', 'Dir'])

    # Align to the training columns: unseen dummies are dropped, missing
    # ones are added as 0; then remove NaN/inf the estimator cannot take.
    X = df_enc.reindex(columns=f_names, fill_value=0)
    X = X.replace([np.inf, -np.inf], np.nan).fillna(0)
    y = df['Label'].apply(lambda x: 1 if 'Botnet' in str(x) else 0)

    y_pred = model.predict(X)

    acc = accuracy_score(y, y_pred)
    rep = classification_report(y, y_pred, output_dict=True, zero_division=0)

    # The report is keyed by the string form of the class label.
    bot_report = rep.get('1', {})
    f1 = bot_report.get('f1-score', 0)
    prec = bot_report.get('precision', 0)

    return acc, f1, prec

files = [
    # '../data/capture20110810.binetflow',  | Same as above: list your own capture files
    #               .   .  .                | 
]

results = []

for f in files:
    try:
        a, f1, p = test_model(f, 'models/universal_botnet_detector.pkl', 'models/universal_features.pkl')
    except Exception as exc:
        # Keep going with the remaining files, but show why this one failed.
        # Catching Exception (not a bare except) lets Ctrl-C still interrupt.
        print(f"Error: {f}: {exc!r}")
        continue
    results.append({
        'File': os.path.basename(f),
        'Acc': round(a, 3),
        'F1': round(f1, 3),
        'Prec': round(p, 3),
    })

print(pd.DataFrame(results))

In [None]:


# `results` is the list of per-file metric dicts ('File', 'Acc', 'F1') built by
# the evaluation cell; turn it into a frame before doing column operations.
results_df = pd.DataFrame(results)

if results_df.empty:
    print("No evaluation results to plot")
else:
    # 'File' holds the bare file name, so the short label is the name
    # without its extension.
    results_df['Short_Name'] = results_df['File'].apply(lambda x: os.path.splitext(x)[0])

    plt.figure(figsize=(12, 6))
    plt.bar(results_df['Short_Name'], results_df['F1'], color='teal')

    plt.title('Эффективность детектирования (F1-Score) по сценариям CTU-13')
    plt.xlabel('Сценарии / Папки данных')
    plt.ylabel('F1-Score')
    plt.ylim(0, 1.0)
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.tight_layout()
    # savefig does not create missing directories.
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/f1_results.png', dpi=300)
    plt.show()

In [None]:
# Pair each importance of the single-capture model `clf` with its column name.
# `feature_names` is the column list returned alongside the data `clf` was
# trained on, so both sequences have the same order and length.
importance = pd.DataFrame({
    'Признак': feature_names,
    'Важность': clf.feature_importances_
}).sort_values(by='Важность', ascending=False)

top_10 = importance.head(10)

plt.figure(figsize=(10, 6))
plt.barh(top_10['Признак'], top_10['Важность'], color='darkorange')
# Put the most important feature at the top of the chart.
plt.gca().invert_yaxis()
plt.title('Топ-10 признаков, определяющих ботнет-трафик')
plt.xlabel('Относительная важность')
plt.tight_layout()

plt.savefig('feature_importance.png', dpi=300)
plt.show()

In [None]:
# Build the summary table from `results`, the list of per-file metric dicts
# ('File', 'Acc', 'F1') produced by the evaluation cell.
results_table = pd.DataFrame(results, columns=['File', 'Acc', 'F1'])

plot_df = results_table.copy()
# 'File' holds the bare file name; show it without the extension.
plot_df['File'] = plot_df['File'].apply(lambda x: os.path.splitext(str(x))[0])
plot_df = plot_df.rename(columns={
    'File': 'Сценарий',
    'Acc': 'Точность (Acc)',
    'F1': 'F1-Score (Боты)',
})

if plot_df.empty:
    print("No evaluation results to tabulate")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.axis('off')
    ax.axis('tight')

    table = ax.table(cellText=plot_df.values,
                     colLabels=plot_df.columns,
                     cellLoc='center',
                     loc='center',
                     colColours=["#f2f2f2"] * len(plot_df.columns))

    # Fix the font size and enlarge the cells so the table stays readable.
    table.auto_set_font_size(False)
    table.set_fontsize(12)
    table.scale(1.2, 2)

    plt.title('Сводные показатели эффективности модели по сценариям', pad=20, fontsize=14)
    plt.savefig('metrics_table.png', dpi=300, bbox_inches='tight')
    plt.show()