In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd

In [5]:
df = pd.read_csv('/content/drive/MyDrive/rs-anomic/Modified New/merged_data.csv')

In [6]:
df.shape

(1540570, 7)

In [7]:
df.head()

Unnamed: 0,container_fs_usage_bytes,container_memory_rss,container_cpu_system_seconds_total,container_network_receive_bytes_total,container_network_receive_errors_total,container_memory_failures_total,anomaly_type
0,0.010736,0.027859,0.000262,1e-06,0.0,0.0,0
1,0.000506,0.017803,0.000265,2e-06,0.0,0.0,0
2,0.000589,0.018193,0.00028,2e-06,0.0,0.0,0
3,0.000589,0.018082,0.000285,2e-06,0.0,0.0,0
4,0.000589,0.018008,0.00029,2e-06,0.0,0.0,0


In [8]:
# prompt: do the sampling using anomaly_type such that train and test datasets consists of all kinds of anomaly types

from sklearn.model_selection import train_test_split

# Assuming 'anomaly_type' is the column containing the anomaly type information
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['anomaly_type'], random_state=42)

# Verify the distribution of anomaly types in both train and test sets
print("Train set anomaly type distribution:")
print(train_df['anomaly_type'].value_counts(normalize=True))

print("\nTest set anomaly type distribution:")
print(test_df['anomaly_type'].value_counts(normalize=True))


Train set anomaly type distribution:
anomaly_type
0     0.782593
12    0.039742
4     0.032499
5     0.021802
7     0.018358
10    0.017533
13    0.009815
9     0.009815
2     0.009815
11    0.009815
3     0.009815
8     0.009815
6     0.009813
1     0.009776
14    0.008997
Name: proportion, dtype: float64

Test set anomaly type distribution:
anomaly_type
0     0.782593
12    0.039742
4     0.032498
5     0.021804
7     0.018357
10    0.017532
6     0.009815
13    0.009815
8     0.009815
9     0.009815
11    0.009815
3     0.009815
2     0.009815
1     0.009776
14    0.008997
Name: proportion, dtype: float64


In [9]:
# prompt: undersample the anomaly_type = 0 such that it matches the number of non zero anomaly type count

from sklearn.utils import resample

# Separate majority and minority classes
df_majority = train_df[train_df.anomaly_type == 0]
df_minority = train_df[train_df.anomaly_type != 0]

# Downsample majority class
df_majority_downsampled = resample(df_majority,
                                 replace=False,  # sample without replacement
                                 n_samples=len(df_minority),  # to match minority class
                                 random_state=42)  # reproducible results

# Combine minority class with downsampled majority class
train_df_undersampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
print("Train set anomaly type distribution after undersampling:")
print(train_df_undersampled['anomaly_type'].value_counts(normalize=True))


Train set anomaly type distribution after undersampling:
anomaly_type
0     0.500000
12    0.091400
4     0.074741
5     0.050141
7     0.042222
10    0.040324
13    0.022572
9     0.022572
2     0.022572
11    0.022572
3     0.022572
8     0.022572
6     0.022568
1     0.022482
14    0.020691
Name: proportion, dtype: float64


In [10]:

# Separate majority and minority classes in the test set
df_majority_test = test_df[test_df.anomaly_type == 0]
df_minority_test = test_df[test_df.anomaly_type != 0]

# Downsample majority class in the test set
df_majority_downsampled_test = resample(df_majority_test,
                                 replace=False,  # sample without replacement
                                 n_samples=len(df_minority_test),  # to match minority class
                                 random_state=42)  # reproducible results

# Combine minority class with downsampled majority class in the test set
test_df_undersampled = pd.concat([df_majority_downsampled_test, df_minority_test])

# Display new class counts in the test set
print("Test set anomaly type distribution after undersampling:")
print(test_df_undersampled['anomaly_type'].value_counts(normalize=True))


Test set anomaly type distribution after undersampling:
anomaly_type
0     0.500000
12    0.091400
4     0.074739
5     0.050145
7     0.042218
10    0.040322
6     0.022572
13    0.022572
8     0.022572
9     0.022572
11    0.022572
3     0.022572
2     0.022572
1     0.022482
14    0.020691
Name: proportion, dtype: float64


In [11]:
# Prepare the data for model training and testing
X_train = train_df_undersampled.drop('anomaly_type', axis=1)
y_train = train_df_undersampled['anomaly_type']
X_test = test_df_undersampled.drop('anomaly_type', axis=1)
y_test = test_df_undersampled['anomaly_type']

In [12]:
# prompt: fit the test dataset to various ml models

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Define a list of models to evaluate
models = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    #SVC()
    XGBClassifier(),
    AdaBoostClassifier(),
    KNeighborsClassifier()
]

# Define a list of evaluation metrics
metrics = [accuracy_score, precision_score, recall_score, f1_score]

# Iterate through each model and evaluate its performance
for model in models:
  print(f"\nEvaluating {model.__class__.__name__}")

  # Train the model
  model.fit(X_train, y_train)

  # Make predictions on the test set
  y_pred = model.predict(X_test)

  # Calculate evaluation metrics
  for metric in metrics:
    if metric == precision_score or metric == recall_score or metric == f1_score:
      score = metric(y_test, y_pred, average='weighted')
    else:
      score = metric(y_test, y_pred)
    print(f"{metric.__name__}: {score:.4f}")

  print(classification_report(y_test, y_pred))



Evaluating LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


accuracy_score: 0.5341
precision_score: 0.4449
recall_score: 0.5341
f1_score: 0.4448


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.73      0.94      0.82     66986
           1       0.00      0.00      0.00      3012
           2       1.00      0.07      0.13      3024
           3       0.00      0.00      0.00      3024
           4       0.06      0.01      0.02     10013
           5       0.00      0.00      0.00      6718
           6       0.00      0.00      0.00      3024
           7       0.85      0.05      0.10      5656
           8       0.00      0.00      0.00      3024
           9       0.00      0.00      0.00      3024
          10       0.00      0.00      0.00      5402
          11       0.00      0.00      0.00      3024
          12       0.17      0.64      0.27     12245
          13       0.00      0.00      0.00      3024
          14       0.00      0.00      0.00      2772

    accuracy                           0.53    133972
   macro avg       0.19      0.11      0.09    133972
weighted avg       0.44   



accuracy_score: 0.4865
precision_score: 0.4861
recall_score: 0.4865


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


f1_score: 0.4582


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.77      0.78      0.77     66986
           1       0.76      0.02      0.04      3012
           2       0.00      0.00      0.00      3024
           3       0.20      0.03      0.05      3024
           4       0.19      0.24      0.21     10013
           5       0.19      0.07      0.10      6718
           6       0.09      0.12      0.10      3024
           7       0.34      0.08      0.14      5656
           8       0.50      0.04      0.07      3024
           9       0.00      0.00      0.00      3024
          10       0.05      0.11      0.07      5402
          11       0.14      0.04      0.06      3024
          12       0.27      0.67      0.38     12245
          13       0.00      0.00      0.00      3024
          14       0.00      0.00      0.00      2772

    accuracy                           0.49    133972
   macro avg       0.23      0.15      0.13    133972
weighted avg       0.49   

In [None]:
# prompt: use other models

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier


# Define a list of models to evaluate, including the new ones
models = [
    GradientBoostingClassifier(),  # Added
    MLPClassifier()  # Added
]

# Define a list of evaluation metrics
metrics = [accuracy_score, precision_score, recall_score, f1_score]

# Iterate through each model and evaluate its performance
for model in models:
  print(f"\nEvaluating {model.__class__.__name__}")

  # Train the model
  model.fit(X_train, y_train)

  # Make predictions on the test set
  y_pred = model.predict(X_test)

  # Calculate evaluation metrics
  for metric in metrics:
    if metric == precision_score or metric == recall_score or metric == f1_score:
      score = metric(y_test, y_pred, average='weighted')
    else:
      score = metric(y_test, y_pred)
    print(f"{metric.__name__}: {score:.4f}")

  print(classification_report(y_test, y_pred))



Evaluating GradientBoostingClassifier
accuracy_score: 0.7974
precision_score: 0.8107
recall_score: 0.7974
f1_score: 0.7951
              precision    recall  f1-score   support

           0       0.98      0.96      0.97     66986
           1       0.57      0.52      0.55      3012
           2       0.68      0.57      0.62      3024
           3       0.62      0.32      0.43      3024
           4       0.42      0.70      0.52     10013
           5       0.98      0.90      0.94      6718
           6       0.67      0.62      0.64      3024
           7       0.68      0.68      0.68      5656
           8       0.73      0.30      0.42      3024
           9       0.53      0.27      0.36      3024
          10       0.76      0.78      0.77      5402
          11       0.60      0.39      0.47      3024
          12       0.68      0.80      0.73     12245
          13       0.55      0.29      0.38      3024
          14       0.48      0.58      0.52      2772

    accura

In [None]:
# prompt: export the model

import pickle

# Choose the best performing model from the evaluation results
best_model = RandomForestClassifier()  # Replace with the actual best model
best_model.fit(X_train, y_train)

# Save the model to a file
filename = '/content/drive/MyDrive/rs-anomic/Modified New/anomaly_detection_model.sav'
pickle.dump(best_model, open(filename, 'wb'))

print(f"Model saved to {filename}")


Model saved to /content/drive/MyDrive/rs-anomic/Modified New/anomaly_detection_model.sav
