## Load CP4S Data

In [None]:
!pip install matplotlib
!pip install sklearn
!pip install git+https://github.com/IBM/ibm-cp4s-client.git

In [None]:
from cp4s.client import CP4S
from os import environ as env
# Build the CP4S client from credentials supplied through environment
# variables; a missing variable raises KeyError here.
ac = CP4S(
    url=env['CP4S_API_ENDPOINT'],
    username=env['CP4S_APIKEY_USERNAME'],
    password=env['CP4S_APIKEY_PASSWORD'],
)

In [None]:
# Run the search pattern against all configured data sources; the result is
# kept in `df` for the analysis cells below.
df = ac.search_df(query="[ipv4-addr:value = '127.0.0.1']", configs="all")

## Interactive analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Columns of `df` fed to the model as features.
feature_num_cols = [
    'network_traffic_dst_byte_count',
    'network_traffic_src_byte_count',
    'total_bytes',
]

# Feature matrix, and the source address used as the label of each row.
X = df[feature_num_cols].values
y = df['network_traffic_src_addr'].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3)
print('Training data size: %d' % len(X_train))
print(' Testing data size: %d' % len(X_test))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest (tree depth capped at 25, fixed seed) on the training
# split; fit() returns the estimator itself, so the call can be chained.
clf = RandomForestClassifier(max_depth=25, random_state=0).fit(X_train, y_train)

# Predicted labels for the training and the held-out rows.
y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

def normalize(probabilities):
    """Scale a vector of probabilities so that its largest entry becomes 1.0.

    Args:
        probabilities: Sequence or array of numeric values.

    Returns:
        A list with every value divided by the maximum of the input. An empty
        input yields an empty list, and an all-zero input yields zeros (rather
        than NaNs from a 0/0 division).
    """
    values = np.asarray(probabilities, dtype=float)
    if values.size == 0:
        # np.max raises ValueError on an empty array.
        return []
    if not values.any():
        # Every entry is zero: dividing by the maximum would produce NaNs.
        return list(np.zeros_like(values))
    return list(values / np.max(values))

# Score each test row by how unlikely the model finds its actual source
# address: 0.0 when that address is the model's top prediction, 1.0 when the
# model assigns it no probability at all.
y_proba = clf.predict_proba(X_test)
anomaly_scores = []
for group, y_proba_i in zip(y_test, y_proba):
    # Map each known class to its likelihood relative to the most likely
    # class, keeping only non-zero entries plus the row's actual address.
    details = {
        label: likelihood
        for label, likelihood in zip(clf.classes_, normalize(y_proba_i))
        if likelihood > 0.0 or label == group
    }
    # An address that never appeared in the training split is missing from
    # clf.classes_; treat it as probability 0 (fully anomalous) instead of
    # failing with KeyError.
    score = 1.0 - details.get(group, 0.0)
    anomaly_scores.append(score)
anomaly_scores = np.array(anomaly_scores)


# A row counts as an anomaly only when its score is exactly 1.0, i.e. the
# model gave the actual address zero probability.
anomalies = (anomaly_scores==1.).sum()
anomalies_ratio = anomalies/y_test.shape[0]
print('Anomalies  : %d' % anomalies)
print('Anomalies Ratio  : %f' % anomalies_ratio)

In [None]:
from matplotlib import pyplot as plt

# Pie chart of the share of normal vs. anomalous test samples
from sklearn.metrics import accuracy_score
# Two slices: the fraction of test rows not flagged as anomalies and the
# fraction that is, each annotated with its percentage.
labels = ['Normal', 'Anomaly']
colors = ['green', 'red']
plt.pie(
    [1 - anomalies_ratio, anomalies_ratio],
    labels=labels,
    colors=colors,
    startangle=90,
    autopct='%.1f%%',
)
plt.show()