## Load CP4S Data

In [None]:
!pip install matplotlib
!pip install sklearn
!pip install git+https://github.com/IBM/ibm-cp4s-client.git

In [None]:
from cp4s.client import CP4S
from os import environ as env
# Build the CP4S client; endpoint and API-key credentials are read from
# environment variables (a missing variable raises KeyError).
ac = CP4S(
    url=env['CP4S_API_ENDPOINT'],
    username=env['CP4S_APIKEY_USERNAME'],
    password=env['CP4S_APIKEY_PASSWORD'],
)

In [None]:
# Run the search for the pattern below with configs="all"; the result is kept
# in mdf and used as a pandas DataFrame by the following cells.
mdf = ac.search_df(
    query="[ipv4-addr:value = '127.0.0.1']",
    configs="all",
)

## Interactive analysis

In [None]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.dates as md
from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from datetime import datetime
from sklearn.ensemble import IsolationForest

# count the child processes of a row's process
def getChildCount(row):
    """Return how many rows of ``new_df`` name this row's process as parent.

    Reads the module-level ``new_df`` frame, which must already exist when
    this function is called (it is meant for ``new_df.apply(..., axis=1)``).

    Args:
        row: A row of ``new_df`` (or any mapping) with a ``'process_pid'`` entry.

    Returns:
        int: Number of rows in ``new_df`` whose ``'process_parent_pid'``
        equals ``row['process_pid']``.
    """
    # A single vectorised comparison replaces a Python-level loop over every
    # index label, and does not depend on the index labels being unique.
    is_child = new_df['process_parent_pid'] == row['process_pid']
    return int(is_child.sum())

In [None]:
# Discard the raw columns this analysis does not use ...
File1 = mdf.drop(columns=[
    'domain_name',
    'process_binary_name',
    'process_creator_user_ref',
    'process_opened_connection_binary_hashes_md5',
    'process_opened_connection_binary_name',
    'process_opened_connection_command_line',
    'process_opened_connection_created',
    'process_opened_connection_creator_user_ref',
    'process_opened_connection_name',
    'process_opened_connection_opened_connection_',
    'process_opened_connection_parent_name',
    'process_opened_connection_parent_pid',
    'process_opened_connection_pid',
    'process_opened_connection_src_addr',
    'process_parent_binary_hashes_md5',
    'process_parent_binary_name',
])
# ... and give the remaining ones shorter names.
new_df = File1.rename(columns={
    'process_creator_user_user_id': 'proc_username',
    'process_opened_connection_count': 'proc_netconn_count',
    'process_parent_name': 'parent_name',
    'user_account_user_id': 'proc_hostname',
    'process_binary_hashes_md5': 'proc_md5',
    'process_command_line': 'proc_cmdline',
})

# Derived features: number of child processes per row, and the time between
# first and last observation.
new_df['proc_child_count'] = new_df.apply(getChildCount, axis=1)
new_df['duration'] = (
    pd.to_datetime(new_df['last_observed'])
    - pd.to_datetime(new_df['first_observed'])
)

# The columns below are no longer needed once the derived features exist.
new_df = new_df.drop(columns=[
    'created_by_ref',
    'first_observed',
    'id',
    'last_observed',
    'network_traffic_src_addr',
    'process_created',
    'tod',
    'cmd_len',
    'network_traffic_dst_addr',
    'process_parent_pid',
    'process_pid',
    'proc_hostname',
    'process_opened_connection_dst_addr',
])

In [None]:
# build a per-column lookup of how often each distinct value occurs
def CreateCountDict():
    """Count the occurrences of every distinct value in the feature columns.

    Reads the module-level ``new_df`` frame.

    Returns:
        dict: ``{column_name: {value: count}}`` for each of the seven feature
        columns listed below. Missing values (NaN) are not counted, which is
        the ``value_counts`` default.
    """
    cols = ['proc_username', 'proc_cmdline', 'proc_md5', 'parent_name',
            'proc_child_count', 'proc_netconn_count', 'process_name']
    # Key each entry by the column name explicitly. Going through
    # pd.DataFrame(...).to_dict() keys it by the Series' name instead, which
    # is 'count' (not the column name) from pandas 2.0 on, so every column
    # would overwrite the same single entry.
    return {col: new_df[col].value_counts().to_dict() for col in cols}

# replace every value of the feature columns by its occurrence count
def CountNormRepresntation(ProcessData):
    """Return a copy of ``ProcessData`` with feature values swapped for counts.

    Each value in the seven feature columns is replaced by the count stored
    for it in the module-level ``FinalDict_x`` lookup
    (``FinalDict_x[column][value]``). Other columns are left as they are.

    Args:
        ProcessData: DataFrame containing the seven feature columns.

    Returns:
        DataFrame: A new frame; ``ProcessData`` itself is not modified.
        A value with no entry in the lookup becomes NaN.
    """
    # Deep copy so that writes can never reach the caller's frame.
    ProcessDataC = ProcessData.copy()
    cols = ['proc_username', 'proc_cmdline', 'proc_md5', 'parent_name',
            'proc_child_count', 'proc_netconn_count', 'process_name']
    for col in cols:
        # map() translates all values in one pass. Replacing the values one
        # after another is wrong for numeric columns: a count written by an
        # earlier step can equal an original value that is still to be
        # processed and then gets replaced a second time.
        ProcessDataC[col] = ProcessDataC[col].map(FinalDict_x[col])
    return ProcessDataC

In [None]:
# Fill missing values with the placeholder label "UnK".
new_df = new_df.fillna("UnK")

# Build the value-count lookup, then encode every feature value as its count.
FinalDict_x = CreateCountDict()
ProcessDataC = CountNormRepresntation(new_df)

# Standardise each encoded column: subtract its mean, divide by its standard
# deviation.
cols_to_norm = [
    'proc_username',
    'proc_cmdline',
    'proc_md5',
    'parent_name',
    'process_name',
    'proc_netconn_count',
    'proc_child_count',
]
ProcessDataC[cols_to_norm] = ProcessDataC[cols_to_norm].apply(
    lambda col: (col - col.mean()) / (col.std())
)

# Drop the columns that, per the original note, carry no information because
# they hold the same value throughout.
ProcessDataC = ProcessDataC.drop(
    columns=['proc_netconn_count', 'proc_child_count', 'duration']
)

In [None]:
# Project the encoded features onto two principal components for plotting.
pca = PCA(n_components=2)
datanew = pca.fit_transform(ProcessDataC)

# Standardise the two components.
# NOTE: despite its name, min_max_scaler is a StandardScaler.
min_max_scaler = preprocessing.StandardScaler()
np_scaled = min_max_scaler.fit_transform(datanew)
# Carry over ProcessDataC's index: the column assignments further down align
# on index labels, so a fresh 0..n-1 index would produce NaN / misplaced rows
# whenever ProcessDataC is not indexed by the default range.
datanew = pd.DataFrame(np_scaled, index=ProcessDataC.index)

# Elbow method: fit KMeans for 1..10 clusters and plot the score of each fit.
from sklearn.cluster import KMeans
n_cluster = range(1, 11)
kmeans = [KMeans(n_clusters=i).fit(datanew) for i in n_cluster]
scores = [model.score(datanew) for model in kmeans]
fig, ax = plt.subplots()
ax.plot(n_cluster, scores)
plt.show()

# kmeans[1] is the model fitted with n_clusters=2.
ProcessDataC['cluster'] = kmeans[1].predict(datanew)
print(ProcessDataC['cluster'].value_counts())

ProcessDataC['principal_feature1'] = datanew[0]
ProcessDataC['principal_feature2'] = datanew[1]

# Scatter plot of the two components, coloured by cluster label.
fig, ax = plt.subplots()
colors = {0:'red', 1:'blue'}
ax.scatter(
    ProcessDataC['principal_feature1'],
    ProcessDataC['principal_feature2'],
    c=ProcessDataC["cluster"].apply(lambda label: colors[label]),
)
plt.show()

In [None]:
# Pick the rows assigned to cluster 0 and list their distinct command lines.
in_cluster_0 = ProcessDataC["cluster"] == 0
x = new_df[in_cluster_0]
x['proc_cmdline'].unique()

In [None]:
# Display the cluster-0 rows that the preceding cell selected into x.
x

In [None]:
# Pick the rows assigned to cluster 1 and display them.
in_cluster_1 = ProcessDataC["cluster"] == 1
x = new_df[in_cluster_1]
x

## Open a CP4S Case