# Feature Exploration for Proxy Model

- Group the features into several feature models, selected by column-name prefix
- Draw a boxplot and a PCA scatter plot for each feature group



In [None]:
# Default settings, constants

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Never truncate cell contents. None is the supported "no limit" value; the
# legacy -1 sentinel has been deprecated since pandas 1.0 and is rejected by
# newer releases, so it is only used as a fallback for pre-1.0 pandas.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
# Disable the chained-assignment (SettingWithCopy) check.
pd.set_option('mode.chained_assignment', None)

# Default (width, height) used for every matplotlib figure in this notebook.
FIGSIZE = (15, 8)
matplotlib.rc('figure', figsize=FIGSIZE)

In [None]:
# Data is from AQL.proxy_model query
from qradar import QRadar, AQL

# Open a QRadar session; replace the placeholders with real connection details.
qi = QRadar(
    console='YOUR-CONSOLE-IP-ADDRESS',
    username='admin',
    token='YOUR-SERVICE-TOKEN',
)

# Run the proxy-model query and load the result, replacing missing values with 0.
_df = pd.DataFrame.from_records(qi.search(AQL.proxy_model)).fillna(0)

print(_df.shape)
_df.head(10)

In [None]:
# Display summary statistics of the loaded data (notebook cell output).
_df.describe()

In [None]:
# Feature groups to explore. Each named group is matched against columns that
# start with its lower-cased name plus '_'; ALL stands for every column.
ALL = 'All Columns'
PREFIX = ['General', 'Network', 'Time', 'Proxy', ALL]

In [None]:
from sklearn import preprocessing
import matplotlib.pyplot as plt

def boxplot(df, prefix):
    """Draw a boxplot of the min-max scaled feature columns of ``df``.

    Args:
        df: DataFrame containing 'user', 'timeslice' and the feature columns.
        prefix: Feature-group name shown in the plot title.
    """
    # The text columns cannot be scaled or plotted, so remove them first.
    features = df.drop(['user', 'timeslice'], axis=1)

    # Alternatives: StandardScaler, MinMaxScaler, RobustScaler
    scaler = preprocessing.MinMaxScaler()
    scaled_values = scaler.fit_transform(features.values)
    scaled = pd.DataFrame(scaled_values, columns=features.columns)

    scaled.boxplot(figsize=FIGSIZE, rot=90)
    plt.title(f'Boxplot for {prefix}')
    plt.show()

# One boxplot per feature group; the ALL entry plots the full frame.
for prefix in PREFIX:
    if prefix == ALL:
        df = _df
    else:
        # Keep the text columns (boxplot() drops them itself) plus every
        # column whose name starts with '<prefix>_'.
        cols = ['user', 'timeslice'] + [
            col for col in _df.columns if col.startswith(f'{prefix.lower()}_')
        ]
        df = _df[cols]

    boxplot(df, prefix)

In [None]:
from sklearn.decomposition import PCA
from sklearn import preprocessing

# Column labels for the first and second principal component.
X, Y = 'PC 1', 'PC 2'

def pca(df, prefix):
    """Scatter-plot the first two principal components of a feature group.

    The 'user' and 'timeslice' columns are dropped, the remaining columns are
    standardized and then projected onto two principal components.

    Args:
        df: DataFrame containing 'user', 'timeslice' and the feature columns.
        prefix: Feature-group name shown in the plot title.
    """
    # drop text columns
    features = df.drop(['user', 'timeslice'], axis=1)

    # scale data or else some columns dominate
    # Alternatives: StandardScaler, MinMaxScaler, RobustScaler
    scaler = preprocessing.StandardScaler()
    standardized = pd.DataFrame(
        scaler.fit_transform(features.values),
        columns=features.columns,
    )

    # Project onto two components and store them next to the scaled features.
    projection = PCA(n_components=2).fit_transform(standardized)
    standardized[X] = projection[:, 0]
    standardized[Y] = projection[:, 1]

    standardized.plot(kind='scatter', x=X, y=Y, color='grey', s=1, title=f'PCA for {prefix}')
    plt.show()

# One PCA scatter plot per feature group; the ALL entry uses the full frame.
for prefix in PREFIX:
    if prefix == ALL:
        df = _df
    else:
        # Keep the text columns (pca() drops them itself) plus every column
        # whose name starts with '<prefix>_'.
        cols = ['user', 'timeslice'] + [
            col for col in _df.columns if col.startswith(f'{prefix.lower()}_')
        ]
        df = _df[cols]

    pca(df, prefix)

In [None]:
# users vs population, look for all outlier points and graph on PCA
# specific user vs self, plot own PCA