In [None]:
# %%writefile filename.py
# %load executor.py
# %load adaptive_algo.py
# %load feedback_collector.py
# %load UC__executor_clustering.py

In [3]:
%load SL1_ImportData.py

# Importing Data
'''
Description: 
    This file provides functions that are used for importing data.
Function this file Contains:
    - ImportData: Used to import data either from BQ or from Storage.
'''

# ----------------------------------------------- Loading Libraries ----------------------------------------------- #
import pandas as pd
import glob, os, ast, time
from datetime import datetime, date, timedelta
from SL0_GeneralFunc import LevBasedPrint, AddRecommendation


# ------------------------------------------ GrabAnySizeDatafromGoogleBQ ------------------------------------------ #
def Exec_BQ(query, projectid):
    '''
    Run a query on Google BigQuery and return the result.

    Parameters:
        query: text of the query to execute.
        projectid: forwarded to read_gbq as `project_id`.
    Returns:
        Whatever read_gbq returns for the query (a pandas DataFrame).
    '''
    LevBasedPrint('Inside "'+Exec_BQ.__name__+'" function.',3,1)
    LevBasedPrint('',3,1)
    # Only the non-default arguments are passed. The original call also spelled out
    # index_col=None, col_order=None, reauth=False and private_key=None; the first
    # three are the defaults anyway, and `private_key` (like `verbose`) is a keyword
    # that newer pandas releases no longer accept.
    # NOTE(review): the table clauses built in this module use TABLE_QUERY([...]),
    # which is legacy-SQL syntax -- confirm the dialect the installed pandas-gbq uses.
    return pd.io.gbq.read_gbq(query, project_id=projectid)


def GenerateTableNames(config):
    '''
    Build the TABLE_QUERY(...) clauses naming every table that should be read.

    Values read from `config`:
        ['DomainConfig']['SIDs']                          literal list of SIDs
        ['DomainConfig']['UseStaticOrDynamicCurrentDay']  'static' or 'dynamic'
        ['IfStatic']['Date']                 (static)  literal list of ddmmyy strings
        ['IfStatic']['DataGrabWindow_Days']  (static)  '-' keeps the listed dates as
                                             they are; otherwise that many consecutive
                                             days starting at the first listed date
        ['IfDynamic']['DataGrabWindow_Hr']   (dynamic) hours to look back from now (UTC)

    Returns:
        One string holding a comma-terminated clause per SID / date combination.
    Raises:
        Exception: when the methodology is neither 'static' nor 'dynamic'
        (the message is also passed to AddRecommendation).
    '''
    # -----------<<<  Setting constant values that are to be used inside function  >>>----------- #
    DatasetName = 'ss-production-storage'
    SIDs = ast.literal_eval(config['DomainConfig']['SIDs'])
    DataGrabMethodology = config['DomainConfig']['UseStaticOrDynamicCurrentDay']
    LevBasedPrint('Inside "'+GenerateTableNames.__name__+'" function and configurations for this has been set.',3,1)
    LevBasedPrint('Data collection methodology that has been selected : ' + str(DataGrabMethodology),3)
    if DataGrabMethodology == 'static':
        Dates = ast.literal_eval(config['IfStatic']['Date'])
        StaDataWindow = ast.literal_eval(config['IfStatic']['DataGrabWindow_Days'])
    elif DataGrabMethodology == 'dynamic':
        DynDataWindow = int(ast.literal_eval(config['IfDynamic']['DataGrabWindow_Hr']))
    else:
        txt = 'Exception: Wrong Configuration has been passed in "UseStaticOrDynamicCurrentDay".'
        AddRecommendation(txt, config)
        raise Exception(txt)

    # -----------------------------<<<  Generating Table Names  >>>------------------------------ #
    if DataGrabMethodology == 'static':
        if StaDataWindow != '-':
            # Dates are ddmmyy; the two-digit year is always taken as 20yy.
            FirstDay = date(2000 + int(Dates[0][4:6]), int(Dates[0][2:4]), int(Dates[0][0:2]))
            Dates = [(FirstDay + timedelta(days=i)).strftime('%d%m%y') for i in range(int(StaDataWindow))]
        TableSuffixes = Dates
        LikeTail = '_%'   # daily selection: "<SID>_<ddmmyy>_%"
    else:
        # Take ONE snapshot of the UTC clock. Reading time.gmtime() separately for every
        # field could mix values from both sides of a second/hour/day boundary.
        Now = time.gmtime()
        CurrentTime = datetime(*Now[:6])
        # Offsets run from DynDataWindow hours back down to -1. Offset 0 is the current
        # hour; offset -1 is one hour AHEAD of now, kept exactly as in the original loop.
        # NOTE(review): the original comment says -1 is there to include the current
        # hour's table -- confirm that listing the next hour is intended.
        TableSuffixes = [
            (CurrentTime - timedelta(hours=HoursBack)).strftime('%d%m%y_%H')
            for HoursBack in range(DynDataWindow, -2, -1)
        ]
        LikeTail = '%'    # hourly selection: "<SID>_<ddmmyy>_<HH>%"

    TableToInclude = ''.join(
        '\n\tTABLE_QUERY([{}.Citadel_Stream],\'table_id like "{}_{}{}"\'),'.format(DatasetName, SID, Suffix, LikeTail)
        for SID in SIDs
        for Suffix in TableSuffixes
    )

    if DataGrabMethodology == 'dynamic':
        # The original incremented the counter by 0, so this line always reported 0.
        TableCnt = len(SIDs) * len(TableSuffixes)
        LevBasedPrint('Total number of tables accessed : '+str(TableCnt),3)
    # ------------------------------------------------------------------------------------------- #
    LevBasedPrint('',3,1)
    return TableToInclude
    # ------------------------------------------------------------------------------------------- #


def GrabAnySizeDatafromGoogleBQ(config):
    '''
    Download the complete result of the configured BigQuery query in pages.

    The query text is read from the file named in
    config['InputPaths']['BQ_DataImportQuery'] and filled in with four fields:
    BinToUse, TableToInclude, lim and off. Pages of `lim` rows are requested at
    increasing offsets until the offset reaches config['BigQueryConfig']['BQ_LimitToStart'].
    If any page fails, the whole download is retried with a smaller page size
    (previous size divided by BQ_LimitDecreaseFactor, never 1000 rows or fewer).

    Returns:
        A DataFrame with all pages concatenated, or an empty DataFrame when every
        page size failed. Each failure is logged and passed to AddRecommendation.
    '''
    # -----------<<<  Setting constant values that are to be used inside function  >>>----------- #
    ModuleSetting = config['Config']['ModuleSettingRuleName']
    BQ_Cred = config['BigQueryConfig']['ProjectID']
    if ModuleSetting == 'ICLSSTA':
        BinSizeBasedOnPeriod_Hr = int(config['Config']['ICLSSTA_BinSizeBasedOnPeriod_Hr'])
    BQ_QueryFile = config['InputPaths']['BQ_DataImportQuery']
    LimitToStartWith = config['BigQueryConfig']['BQ_LimitToStart']
    LimitDecreaseFactor = float(config['BigQueryConfig']['BQ_LimitDecreaseFactor'])
    LevBasedPrint('Inside "'+GrabAnySizeDatafromGoogleBQ.__name__+'" function and configurations for this has been set.',2,1)

    # -------------------------<<<  Generating Tables Name To Query  >>>------------------------- #
    TableToInclude = GenerateTableNames(config)

    # -------------------------<<<  Creating Bin Setting For ICLSSTA  >>>------------------------ #
    ## One WHEN clause per time bin; bin i covers ages [i*size, (i+1)*size - 1] seconds.
    GroupsToInclude = ''
    if ModuleSetting == 'ICLSSTA':
        BinSize_Sec = BinSizeBasedOnPeriod_Hr * 3600
        ## 1000 bins: per the original note, BQ can access at most 1000 tables, so even
        ## with one-hour bins this is the largest number that can be needed.
        GroupsToInclude = ''.join(
            '\n\tWHEN (CurrentTimeStamp - CurrentHitTimeStamp) BETWEEN {low} AND {upp} THEN "Bin_{WhichBin}"'.format(
                low= i * BinSize_Sec, upp= (i + 1) * BinSize_Sec - 1, WhichBin= i)
            for i in range(1000)
        )

    # ------------------------<<<  Reading Query From External File  >>>------------------------- #
    LevBasedPrint('Read from a locally saved Query File', 2)
    with open(BQ_QueryFile, 'r') as queryfile:   # closed even if read() raises
        query = queryfile.read()

    # --------------------<<<  Importing Data in Max possible batch size  >>>-------------------- #
    ## Candidate page sizes form a geometric progression: start, start/factor, start/factor^2, ...
    start = int(LimitToStartWith)  # should be equal to the maximum number of observation that you want to extract
    ratio = 1/LimitDecreaseFactor
    limit = 1000  ## page sizes at or below this are never tried ## Hardcoded
    length = 1000 ## maximum number of candidate page sizes
    CandidateLimits = [ int(start * ratio ** (n - 1)) for n in range(1, length + 1) if start * ratio ** (n - 1) > limit ]

    DF = pd.DataFrame()
    for i in CandidateLimits:
        ## Pages are collected locally and only promoted to DF once ALL of them arrived.
        ## Previously a failure after the first page left partial rows in DF, which both
        ## disabled the remaining retries and returned a silently truncated dataset.
        Pages = []
        try:
            offcurr = 0
            while offcurr < start:
                LevBasedPrint('Setting used in extracting data from BQ:\tNo. of obs. extracted per cycle (limit) = ' + str(i) + '\tOffset = ' + str(offcurr),2)
                QueryToUse = query.format(BinToUse = GroupsToInclude, TableToInclude = TableToInclude, lim = str(i), off = str(offcurr))
                Pages.append(Exec_BQ(QueryToUse, BQ_Cred))
                offcurr += i
            if Pages:
                ## pd.concat replaces DataFrame.append, which newer pandas no longer provides.
                DF = pd.concat(Pages, ignore_index = True)
            break  ## full download succeeded; no need to try smaller pages
        except Exception as error:
            ## Deliberately best-effort: record the failure and fall through to the next size.
            txt = 'Exception: In importing data from BQ was thrown!\nLimit used: ' + str(i) + '\n' + str(error)
            LevBasedPrint(txt, 2)
            AddRecommendation(txt, config)
            # raise Exception(txt)

    # ------------------------------------------------------------------------------------------- #
    LevBasedPrint('',2,1)
    return DF
    # ------------------------------------------------------------------------------------------- #


# -------------------------------------------------- ImportData --------------------------------------------------- #
def ImportData(config):
    """
    Load the raw dataset either from BigQuery ('BQ') or from a stored CSV ('Storage').

    The source is chosen by config['DataCollection']['GetDataFrom'].

    'BQ': the task name is read from config['IterationAim']['Task'] and is used to
    fill in the local cache file name. When the task is 'GlTest', the static and
    dynamic data-grab windows in `config` are overwritten from
    GlTest_DataGrabWindow_Hr before any download. Data is downloaded (and written
    to the cache file) when the cache file is missing or BQ_GetNewCopyOfData is one
    of the accepted "yes" spellings; otherwise the cache file is read.

    'Storage': the CSV named by config['InputPaths']['Storage_RawData'] is read.

    Returns the loaded DataFrame. Raises Exception for any other source value.
    """
    # -----------<<<  Setting constant values that are to be used inside function  >>>----------- #
    AccessDataFrom = config['DataCollection']['GetDataFrom']
    if AccessDataFrom not in ('BQ', 'Storage'):
        print('Wrong setting in "GetDataFrom", current value is {}'.format(AccessDataFrom))
        txt = 'Exception: Wrong Configuration has been passed in "GetDataFrom".'
        AddRecommendation(txt, config)
        raise Exception(txt)

    FromBQ = AccessDataFrom == 'BQ'
    if FromBQ:
        TaskName = config['IterationAim']['Task']
        if TaskName:
            GlTestWindow_Hr = int(config['IterationAim']['GlTest_DataGrabWindow_Hr'])
        CachePath = config['InputPaths']['BQ_RawDataStoringName'].format(TaskName)
        RefreshFlag = config['DomainConfig']['BQ_GetNewCopyOfData']
    else:
        StoragePath = config['InputPaths']['Storage_RawData']
    LevBasedPrint('Inside "'+ImportData.__name__+'" function and configurations for this has been set.',1,1)

    LevBasedPrint('Accessing data from {}'.format(AccessDataFrom), 1)
    if not FromBQ:
        # --------------------------<<<  Accessing Data from Storage  >>>------------------------ #
        DF = pd.read_csv(StoragePath)
        LevBasedPrint('Data Loaded From the File: '+ StoragePath, 1)
    else:
        # ----------------------------<<<  Accessing Data from BQ  >>>--------------------------- #
        if TaskName == 'GlTest':
            # Widen both window settings so the GlTest period is fully covered.
            config['IfStatic']['DataGrabWindow_Days'] = str(int(GlTestWindow_Hr/24 + 1))
            config['IfDynamic']['DataGrabWindow_Hr'] = str(GlTestWindow_Hr + 1)

        CacheMissing = not os.path.exists(CachePath)
        RefreshAsked = RefreshFlag in ('True', 'true', 'T', 't', 'Yes', 'yes', 'Y', 'y')
        if CacheMissing or RefreshAsked:
            DF = GrabAnySizeDatafromGoogleBQ(config)
            DF.to_csv(CachePath, index=False)
            LevBasedPrint('Data extracted from BQ and saved locally to the File: '+ CachePath, 1)
        else:
            DF = pd.read_csv(CachePath)
            LevBasedPrint('Data Loaded From the File: '+ CachePath, 1)
        LevBasedPrint('Data Shape: '+str(DF.shape), 1 )

    # ------------------------------------------------------------------------------------------- #
    LevBasedPrint('',1,1)
    return DF
    # ------------------------------------------------------------------------------------------- #


# ----------------------------------------------------------------------------------------------------------------- #
## AP
# start = int(LimitToStartWith)  # should be equal to the maximum number of observation that you want to extract
# stop = -1
# step = -int(start/LimitDecreaseFactor)
# limit = int(start/LimitDecreaseFactor)  ## util which pt to try to gather the data
##AP
# for i in [i for i in range(start,stop, step) if i >= limit]:

In [4]:


import configparser
import pandas as pd
import os, ast, time
from SL0_GeneralFunc import GetBackSomeDirectoryAndGetAbsPath, TimeCataloging, CreateKey, LevBasedPrint, AddRecommendation

# Start of the run, truncated to whole seconds since the epoch; printed for reference.
StartTime = int(time.time())
print('Execution Start ' + str(StartTime))

# Location of the module configuration. GetBackSomeDirectoryAndGetAbsPath comes from
# SL0_GeneralFunc; only its second return value is used here.
# (presumably the absolute form of the relative path -- confirm in SL0_GeneralFunc)
ConfigFilePath = '../config/ISLSSTA_Config.ini'
_, absModConfPath = GetBackSomeDirectoryAndGetAbsPath(ConfigFilePath)

# Parse the .ini file; `config` is what ImportData reads its settings from.
config = configparser.ConfigParser()
config.read(absModConfPath)

# config_clust['aim']['Task'] = 'GlTest'    ################################ Using This To Change The Configuration

# Load the raw data and show the first rows as the cell output.
input_raw_df = ImportData(config)
input_raw_df.head()

Execution Start 1543927626
	+--------------------------------------------------------------------------------------------------------
	| Inside "ImportData" function and configurations for this has been set.
	| Accessing data from Storage
	| Data Loaded From the File: ../data/InputData/RawExtractedData_TrainTest.csv
	+--------------------------------------------------------------------------------------------------------


Unnamed: 0,SID,BinsBackFromCurrent,apidata__zpsbd6,RecentHit_TimeStamp,isBotHits,Hits,D_UzmaToD_UA,HitsToD_Uzmc,D_PageVisitedToHits,PageActToD_PageVisit,BrowsrActToD_BrowsrUsed,AvgMedianTimeDiffBWHits,AvgAvgTimeDiffBWHits,StandDeviatAvgTimeDiffBWHits,AvgHitsPerUnitTime,DiffOfAvgTimeDiffBWHitsWhnGrpIPAndIPUzma,ZScoreAvgAvgTimeDiffBWHits
0,3641,Bin_1,86.176.164.90,1543881689,0,2,1.0,1.0,1.0,1.0,8.0,6.0,3.0,8.485281,0.666644,3.0,7.682213
1,3641,Bin_1,107.77.210.218,1543881738,0,4,1.0,1.0,0.75,5.333333,60.0,9.0,4.4375,5.560276,0.901388,3.3125,6.395656
2,3641,Bin_1,96.10.138.178,1543881734,0,5,1.0,1.0,1.0,1.6,80.0,5.0,5.033333,17.306068,0.993358,7.966667,5.302515
3,3641,Bin_1,99.203.14.40,1543881687,0,4,1.0,1.333333,0.75,1.333333,4.0,9.0,5.125,12.041595,0.780473,6.375,4.431772
4,3641,Bin_1,142.177.187.158,1543881630,0,2,1.0,1.0,1.5,4.666667,40.0,10.0,5.25,14.849242,0.380945,5.25,3.19103


In [None]:
from UCorCS_DailySIDTrafficStatus import UnderstandEnvironmentData

# Optional step: runs unless the flag is the literal string 'False'.
# NOTE(review): `config_clust` is not assigned in any cell visible here (the executed
# cell above names the parser `config`) -- confirm where it is expected to come from.
if config_clust['TriggerTheseFunctions']['UnderstandEnvironmentData'] != 'False': ## To Run Code Below Or Not
    UnderstandEnvironmentData(config_clust)

In [None]:
# Both settings are split on the single-quote character and the short pieces are
# dropped (pieces of length <= 3 for the first setting, <= 2 for the second).
# (presumably this removes the brackets/commas between quoted names -- verify against the .ini)
print([i for i in config_clust['MovingOutputFile']['DimClustAlgoPair'].split("'") if len(i) > 3])
print([ i for i in config_clust['DataProcessing_General']['FeatureToIgnore'].split("'") if len(i) > 2 ])

In [None]:
# Override the task in the in-memory configuration only; nothing is written to disk here.
config_clust['aim']['Task'] = 'GlTest'
# ImportData_1(config_clust)

## Load Raw Input Data

In [None]:
# NOTE(review): `ImportData_1` is not defined or imported in any cell visible here
# (the function defined above is `ImportData`) -- confirm which one is intended.
input_raw_df = ImportData_1(config_clust)
input_raw_df.head()
# SettingToUse = config_clust['aim']['Task']

In [None]:
SettingToUse = config_clust['aim']['Task']
# Both supported tasks currently read the same TrainTest file.
# The GlTest-specific name ('RawDataStorName_GlTest') is intentionally not used.
if SettingToUse in ('TrainTest', 'GlTest'):
    FileLocalSavingName = config_clust['input']['dataset_dir'] + config_clust['input']['RawDataStorName_TrainTest']

# Pipe-separated, UTF-8 encoded raw data
input_raw_df = pd.read_csv(FileLocalSavingName, sep='|', encoding="utf-8")
print(input_raw_df.shape)
input_raw_df.head()

In [None]:
if config_clust['aim']['PaceMode'] == 'Off':
    # Per-column overview of the raw data: null counts, dtypes and describe() statistics.
    # isnull() and isna() are aliases, so the two count columns hold the same numbers.
    temp = (
        input_raw_df.isnull().sum().to_frame('IsNullSum')
        .assign(
            dtypes=input_raw_df.dtypes.tolist(),
            IsNaSum=input_raw_df.isna().sum().tolist(),
        )
        .join(input_raw_df.describe().T)
        .fillna('')
    )
    display(temp)
    print('\nMax isBotHits :', input_raw_df['isBotHits'].max())

    # np.isinf(input_raw_df[[ i for i in AllFeature if i not in FeatureToIgnore ]]).any()
    # np.isnan(yy).any()

    # np.isinf(xx).any()
    # np.isinf(yy).any()

## Data PreProcessing

In [None]:
from UC_DataProcessing_Executor import DataPreProcess_1
from UC_DataExploration import DataExploration_1
from UC_DataProcessing_GenMiniFunc import GenerateCorrelationPlot

# DataPreProcess_1 returns three frames, in this order: train, test, outliers.
# (formerly named TrainDF, TestDF, OutlierDF)
train_processed_raw_df, test_processed_raw_df, outlier_df = DataPreProcess_1(input_raw_df, config_clust)

# Each optional step below runs unless its flag is the literal string 'False'.
if config_clust['TriggerTheseFunctions']['DataExploration'] != 'False': 
    print('Initiating Data Exploration Mode')
    # Exploration is run on the raw input, not on the processed training frame.
    DataExploration_1(input_raw_df, config_clust)
    #DataExploration_1(TrainDF, config_clust)

if config_clust['TriggerTheseFunctions']['GenerateCorrelationPlot'] != 'False':
    # One plot call for the processed training data and one for the outliers.
    GenerateCorrelationPlot(train_processed_raw_df, config_clust)
    GenerateCorrelationPlot(outlier_df, config_clust)


In [None]:
if config_clust['aim']['PaceMode'] == 'Off':
    # Summarise each frame returned by DataPreProcess_1. The loop previously iterated
    # over TrainDF / TestDF / OutlierDF, but the pre-processing cell binds its results
    # to train_processed_raw_df / test_processed_raw_df / outlier_df.
    for i in [train_processed_raw_df, test_processed_raw_df, outlier_df]:
        df = i.copy()
        # Per-column null counts, dtypes and describe() statistics.
        temp = pd.DataFrame(df.isnull().sum(), columns = ['IsNullSum'])
        temp['dtypes'] = df.dtypes.tolist()
        temp['IsNaSum'] = df.isna().sum().tolist()
        temp = temp.join(df.describe().T).fillna('')
        display(temp)
        # NOTE(review): this prints the maximum of the RAW input on every pass, not of
        # the frame being summarised -- left unchanged; confirm whether `df` was meant.
        print('\nMax isBotHits :', input_raw_df['isBotHits'].max())

## Feature Transformation and Dimension Transformation  

#### Applying Dimension Transformation on Original Dataset

In [None]:
import os, glob
from UC_DataDimensionProcessing import DimensionReduction_1

## Removing Previous Iteration Files
PreviousIterationFiles = glob.glob(config_clust['MovingOutputFile']['DirToMoveFrom'] + ('DataDimensionTransformation_' + '*.{FileType}').format(FileType='csv'))
# Plain loop: the deletion is a side effect, so no throwaway list is built for it.
for StaleFile in PreviousIterationFiles:
    os.unlink(StaleFile)

# DimRedClustAlgoDict = ast.literal_eval('''{ ('LDA', 'LDA_param1') : [('DBSCAN', 'DBSCAN_param_1')] , ('ICA', 'ICA_param1') : [('IsolationForest', 'IsolationForest_param_1')] , ('PCA', 'PCA_param1') : [('KMeans', 'KMeans_param_1')] }''')
# The setting is a literal dict whose keys are (algorithm, parameter-set) tuples.
DimRedClustAlgoDict = ast.literal_eval(config_clust['AnomalyClusterConfiguration']['DataTransfRedNClustAlgo'])
for DimRed in DimRedClustAlgoDict:
    print('Data Dimension transformation Algo Used : ', DimRed[0], '\t\tWith Params : ', DimRed[1])
    # Only the frames of the LAST configured transformation stay bound after the loop.
    train_dimen_transf_df, test_dimen_transf_df = DimensionReduction_1(train_processed_raw_df, test_processed_raw_df, DimRed[0], DimRed[1], config_clust)
    # display(train_dimen_transf_df.head())
    # display(test_dimen_transf_df.head())


In [None]:
# print(train_processed_raw_df['isBotHits'].max())
# print(train_dimen_transf_df['isBotHits'].max())
# 

## Applying Clustering

In [None]:
from UC_DataClustering import ClusteringApplied_1

## Removing Previous Iteration Files
PreviousIterationFiles = glob.glob(config_clust['MovingOutputFile']['DirToMoveFrom'] + ('*ModelData_' + '*.{FileType}').format(FileType='csv'))
# Plain loop: the deletion is a side effect, so no throwaway list is built for it.
for StaleFile in PreviousIterationFiles:
    os.unlink(StaleFile)

# The setting is a literal dict: (transformation, params) -> list of (clustering algo, params).
DimRedClustAlgoDict = ast.literal_eval(config_clust['AnomalyClusterConfiguration']['DataTransfRedNClustAlgo'])
# DimRedClustAlgoDict = ast.literal_eval('''{ ('LDA', 'LDA_param1') : [('DBSCAN', 'DBSCAN_param_1')] , ('ICA', 'ICA_param1') : [('IsolationForest', 'IsolationForest_param_1')] , ('PCA', 'PCA_param1') : [('KMeans', 'KMeans_param_1')] }''')

for DimRed in DimRedClustAlgoDict:
    print('Data Dimension transformation Algo Used : ', DimRed[0], '\t\tWith Params : ', DimRed[1])
    # Load the saved train/test transformation outputs. A split that cannot be loaded is
    # passed on as None, as before, but the reason is now reported. The previous bare
    # `except:` also swallowed KeyboardInterrupt/SystemExit and hid every failure.
    LoadedSplits = {}
    for Split in ('Train', 'Test'):
        try:
            LoadedSplits[Split] = pd.read_csv((config_clust['input']['dataset_dir'] + 'DataDimensionTransformation_' + Split + '__' + DimRed[0] + '_With_'+ DimRed[1] + '.csv'), sep = '|', encoding="utf-8")
        except Exception as LoadError:
            print('|\t\tCould not load the', Split, 'transformed data, passing None :', LoadError)
            LoadedSplits[Split] = None
    train_dimen_transf_df = LoadedSplits['Train']
    test_dimen_transf_df = LoadedSplits['Test']

    for ClustAlgo in DimRedClustAlgoDict[DimRed]:
        AlgoCombination = {'DimensionTransformation' : (DimRed[0], DimRed[1]), 
                           'AnomalyClustering': (ClustAlgo[0], ClustAlgo[1])}
        print('|\t\tData Segmentation Algo Used : ', ClustAlgo[0], '\t\tWith Params : ', ClustAlgo[1])
        TrainDF, TestDF = ClusteringApplied_1(train_dimen_transf_df, test_dimen_transf_df, outlier_df, AlgoCombination, config_clust)
        

In [None]:
from UC_DataClustering_NonInliner import CreateAdditionalClusters
# Hand the outlier frame from pre-processing to the non-inlier clustering step.
# The return value, if any, is not used.
CreateAdditionalClusters(outlier_df, config_clust)

## Combine and Move Train Test Results

In [None]:
from UC_OutputTransformer_CombinerMover import MoveFileToAdaptDir

# Driven entirely by the configuration; the return value, if any, is not used.
# (presumably combines/moves the result files as the section heading says -- confirm in UC_OutputTransformer_CombinerMover)
MoveFileToAdaptDir(config_clust)

## Do Cluster Evaluation

In [None]:
from UC_ClusterEvaluation import ClustersEvaluation

# Optional step: runs unless the flag is the literal string 'False'.
if config_clust['TriggerTheseFunctions']['ClustersEvaluation'] != 'False':
    # ClustersEvaluation(config_clust, 'SingleFile', None, ('data/keyModelsData/ClusterModelData_TrainTest_PCA_KMeans.csv', 'path'))
    df = ClustersEvaluation(config_clust, 'MultipleFiles', None, (None,None))
    # set_index returns a NEW frame; the original discarded it, so the line had no effect.
    df = df.set_index(['Algorithm'])
    # df.head()

## Ensembling the Cluster Results (Evaluation Is Also Done in This Step)

In [None]:
from UC_OutputTransformer import OutputTransformer
# OutputTransformer returns two objects; both are shown below
# (first rows of the key-set frame, and the whole ensemble-evaluation frame).
Output_Keysets_Df, EnsembleEval_DF = OutputTransformer(config_clust)

display(Output_Keysets_Df.head())
display(EnsembleEval_DF)

In [None]:
# End of the run, in (fractional) seconds since the epoch.
EndTime = time.time()

In [None]:
# Elapsed seconds, shown as the cell output. StartTime was truncated to whole
# seconds when it was taken, so the value can be up to one second too large.
EndTime - StartTime

## Testing Start -- Anomaly autoencoder

In [None]:
# Importing the libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [None]:
# NOTE: this rebinds the notebook-level name `config` to `config_clust`.
config = config_clust
df = input_raw_df.copy()
# FilterAnomalyCases(WholeDF.iloc[0:3,], config)

## Outliers to be removed from the features
# Both settings are split on the single-quote character; pieces of length <= 2 are dropped.
AllFeature = [ i for i in config['DataProcessing_General']['AllFeaturesToUtilize'].split("'") if len(i) > 2 ]
FeatureToIgnore = [ i for i in config['DataProcessing_General']['FeatureToIgnore'].split("'") if len(i) > 2 ]
# Features to analyse = all features minus the ignored ones (original order kept).
ColToAnalysis = [ i for i in AllFeature if i not in FeatureToIgnore ]

# display(df.head(10))  ## Original Dataset

## Copying the feature to a new DF which are to be ignored in dimension tranformation
df_transformed = df[FeatureToIgnore].reset_index(drop=True)

In [None]:
# NOTE(review): `training_set` and `test_set` are only assigned inside commented-out
# code further down in this notebook -- they must already exist for this cell to run.
# Values are cast to integers first, then stored as float tensors below.
training_set = np.array(training_set, dtype = 'int')

# Converting the data into Torch tensors
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

# Creating the architecture of the Neural Network
class SAE(nn.Module):
    """
    Stacked autoencoder with layer widths  n -> 20 -> 10 -> 20 -> n.

    The three hidden layers use a sigmoid activation; the output layer is linear.

    Parameters:
        nb_inputs: width n of the input and output layers. When omitted, the
            notebook-level `nb_movies` is used, exactly as the original
            definition did, so `SAE()` keeps working unchanged.
    """
    def __init__(self, nb_inputs=None):
        super(SAE, self).__init__()
        if nb_inputs is None:
            # Looked up at construction time, as before.
            nb_inputs = nb_movies
        self.fc1 = nn.Linear(nb_inputs, 20)
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 20)
        self.fc4 = nn.Linear(20, nb_inputs)
        self.activation = nn.Sigmoid()

    def forward(self, x):
        """Encode and decode `x`; the result has the same trailing width as `x`."""
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)  # no activation on the reconstruction
        return x
# Model, loss and optimiser used by the training/testing code.
sae = SAE()
criterion = nn.MSELoss()  # mean squared reconstruction error
# RMSprop with learning rate 0.01 and weight decay 0.5
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)

In [None]:
## NN using Numpy
import numpy as np

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random starting weights for the two layers
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squares loss, printed every step
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backward pass: gradients of the loss w.r.t. w2, then w1
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU gradient: zero wherever the pre-activation was negative
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Plain gradient-descent step, in place
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

In [None]:
## NN Using Pytorch
import torch


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random starting weights for the two layers
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear
    h = x @ w1
    h_relu = torch.clamp(h, min=0)
    y_pred = h_relu @ w2

    # Sum-of-squares loss as a Python float, printed every step
    loss = ((y_pred - y) ** 2).sum().item()
    print(t, loss)

    # Backward pass written out by hand (autograd is not used here)
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t() @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.t()
    # ReLU gradient: zero wherever the pre-activation was negative
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = x.t() @ grad_h

    # Plain gradient-descent step, in place
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

In [None]:
# AutoEncoders



# # Importing the dataset
# movies = pd.read_csv('ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
# users = pd.read_csv('ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
# ratings = pd.read_csv('ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

# # Preparing the training set and the test set
# training_set = pd.read_csv('ml-100k/u1.base', delimiter = '\t')
# training_set = np.array(training_set, dtype = 'int')
# test_set = pd.read_csv('ml-100k/u1.test', delimiter = '\t')
# test_set = np.array(test_set, dtype = 'int')

# # Getting the number of users and movies
# nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
# nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))

# # Converting the data into an array with users in lines and movies in columns
# def convert(data):
#     new_data = []
#     for id_users in range(1, nb_users + 1):
#         id_movies = data[:,1][data[:,0] == id_users]
#         id_ratings = data[:,2][data[:,0] == id_users]
#         ratings = np.zeros(nb_movies)
#         ratings[id_movies - 1] = id_ratings
#         new_data.append(list(ratings))
#     return new_data
# training_set = convert(training_set)
# test_set = convert(test_set)

# # Converting the data into Torch tensors
# training_set = torch.FloatTensor(training_set)
# test_set = torch.FloatTensor(test_set)

# Creating the architecture of the Neural Network
class SAE(nn.Module):
    """
    Stacked autoencoder with layer widths  n -> 20 -> 10 -> 20 -> n.

    The three hidden layers use a sigmoid activation; the output layer is linear.

    Parameters:
        nb_inputs: width n of the input and output layers. When omitted, the
            notebook-level `nb_movies` is used, exactly as the original
            definition did, so `SAE()` keeps working unchanged.
    """
    def __init__(self, nb_inputs=None):
        super(SAE, self).__init__()
        if nb_inputs is None:
            # Looked up at construction time, as before.
            nb_inputs = nb_movies
        self.fc1 = nn.Linear(nb_inputs, 20)
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 20)
        self.fc4 = nn.Linear(20, nb_inputs)
        self.activation = nn.Sigmoid()

    def forward(self, x):
        """Encode and decode `x`; the result has the same trailing width as `x`."""
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)  # no activation on the reconstruction
        return x
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)

# Training the SAE
# One optimiser step per user row; rows without any positive entry are skipped.
nb_epoch = 200
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    s = 0.  # number of rows that contributed to train_loss
    for id_user in range(nb_users):
        # Batch of one row. Named `user_input` so the builtin `input` is not shadowed.
        user_input = training_set[id_user].unsqueeze(0)
        # The target is a detached copy of the input. The original tried to switch
        # gradients off with `target.require_grad = False`, a misspelt attribute
        # that had no effect; the clone does not require gradients anyway.
        target = user_input.clone()
        if torch.sum(target > 0) > 0:
            # Clear gradients left by the previous row; without this they accumulate
            # across steps and every update uses the sum of all earlier gradients.
            optimizer.zero_grad()
            output = sae(user_input)
            # Entries that are 0 in the target are excluded from the loss.
            output[target == 0] = 0
            loss = criterion(output, target)
            # Rescale the mean over all columns to a mean over the non-zero ones.
            mean_corrector = nb_movies/float(torch.sum(target > 0) + 1e-10)
            loss.backward()
            # .item() reads the scalar loss (indexing a 0-dim tensor with [0] fails);
            # this matches the `.item()` call used elsewhere in this notebook.
            train_loss += np.sqrt(loss.item()*mean_corrector)
            s += 1.
            optimizer.step()
    # max(s, 1.) avoids a ZeroDivisionError when no row had a positive entry.
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/max(s, 1.)))

# Testing the SAE
# The network sees each user's TRAINING row and is scored against the same user's
# TEST row; users without any positive test entry are skipped.
test_loss = 0
s = 0.  # number of rows that contributed to test_loss
with torch.no_grad():  # evaluation only, no gradients needed
    for id_user in range(nb_users):
        # Named `user_input` so the builtin `input` is not shadowed.
        user_input = training_set[id_user].unsqueeze(0)
        # unsqueeze(0) gives the target the same (1, n) shape as the output; without
        # it the boolean mask below does not match the tensor it indexes.
        target = test_set[id_user].unsqueeze(0)
        if torch.sum(target > 0) > 0:
            output = sae(user_input)
            # Entries that are 0 in the target are excluded from the loss.
            output[target == 0] = 0
            loss = criterion(output, target)
            # Rescale the mean over all columns to a mean over the non-zero ones.
            mean_corrector = nb_movies/float(torch.sum(target > 0) + 1e-10)
            # .item() reads the scalar loss (indexing a 0-dim tensor with [0] fails).
            test_loss += np.sqrt(loss.item()*mean_corrector)
            s += 1.
# max(s, 1.) avoids a ZeroDivisionError when no row had a positive entry.
print('test loss: '+str(test_loss/max(s, 1.)))

## Testing ---- END

In [None]:
from pyclustering.cluster import cluster_visualizer;
from pyclustering.cluster.cure import cure;

from pyclustering.utils import read_sample;

from pyclustering.samples.definitions import FCPS_SAMPLES;

# Sample points, a list of coordinate lists such as [ [0.1, 0.5], [0.3, 0.1], ... ]
input_data = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)

# Run CURE asking for three clusters and fetch the result
cure_instance = cure(input_data, 3)
cure_instance.process()
clusters = cure_instance.get_clusters()

# Draw the clusters
visualizer = cluster_visualizer()
visualizer.append_clusters(clusters, None)
visualizer.show()

In [None]:
import logging
import logging.handlers
# Placeholder until the logger is created.
logger = None

# Copy the logging settings into `config` under log_* keys.
# NOTE(review): `config_reader` is not defined in any cell visible here; the use of
# .getint() suggests a configparser section -- confirm. `config` is used as a plain
# key/value store here, unlike the ConfigParser bound to that name earlier -- confirm.
logging_conf = config_reader['logging']
config['log_file'] = logging_conf['log_file']
# Maps the level name from the configuration to the logging module's constant.
log_level_dict = {
    'CRITICAL' : logging.CRITICAL,
    'ERROR' : logging.ERROR,
    'WARNING' : logging.WARNING,
    'INFO' : logging.INFO,
    'DEBUG' : logging.DEBUG
}
# A level name outside the dict above raises KeyError here.
config['log_level'] = log_level_dict[logging_conf['log_level']]
config['log_maxBytes'] = logging_conf.getint('maxBytes')
config['log_backupCount'] = logging_conf.getint('backupCount')


except Exception as ex:
        logger.error('Exception in making API call to {}: {}'.format(path,ex))
        exit_with_error()
        
logger.info('response to add rule api call: {}'.format(res.json()))


global logger, s
read_config('./config.ini')

'''
initialize logging
'''
# Logger named 'SsLogger', writing to a size-rotated file; level, file name,
# maximum size and backup count all come from the log_* entries of `config`.
logger = logging.getLogger('SsLogger')
logger.setLevel(config['log_level'])
# Record layout: timestamp - logger name - level - message
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.handlers.RotatingFileHandler(config['log_file'], maxBytes=config['log_maxBytes'], backupCount=config['log_backupCount'])
handler.setFormatter(formatter)
logger.addHandler(handler)

'''
read the IPs/CIDRs to be blacklisted
'''
# NOTE(review): read_blacklisted_ips and exit_with_error are not defined in any cell
# visible here -- confirm where they come from.
blacklisted_ips_cidrs = read_blacklisted_ips()
# Capacity check: the number of entries may not exceed
# max_ips_per_rule * max_rules_per_policy (both read from `config`).
if len(blacklisted_ips_cidrs) > (config['max_ips_per_rule']*config['max_rules_per_policy']):
    logger.error('Error: too many IPs/CIDRs ({}). Maximum limit: {}'.format(len(blacklisted_ips_cidrs),config['max_ips_per_rule']*config['max_rules_per_policy']))
    exit_with_error()

'''
initiate session and authenticate
'''
logger.info('initiating session')