# CNMa
###### Correlation N Matching

In [None]:
#import IP_address_filter
#import SG_IP_matching
#import xml_to_csv
import VT_API
import OS_API
import pandas as pd
import numpy as np
from datetime import datetime
import time
import os
from tqdm import tqdm
import json
import re
import csv

## Search in OpenSearch

In [None]:
# Columns of interest: the result DataFrame is created with these columns.
COLS = [
    'peerIP', 'peerPort',
    'hostIP', 'hostPort',
    'commands', 'hashes', 'urls',
    'loggedin',
    'startTime', 'endTime',
    'sort_num',
    'peerCountry', 'hostCountry',
]

# Size of a full batch; the fetch loop keeps paging while batches are this large.
SIZE = 10000

### How to craft filter query
Query input: a list of dictionaries (JSON objects). \
There are 4 main filters:
- range: usually used to filter on time ranges. Times can be exact (2022-01-05T23:59:30) or relative (now-1w).
- exists: selects records in which the chosen field exists.
- match: selects records whose value in the chosen field exactly matches the given value.
- query_string: selects records whose chosen field contains the given value(s); the query can use operators such as AND and OR.

All filters are optional and can be repeated if multiple matches are required.

In [None]:
# Example filter list, one clause per entry:
#   - startTime between now-8w and now-6w
#   - the hashes field exists
#   - geoip.country_name matches Singapore
#   - commands contains wget
defaultquery = [
    {
        "range": {
            "startTime": {
                "gte": "now-8w",
                "lte": "now-6w",
                "format": "strict_date_optional_time",
            },
        },
    },
    {"exists": {"field": "hashes"}},
    {"match": {"geoip.country_name": "Singapore"}},
    {"query_string": {"default_field": "commands", "query": "wget"}},
]

In [None]:
# Pull all matching records from OpenSearch in batches, resuming each request
# after the sort value of the last record received in the previous batch.
size = SIZE  # batch size, max 10000; tied to SIZE so the loop condition below always starts true
search_after = [0]
df_os = pd.DataFrame(columns=COLS)

queryls = defaultquery
print(queryls)

total = 0

while size == SIZE:
    print(f"Continue from record: {search_after}")
    data = OS_API.opensearch_request(size, queryls, search_after)

    # update params
    size = len(data)  # a batch shorter than SIZE means this was the last one
    if size == 0:
        # An empty batch (no hits at all, or the previous batch ended exactly
        # on the last record) has no final row to read a sort value from.
        break

    df_os = pd.concat([df_os, data])
    total += size  # update total records number
    # Positional lookup of the last row's sort value, so it does not depend on
    # the batch carrying a 0..n-1 index.
    search_after = data['sort_num'].iloc[-1]
    #time.sleep(1)

print(f'Done. Total records: {total}')

In [None]:
# Show df_os as the cell output.
df_os

In [None]:
#export to csv
# Disabled: the call below sits inside a string literal, so it does not run.
# Remove the surrounding quotes to write df_os to a timestamped OS_pull_*.csv file.
'''
df_os.to_csv("OS_pull" + "_" + str(datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + ".csv", index=False)
'''

## Searching Hashes 

In [None]:
# Collect every distinct hash across all records, in first-seen order.
# dict.fromkeys de-duplicates in a single pass instead of rescanning the
# result list for every hash (assumes each hash is hashable, e.g. a string).
hashls = list(dict.fromkeys(h for hashes in df_os['hashes'] for h in hashes))

hashls

In [None]:
# Query VirusTotal for each unique hash, waiting 5 seconds after every request
# (presumably to stay within the API rate limit).
resultls = []

for sample_hash in tqdm(hashls):
    resultls.append(VT_API.virustotal_request(sample_hash))
    time.sleep(5)

resultls
    

## Searching for urls within Commands
### Note: Jupyter renders these outputs as clickable hyperlinks; take care not to click them accidentally.

In [None]:
# Collect the distinct http(s) URLs that appear in the recorded commands,
# in first-seen order.
commandls = []
index = []  # not used in this cell; kept in case other cells reference it

# Built once instead of on every command. Matches the scheme and host of an
# http(s) URL plus any trailing slashes; the path is not captured.
url_pattern = re.compile("https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[/]*)")
seen_urls = set()  # O(1) membership test alongside the ordered list

for i, row in df_os.iterrows():
    for c in row['commands']:
        # findall returns an empty list when nothing matches, so no
        # exception handling is needed for the "no match" case.
        found = url_pattern.findall(c)
        #print(i, found)

        for f in found:
            if f not in seen_urls:
                seen_urls.add(f)
                commandls.append(f)

#commandls

In [None]:
#testing
# Disabled scratch check of a URL regex against hand-written sample strings.
# The code sits inside a string literal, so it does not run; remove the
# surrounding quotes to try it. This pattern has a second alternative without
# the http(s):// prefix, unlike the patterns used in the other cells.
'''
listed=['http://123.34.123.341|ersfgfadsilu  http://123.34.123.341|ersfgfadsilu', 'http://123.34.123.341', 'aiodusnfavic 123.34.123.341|ersfgfadsilu', 'http://123.34.123.341|ersfgfadsilu 123.34.123.341', 'vnoaids.abfna']
for c in listed:
    found = re.findall("https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)|[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)", c)
    print(found)
'''

In [None]:
# Query VirusTotal for each unique URL found in the commands, waiting
# 5 seconds after every request (presumably to stay within the API rate limit).
commandresults = []

for url in tqdm(commandls):
    commandresults.append(VT_API.virustotal_request(url))
    time.sleep(5)

commandresults

## **Only run this in a VM**
### Curl/Wget to download infra links, to obtain file/hash

In [None]:
# Collect the distinct full http(s) download links (host plus path/query)
# that appear in the recorded commands, in first-seen order.
dlls = []

# Built once instead of on every command. Unlike a host-only match, the
# trailing character class also captures the path and query string.
dl_pattern = re.compile("https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)")
seen_links = set()  # O(1) membership test alongside the ordered list

for i, row in df_os.iterrows():
    for c in row['commands']:
        # findall returns an empty list when nothing matches, so no
        # exception handling is needed for the "no match" case.
        found = dl_pattern.findall(c)
        #print(i, found)

        for f in found:
            if f not in seen_links:
                seen_links.add(f)
                dlls.append(f)

#dlls

In [None]:
# Write the collected download links to a timestamped CSV file,
# one fully-quoted link per row.
stamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
filename = f"dllinks_{stamp}.csv"

with open(filename, 'w', newline='') as out:
    link_writer = csv.writer(out, quoting=csv.QUOTE_ALL)
    link_writer.writerows([link] for link in dlls)


## Work in Progress (Ignore below)
Interactive component to customise input

In [None]:
def yesnocheck(param):
    '''
    Prompt with param until the reply is exactly 'y' or 'n'.
    Any other reply (including 'Y' or 'N') repeats the prompt.
    Returns the accepted reply, 'y' or 'n'.
    '''
    accepted = ('y', 'n')
    # Note: Python 2.x users should use raw_input, the equivalent of 3.x's input
    reply = input(f"{param}")
    while reply not in accepted:
        reply = input(f"{param}")
    return reply

#ans = yesnocheck("test")
#ans

In [None]:
def craft_query(defmode):
    '''
    Build the list of filter clauses for a search, optionally via prompts.

    defmode: when truthy, no prompts are shown and the module-level
             defaultquery list is returned as-is (the same object, not a copy).

    Otherwise the user is asked, in order, for:
      - one optional time range on startTime,
      - any number of "exists" filters,
      - any number of exact "match" filters,
      - any number of "query_string" filters.
    Each repeatable section keeps asking until the user answers 'n'.

    Returns the list of filter dicts in the order they were entered;
    the list is empty if every question is answered 'n'.
    '''
    #defmode to return default eg.
    if defmode:
        return defaultquery

    queryls = []

    #custom input
    #Time range (asked once)
    if yesnocheck("Filter on Time Range? y/n") == 'y':
        # input() already returns str, so no conversion is needed
        start = input("Input start date:")
        end = input("Input end date:")
        queryls.append({"range": {"startTime": {"gte": start, "lte": end, "format": "strict_date_optional_time"}}})

    #Existing field
    while yesnocheck("Filter if a field Exists? y/n") == 'y':
        exist = input("Please enter field:")
        queryls.append({'exists': {'field': exist}})

    #Exact match on field
    while yesnocheck("Exact match on a field? y/n") == 'y':
        field = input("Please enter field:")
        value = input("Enter value to match:")
        queryls.append({"match": {field: value}})

    #General match query
    while yesnocheck("General search query on a field? y/n") == 'y':
        field = input("Please enter field:")
        query = input("Enter query:")
        queryls.append({"query_string": {"default_field": field, "query": query}})

    return queryls
    
#queryls = craft_query(True)
#queryls