In [1]:
import pandas as pd

## Import data dan read data

In [2]:
# Read the raw brand list (one brand per line) into memory as a single string.
with open('data/allbrand.txt', 'r') as brand_file:
    data = brand_file.read()

In [3]:
# One brand per line. splitlines() copes with both "\n" and "\r\n" endings, and
# blank lines (e.g. the one produced by a trailing newline) are dropped so the
# list never contains an empty-string "brand".
brands = [line.strip() for line in data.splitlines() if line.strip()]

In [4]:
import json

# Load the two labelled URL collections and the keyword vocabulary.
# Context managers guarantee the file handles are closed, and the loaded data
# no longer shares a name with the (previously unclosed) file object.
with open("data/data_phishing_37175.json") as phishing_file:
    data_phising = json.load(phishing_file)
with open("data/data_legitimate_36400.json") as legitimate_file:
    data_legitimate = json.load(legitimate_file)
with open('data/keywords.txt', 'r') as f:
    data = f.read()
# One keyword per line; blank lines (e.g. from a trailing newline) are dropped
# so the vocabulary never contains an empty string.
keywords = [line.strip() for line in data.splitlines() if line.strip()]

## Word decomposer module

In [43]:
def words_raw_extraction(domain, subdomain, path):
    """Split the parts of a URL into lower-cased raw words.

    Each part is lower-cased and split on the delimiter characters
    ``- . / ? = @ & % : _``. Empty fragments are discarded and duplicates
    are kept.

    Args:
        domain: Domain part of the URL (str).
        subdomain: Subdomain part of the URL (str).
        path: Path/query part of the URL (str).

    Returns:
        list[str]: words from the domain, then the path, then the subdomain.
    """
    # Imported locally because no cell of the notebook imports `re`.
    import re

    # Single character class instead of three copies of an alternation written
    # as a non-raw string with invalid escape sequences ("\-", "\/", ...).
    delimiters = r"[-./?=@&%:_]"

    w_domain = re.split(delimiters, domain.lower())
    w_subdomain = re.split(delimiters, subdomain.lower())
    w_path = re.split(delimiters, path.lower())

    # Order is domain, path, subdomain; filter(None, ...) drops the empty
    # strings produced by leading, trailing or consecutive delimiters.
    raw_words = w_domain + w_path + w_subdomain
    return list(filter(None, raw_words))

## Maliciousness Analysis Module

In [42]:
import math
import pickle

# Alphabet the character-transition model is defined over: a-z plus space.
accepted_chars = 'abcdefghijklmnopqrstuvwxyz '

# Index of every accepted character, used as row/column into the count matrix.
pos = {char: idx for idx, char in enumerate(accepted_chars)}

def normalize(line):
    """Lower-case `line` and keep only the characters found in accepted_chars.

    Punctuation, digits and other symbols are dropped, which keeps the
    transition model small.

    Returns:
        list[str]: the surviving lower-cased characters, in order.
    """
    kept = []
    for raw_char in line:
        lowered = raw_char.lower()
        if lowered in accepted_chars:
            kept.append(lowered)
    return kept

def ngram(n, l):
    """Yield every n-gram (as a string) of `l` after it has been normalized."""
    chars = normalize(l)
    last_start = len(chars) - n
    start = 0
    while start <= last_start:
        yield ''.join(chars[start:start + n])
        start += 1

def train():
    """Train the character-bigram model and write it to 'gib_model.pki'.

    Reads 'big.txt' (training corpus), 'good.txt' and 'bad.txt' (sample
    phrases used to pick the threshold) from the working directory and
    pickles ``{'mat': log-probability matrix, 'thresh': threshold}``.

    Raises:
        AssertionError: if the good phrases are not all scored above the
            bad phrases.
        OSError: if one of the input files cannot be opened.
    """
    k = len(accepted_chars)
    # Assume we have seen 10 of each character pair.  This acts as a kind of
    # prior or smoothing factor.  This way, if we see a character transition
    # live that we've never observed in the past, we won't assume the entire
    # string has 0 probability.
    counts = [[10 for _ in range(k)] for _ in range(k)]

    # Count transitions from big text file, taken
    # from http://norvig.com/spell-correct.html
    with open('big.txt') as corpus:
        for line in corpus:
            for a, b in ngram(2, line):
                counts[pos[a]][pos[b]] += 1

    # Normalize the counts so that they become log probabilities.
    # We use log probabilities rather than straight probabilities to avoid
    # numeric underflow issues with long texts.
    # This contains a justification:
    # http://squarecog.wordpress.com/2009/01/10/dealing-with-underflow-in-joint-probability-calculations/
    for row in counts:
        s = float(sum(row))
        for j in range(len(row)):
            row[j] = math.log(row[j] / s)

    # Find the probability of generating a few arbitrarily chosen good and
    # bad phrases.
    with open('good.txt') as good_file:
        good_probs = [avg_transition_prob(l, counts) for l in good_file]
    with open('bad.txt') as bad_file:
        bad_probs = [avg_transition_prob(l, counts) for l in bad_file]

    # Assert that we actually are capable of detecting the junk.
    assert min(good_probs) > max(bad_probs)

    # And pick a threshold halfway between the worst good and best bad inputs.
    thresh = (min(good_probs) + max(bad_probs)) / 2

    # Write through a context manager so the model file is flushed and closed
    # even if pickling fails part-way.
    with open('gib_model.pki', 'wb') as model_file:
        pickle.dump({'mat': counts, 'thresh': thresh}, model_file)

def avg_transition_prob(l, log_prob_mat):
    """Average bigram transition probability of `l` under `log_prob_mat`.

    The log probabilities of all character bigrams of `l` are averaged and
    the mean is mapped back to a probability. A string without any bigram
    yields exp(0) == 1.0.
    """
    running_total = 0.0
    pair_count = 0
    for pair_count, (first, second) in enumerate(ngram(2, l), start=1):
        running_total += log_prob_mat[pos[first]][pos[second]]
    # Avoid dividing by zero when the input produced no bigram at all.
    divisor = pair_count if pair_count else 1
    # exp() converts the mean log probability back into a probability.
    return math.exp(running_total / divisor)

# Script-style entry point: train and persist the model when run as the main module.
# NOTE(review): inside a notebook kernel __name__ is typically '__main__', so
# executing this cell presumably runs train() and needs big.txt, good.txt and
# bad.txt in the working directory - confirm this is intended.
if __name__ == '__main__':
    train()


## NLP Feature Extraction (40 Feature)

In [44]:
import sys
import json
import pprint
import pickle
import pygtrie
import requests

from traceback import format_exc
from word_with_nlp import nlp_class
from word_splitter_file import WordSplitterClass

from ns_log import NsLog


class url_rules:
    """Compute lexical and NLP features for one parsed URL.

    Every feature method returns a small dict; ``rules_main`` merges them into
    a single feature dict. Key names and their insertion order are kept exactly
    as they were, because the ARFF header produced later in this notebook is
    derived from the keys of that dict.
    """

    def __init__(self):
        """Create the helper objects and load the brand list from disk."""

        print("initializing")

        self.logger = NsLog("log")
        self.path_data = "data/"
        self.name_brand_file = "allbrand.txt"
        # self.path_alexa_files = "data/alexa-tld/"
        # (left disabled as in the original; alexa_check() needs this attribute)

        self.nlp_manager = nlp_class()
        self.pp = pprint.PrettyPrinter(indent=4)
        self.word_splitter = WordSplitterClass()

        allbrand_txt = open("{0}{1}".format(self.path_data, self.name_brand_file), "r")
        self.allbrand = self.__txt_to_list(allbrand_txt)

    def __txt_to_list(self, txt_object):
        """Return the stripped lines of an open text file and close the file.

        The file is closed even when reading raises.
        """
        try:
            return [line.strip() for line in txt_object]
        finally:
            txt_object.close()

    def rules_main(self, domain, tld, subdomain, path, words_raw):
        """Run every enabled rule and merge the results.

        Args:
            domain, tld, subdomain, path: string parts of the URL.
            words_raw: list of raw words extracted from the URL.

        Returns:
            tuple(dict, dict): ``(info_nlp, features)``. If a rule raises, the
            error is logged and whatever was collected so far is returned
            (the remaining rules are skipped).
        """

        features = {}
        info_nlp = {}

        try:
            features.update(self.digit_count(domain, subdomain, path))             # digit count
            features.update(self.length(domain, subdomain, path))                  # length
            features.update(self.tld_check(tld))                                   # tld check
            features.update(self.check_rule_5(words_raw))                          # www-com
            features.update(self.punny_code(domain))                               # punycode
            features.update(self.random_domain(domain))                            # random_domain
            features.update(self.subdomain_count(subdomain))                       # subdomain count
            features.update(self.char_repeat(words_raw))                           # char_repeat
            # Alexa based rules are disabled:
            # features.update(self.alexa_check(domain, tld))                       # alexa1m check
            # features.update(self.alexa_trie(domain, tld))                        # alexa1m check trie
            features.update(self.special_chars(domain, subdomain, path))           # - . / @
            features.update(self.check_domain_in_list(domain))

            result_nlp = self.nlp_features(words_raw)
            features.update(result_nlp['features'])                                # words_info

            info_nlp = result_nlp['info']

        except Exception:
            # `except Exception` (not a bare except) so that KeyboardInterrupt
            # and SystemExit can still stop a long extraction run.
            self.logger.error("url_rules.main() Error : {0}".format(format_exc()))

        return info_nlp, features

    def digit_count(self, domain, subdomain, path):
        """Count the digit characters in the domain, subdomain and path."""

        return {
            'domain_digit_count': sum(1 for letter in domain if letter.isdigit()),
            'subdomain_digit_count': sum(1 for letter in subdomain if letter.isdigit()),
            'path_digit_count': sum(1 for letter in path if letter.isdigit()),
        }

    def length(self, domain, subdomain, path):
        """Return the character length of the domain, subdomain and path."""

        return {
            'domain_length': len(domain),
            'subdomain_length': len(subdomain),
            'path_length': len(path),
        }

    def tld_check(self, tld):
        """Flag whether `tld` is one of a small set of common TLDs."""

        common_tld = ["com", "org", "net", "de", "edu", "gov"]

        return {"isKnownTld": 1 if tld in common_tld else 0}

    def check_rule_5(self, words_raw):
        """Count the words that contain the substrings 'www' and 'com'."""

        result = {'www': 0, "com": 0}

        for word in words_raw:
            if 'www' in word:
                result['www'] += 1

            if 'com' in word:
                result['com'] += 1

        return result

    def punny_code(self, line):
        """Flag strings that start with the 'xn--' prefix."""

        return {'punnyCode': 1 if line.startswith("xn--") else 0}

    def random_domain(self, domain):
        """Wrap the result of nlp_manager.check_word_random(domain)."""

        return {'random_domain': self.nlp_manager.check_word_random(domain)}

    def subdomain_count(self, line):
        """Return the number of dot-separated parts in `line`.

        NOTE(review): an empty string yields 1 (``"".split(".") == [""]``),
        the same value as a single-label subdomain. Kept unchanged so that
        existing feature files stay comparable.
        """

        return {'subDomainCount': len(line.split("."))}

    def __all_same(self, items):
        """Return True when every element of `items` equals the first one."""
        return all(x == items[0] for x in items)

    def char_repeat(self, words_raw):
        """Count runs of one repeated character across all words.

        A sliding window of 2, 3, 4 and 5 characters is moved over every word;
        each window whose characters are all identical adds one to the total,
        so overlapping windows are counted separately.
        """

        window_sizes = (2, 3, 4, 5)
        repeats = 0

        for word in words_raw:
            for size in window_sizes:
                for start in range(len(word) - size + 1):
                    if self.__all_same(word[start:start + size]):
                        repeats += 1

        return {'char_repeat': repeats}

    def alexa_check(self, domain, tld):
        """Look up ``domain.tld`` and ``domain`` in a per-letter Alexa file.

        The file name is built from ``self.path_alexa_files`` and the first
        character of the domain. Any error (including the attribute being
        unset, as it is in ``__init__``) is logged and both flags stay 0.

        Returns:
            dict: ``{'alexa1m_tld': 0|1, 'alexa1m': 0|1}``.
        """

        is_find_tld = 0
        is_find = 0
        line = domain+"."+tld

        # Only names starting with a lower-case ASCII letter or a digit have a file.
        letter = "abcdefghijklmnopqrstuvwxyz0123456789"

        try:
            if line[0] in letter:
                with open("{0}{1}.txt".format(self.path_alexa_files, line[0]), "r") as alexa_txt:
                    alexaList_tld = [alexa_line.strip() for alexa_line in alexa_txt]   # with tld
                alexa_list = [entry.split(".")[0].strip() for entry in alexaList_tld]  # without tld

                if line.strip() in alexaList_tld:
                    is_find_tld = 1

                if line.split(".")[0].strip() in alexa_list:
                    is_find = 1
        except Exception:
            self.logger.debug(line + "işlenirken hata uzunluktan dolayı")
            self.logger.error("url_rules.check_rule_11()-Alexa  /  Error : {0}".format(format_exc()))

        return {'alexa1m_tld': is_find_tld, 'alexa1m': is_find}

    def alexa_trie(self, domain, tld):
        """Look up ``domain.tld`` and ``domain`` in two tries.

        NOTE(review): ``self.trie_alexa_tld`` and ``self.trie_alexa_tldsiz`` are
        not created anywhere in this class; when they are missing the error is
        logged and a partial (possibly empty) dict is returned.
        """

        line = domain+"."+tld

        result = {}

        try:
            result['alexa1m_tld_trie'] = 1 if self.trie_alexa_tld.has_key(line) else 0
            result['alexa1m_tldsiz_trie'] = 1 if self.trie_alexa_tldsiz.has_key(domain) else 0
        except Exception:
            self.logger.debug(line + "işlenirken alexa")
            self.logger.error("url_rules.check_rule_11()-Alexa  /  Error : {0}".format(format_exc()))

        return result

    def special_chars(self, domain, subdomain, path):
        """Count the occurrences of ``- . / @ ? & = _`` over all three URL parts."""

        special_char = {'-': 0, ".": 0, "/": 0, '@': 0, '?': 0, '&': 0, '=': 0, "_": 0}

        for part in (domain, subdomain, path):
            for l in part:
                if l in special_char:
                    special_char[l] += 1

        return special_char

    def check_domain_in_list(self, domain):
        """Flag whether `domain` is an exact entry of the loaded brand list."""

        return {'domain_in_brand_list': 1 if domain in self.allbrand else 0}

    def nlp_features(self, words_raw):
        """Run the NLP pipeline on the raw words and return its result dict.

        The words are grouped by ``nlp_manager.parse``, the ``'len_gt_7'`` group
        is passed to the word splitter, and both feed ``fraud_analysis`` and
        ``evaluate``. The dict returned by ``evaluate`` is read by the caller
        through its ``'features'`` and ``'info'`` keys.

        According to the original inline notes (not verifiable from this
        class), the grouped words hold keywords_in_words, brands_in_words,
        dga_in_words, len_lt_7 and len_gt_7, and the fraud analysis yields
        found_keywords, found_brands, similar_to_keyword, similar_to_brand,
        other_words and target_words.
        """
        grouped_words = self.nlp_manager.parse(words_raw)

        splitted_words = self.word_splitter._splitl(grouped_words['len_gt_7'])

        fraud_analyze_result = self.nlp_manager.fraud_analysis(grouped_words, splitted_words)

        result = self.nlp_manager.evaluate(grouped_words, fraud_analyze_result, splitted_words)
        split = {'raw': grouped_words['len_gt_7'], 'splitted': splitted_words}
        # The key is spelled 'compoun_words' in the original; kept as-is because
        # it is part of the emitted data.
        result['info']['compoun_words'] = split

        return result

## Rules extraction (Ubah NLP  Feature jadi dictionary)

In [45]:
from tqdm import tqdm

from traceback import format_exc
from ns_log import NsLog
from url_rules import url_rules
from active_rules import active_rules


class rule_extraction:
    """Run the URL rules over a list of parsed domains."""

    def __init__(self):
        """Create the logger and the rule objects used during extraction."""
        self.logger = NsLog("log")
        self.url_rules_o = url_rules()
        self.active_rules_o = active_rules()

    def extraction(self, parsed_domains):
        """Extract the URL features of every record in `parsed_domains`.

        Args:
            parsed_domains: iterable of dicts providing at least the keys
                'domain', 'tld', 'subdomain', 'path' and 'words_raw'.

        Returns:
            list[dict]: one ``{'info': ..., 'url_features': ...}`` entry per
            processed record. ``info`` is a copy of the record in which
            'words_raw' has been moved under ``info['nlp_info']``.
            If a record raises, the error is logged and the entries collected
            up to that point are returned.
        """

        self.logger.info("rule_extraction.extraction() is running")

        self.parser_object = domain_parser()
        domain_features = []
        try:
            for line in tqdm(parsed_domains):
                # Work on a shallow copy: the original popped 'words_raw' from
                # the caller's own dict, so running the extraction a second
                # time on the same data hit a KeyError and returned [].
                info = dict(line)

                # This is where the URL rules run.
                nlp_info, url_features = self.url_rules_o.rules_main(info['domain'],
                                                                     info['tld'],
                                                                     info['subdomain'],
                                                                     info['path'],
                                                                     info['words_raw'])

                info['nlp_info'] = nlp_info
                info['nlp_info']['words_raw'] = info['words_raw']
                info.pop("words_raw", None)

                # DNS rules and active rules (safe browsing) are disabled.
                outputDict = {}
                outputDict['info'] = info
                outputDict['url_features'] = url_features

                domain_features.append(outputDict)

        except Exception:
            # Not a bare except, so that Ctrl+C can interrupt the long loop.
            self.logger.error("Error : {0}".format(format_exc()))

        return domain_features


## Ubah dari dictionary jadi JSON

In [46]:
from domain_parser import domain_parser
import json

In [47]:
# Load the parsed URL records; the context manager closes the file handle.
with open("parsed_domain_list.json", "r") as parsed_file:
    data = json.load(parsed_file)

In [12]:
# Sanity check: number of records currently held in `data`.
len(data)

73575

In [13]:
# Run the URL/NLP feature extraction over every record in `data`.
# The recorded run below took about 51 minutes for 73575 records.
rules = rule_extraction()
extracted_rules = rules.extraction(data)

2020-12-31 01:39:53,906 - INFO - rule_extraction.extraction() is running
  0%|                                                                                        | 0/73575 [00:00<?, ?it/s]

initializing


100%|████████████████████████████████████████████████████████████████████████████| 73575/73575 [51:06<00:00, 24.00it/s]


In [14]:
# Persist the extracted features; the context manager closes the file even if
# serialization or writing fails.
with open("extracted_rules.json", "w") as rules_file:
    rules_file.write(json.dumps(extracted_rules))

## Convert json ke arff

In [48]:
def convert_for_train(features, param):
    """Render extracted features as an ARFF document (returned as a string).

    Args:
        features: list of dicts, each with 'url_features' (name -> value) and
            'info' (whose 'class' entry is the label string). When `param` is
            '-a', each dict must also provide 'active_features'.
        param: '-a' to append the active features after the URL features;
            any other value emits URL features only.

    Returns:
        str: the ARFF text. The attribute list is taken from the keys of the
        first record; every row ends with that record's class label. An empty
        `features` list yields a header with only the class attribute and no
        data rows (the original raised IndexError).

    Note:
        Attribute names are emitted unquoted and the class values are spelled
        '{phising, legitimate}', exactly as before.
    """
    include_active = param == '-a'

    # Column order is defined by the first record.
    features_keys_url = list(features[0]['url_features'].keys()) if features else []
    features_keys_active = []
    if include_active and features:
        features_keys_active = list(features[0]['active_features'].keys())

    # Collect the pieces in a list and join once instead of growing one string
    # row by row.
    parts = ['@relation weka-test\n\n']
    for name in features_keys_url + features_keys_active:
        parts.append('@attribute ' + name + " numeric\n")
    parts.append('@attribute class {phising, legitimate}' + "\n\n@data\n")

    for each_domain in features:
        row = [str(each_domain['url_features'][key]) for key in features_keys_url]
        if include_active:
            row.extend(str(each_domain['active_features'][key_a]) for key_a in features_keys_active)
        row.append(each_domain['info']['class'])
        parts.append(",".join(row) + "\n")

    return "".join(parts)

In [16]:
arff_str = convert_for_train(extracted_rules, '') # TODO: pass the '-a' param to include active_features

In [17]:
# Write the ARFF text; the context manager closes the file even if the write fails.
with open("features_1.arff", "w") as arff_file:
    arff_file.write(arff_str)

# Train data NLP feature

In [2]:
from scipy.io import arff
import numpy as np

In [3]:
# Load the feature ARFF and split it into a float32 feature matrix (`train`)
# and the label column (`target`).
# NOTE(review): `train` rebinds the name of the train() function defined in an
# earlier cell of this notebook.
file = "features_NLP.arff"

# Context manager so the handle passed to loadarff is closed afterwards.
with open(file, "r") as arff_file:
    train_dataset, train_meta = arff.loadarff(arff_file)

column_names = train_meta.names()
train = train_dataset[column_names[:-1]]   # everything but the last column
target = train_dataset[column_names[-1]]   # last column
# Convert the structured records into a plain 2-D float32 array.
train = np.asarray(train.tolist(), dtype=np.float32)

## Tes cross validation 10 fold data

In [6]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [7]:
# 10-fold cross validation of Gaussian Naive Bayes.
# cross_validate computes all four metrics in ONE pass over the folds, instead
# of repeating the whole cross validation once per metric.
from sklearn.model_selection import cross_validate

NB = GaussianNB()
clf = NB.fit(train, target)
cv_results = cross_validate(
    clf, train, target, cv=10,
    scoring=["accuracy", "precision_weighted", "recall_weighted", "f1_weighted"],
)
scores = cv_results["test_accuracy"]
prec_scores = cv_results["test_precision_weighted"]
recall_scores = cv_results["test_recall_weighted"]
f1_scores = cv_results["test_f1_weighted"]
print("Accuracy : ",np.average(scores))
print("Precision : ",np.average(prec_scores))
print("Recall : ",np.average(recall_scores))
print("F1 : ",np.average(f1_scores))

Accuracy :  0.6388724888933339
Precision :  0.7436953007424807
Recall :  0.6388724888933339
F1 :  0.5965821045834743


In [None]:
# 10-fold cross validation of a decision tree.
# cross_validate computes all four metrics in ONE pass over the folds, so the
# metrics describe the same fitted trees and the work is not repeated per metric.
from sklearn.model_selection import cross_validate

decisionTree = DecisionTreeClassifier()
clf = decisionTree
cv_results = cross_validate(
    clf, train, target, cv=10,
    scoring=["accuracy", "precision_weighted", "recall_weighted", "f1_weighted"],
)
scores = cv_results["test_accuracy"]
prec_scores = cv_results["test_precision_weighted"]
recall_scores = cv_results["test_recall_weighted"]
f1_scores = cv_results["test_f1_weighted"]
print("Accuracy : ",np.average(scores))
print("Precision : ",np.average(prec_scores))
print("Recall : ",np.average(recall_scores))
print("F1 : ",np.average(f1_scores))

In [None]:
# 10-fold cross validation of a random forest.
# cross_validate computes all four metrics in ONE pass over the folds, instead
# of training the forests again for every metric.
from sklearn.model_selection import cross_validate

clf = RandomForestClassifier(n_estimators=10, random_state=0, verbose=1)
cv_results = cross_validate(
    clf, train, target, cv=10,
    scoring=["accuracy", "precision_weighted", "recall_weighted", "f1_weighted"],
)
scores = cv_results["test_accuracy"]
prec_scores = cv_results["test_precision_weighted"]
recall_scores = cv_results["test_recall_weighted"]
f1_scores = cv_results["test_f1_weighted"]
print("Accuracy : ",np.average(scores))
print("Precision : ",np.average(prec_scores))
print("Recall : ",np.average(recall_scores))
print("F1 : ",np.average(f1_scores))

## Ubah url ke ARFF

In [54]:
def _arff_quote(value):
    """Quote a string value for an ARFF data row.

    Single quotes are used unless the value itself contains one, in which case
    double quotes are used. A value containing BOTH quote characters is wrapped
    in single quotes with backslash escapes (the original emitted an
    unterminated string for such values).
    """
    if "'" not in value:
        return "'" + value + "'"
    if '"' not in value:
        return '"' + value + '"'
    # NOTE(review): relies on the ARFF reader honouring backslash escapes
    # inside quoted strings, as Weka does - confirm for other consumers.
    return "'" + value.replace("\\", "\\\\").replace("'", "\\'") + "'"


# Build the URL-only ARFF document: header, then all legitimate URLs, then all
# phishing URLs. Rows are collected in a list and joined once.
arff_parts = [
    '''@relation weka-test\n\n''',
    '@attribute url string\n',
    '@attribute class {phishing, legitimate}' + "\n\n@data\n",
]

for label, urls in (("legitimate", data_legitimate), ("phishing", data_phising)):
    for word in urls:
        arff_parts.append(_arff_quote(word) + "," + label + "\n")

ArffStr = "".join(arff_parts)

In [55]:
# Write the URL ARFF text; the context manager closes the file even if the write fails.
with open("url.arff", "w") as arff_file:
    arff_file.write(ArffStr)

# Convert data url ke arff

In [12]:
# Combined URL list: all phishing URLs first, followed by all legitimate URLs.
# This rebinds `data`, which earlier cells used for other content.
data = data_phising + data_legitimate

In [90]:
# Reload the previously saved features; the context manager closes the file handle.
with open("extracted_rules.json", "r") as rules_file:
    extracted_rules = json.load(rules_file)

In [99]:
# NOTE(review): convert_for_NLP_with_features is not defined in the visible
# part of this notebook; it presumably comes from a removed cell or another
# module - confirm before a Restart & Run All.
arffStr = convert_for_NLP_with_features(extracted_rules, data)

In [26]:
# Display the generated ARFF text (this echoes the whole string as cell output).
arffStr

'@relation weka-test\n\n@attribute url string\n@attribute words string\n@attribute domain_digit_count numeric\n@attribute subdomain_digit_count numeric\n@attribute path_digit_count numeric\n@attribute domain_length numeric\n@attribute subdomain_length numeric\n@attribute path_length numeric\n@attribute isKnownTld numeric\n@attribute www numeric\n@attribute com numeric\n@attribute punnyCode numeric\n@attribute random_domain numeric\n@attribute subDomainCount numeric\n@attribute char_repeat numeric\n@attribute - numeric\n@attribute . numeric\n@attribute / numeric\n@attribute @ numeric\n@attribute ? numeric\n@attribute & numeric\n@attribute = numeric\n@attribute _ numeric\n@attribute domain_in_brand_list numeric\n@attribute raw_word_count numeric\n@attribute splitted_word_count numeric\n@attribute average_word_length numeric\n@attribute longest_word_length numeric\n@attribute shortest_word_length numeric\n@attribute std_word_length numeric\n@attribute compound_word_count numeric\n@attribu

In [100]:
# Write the hybrid ARFF text; the context manager closes the file even if the write fails.
with open("Hybrid_features.arff", "w") as arff_file:
    arff_file.write(arffStr)