In [1]:
# Import lib
import pandas as pd
from sklearn.utils import shuffle

In [2]:
# Read dga.txt
alexa_file = pd.read_csv('alexa-top-1m.txt', header=None)
alexa_file.head()

Unnamed: 0,0
0,google.com
1,youtube.com
2,tmall.com
3,qq.com
4,baidu.com


In [3]:
# Process Alexa data
alexa_domains = alexa_file[0].tolist()

alexa_archive = pd.DataFrame()
alexa_archive['domain'] = alexa_domains
alexa_archive['source'] = "Alexa"
alexa_archive['class'] = 0

alexa_archive

Unnamed: 0,domain,source,class
0,google.com,Alexa,0
1,youtube.com,Alexa,0
2,tmall.com,Alexa,0
3,qq.com,Alexa,0
4,baidu.com,Alexa,0
...,...,...,...
764161,viralnewstalk.com,Alexa,0
764162,wildlifepestsolutions.com,Alexa,0
764163,wonderfirstore.com,Alexa,0
764164,zapforex.com,Alexa,0


In [4]:
# Read dga.txt
dga_file = pd.read_csv('dga.txt', header=None)
dga_file.head()

Unnamed: 0,0,1,2,3
0,pfamswwocljvyq.net,Domain used by Cryptolocker - Flashback DGA fo...,2020-07-09,http://osint.bambenekconsulting.com/manual/cl.txt
1,djwvfctcauagiv.biz,Domain used by Cryptolocker - Flashback DGA fo...,2020-07-09,http://osint.bambenekconsulting.com/manual/cl.txt
2,qwcklhxpslwmyx.ru,Domain used by Cryptolocker - Flashback DGA fo...,2020-07-09,http://osint.bambenekconsulting.com/manual/cl.txt
3,inymbmchhhlwug.org,Domain used by Cryptolocker - Flashback DGA fo...,2020-07-09,http://osint.bambenekconsulting.com/manual/cl.txt
4,jpeiwrimajrlde.co.uk,Domain used by Cryptolocker - Flashback DGA fo...,2020-07-09,http://osint.bambenekconsulting.com/manual/cl.txt


In [5]:
def extract_source(source):
    return str(source).split()[3]

dga_archive = pd.DataFrame()
dga_archive['domain'] = dga_file[0]
dga_archive['source'] = dga_file[1].map(extract_source)

# Extract DGA classes
source_counts = dga_archive['source'].value_counts(sort=True)
print(source_counts)

dga_classes_dict = dict()

for i in range(15):
    # top 15 class
    dga_classes_dict[source_counts.index[i]] = i + 1
    
print(dga_classes_dict)

def is_class(source):
    return (source in dga_classes_dict)

def extract_class(source):
    return dga_classes_dict[source]

# Use only top 15 class domain
dga_archive = dga_archive[dga_archive['source'].map(is_class)]
dga_archive['class'] = dga_archive['source'].map(extract_class)

dga_archive

banjori        439223
qsnatch        105725
tinba           66789
Post            66000
ramnit          64605
                ...  
mirai               2
dromedan            2
g01                 1
madmax              1
xshellghost         1
Name: source, Length: 62, dtype: int64
{'qakbot': 6, 'pykspa': 13, 'shiotob/urlzone/bebloh': 9, 'dyre': 15, 'kraken': 14, 'ramnit': 5, 'tinba': 3, 'ranbyus': 12, 'necurs': 7, 'monerodownloader': 10, 'Post': 4, 'qsnatch': 2, 'banjori': 1, 'simda': 11, 'murofet': 8}


Unnamed: 0,domain,source,class
999,13z8ot279ya361nctxe61qk069r.com,Post,4
1000,o6lu15demoig1ccngms1pigvc5.net,Post,4
1001,k1x4z91801j4lw4fjar17kzgs.biz,Post,4
1002,nvhfot1gcy9mkrbhtzz68uq2u.org,Post,4
1003,1peyon81ng2a76gwapjq11g2cu3.com,Post,4
...,...,...,...
973445,zzyb.xyz,qsnatch,2
973446,zzy.xyz,qsnatch,2
973447,zzza.biz,qsnatch,2
973448,zzzan.biz,qsnatch,2


In [6]:
# Combine Alexa and DGA data
result = pd.concat([alexa_archive, dga_archive], sort=False)

# Shuffle and save the labeled data
result = shuffle(result, random_state=33)
result.to_csv("./dga_label.csv", mode='w', index=False)

result.head()

Unnamed: 0,domain,source,class
512758,tzhpalitydevonianizuwb.com,banjori,1
619141,brpafhmph.com,ramnit,5
307514,hvrqidablyhoosieraw.com,banjori,1
148738,fenews.co.uk,Alexa,0
192833,bdswvinskycattederifg.com,banjori,1
