In [4]:
import pandas as pd
import numpy as np
import scipy
import math
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import neighbors
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

%matplotlib inline





Malware Capture Facility
CVUT University, Prague, Czech Republic

These files were generated as part of a research project in the CVUT University, Prague, Czech Republic.
The goal is to store long-lived real botnet traffic and to generate labeled netflows files.
If you have any questions, feel free to contact us:
Sebastian Garcia, sebastian.garcia@agents.fel.cvut.cz
Vojtech Uhlir, vojtech.uhlir@agents.fel.cvut.cz

Disclaimer: You are free to use these files as long as you reference this project and the authors.
#########################

CLF
===
The CLF (Common Log Format) file contains the web logs of the pcap file as extracted by the justniffer tool. The command used was:
justniffer -f file.pcap > file.clf


Weblogs
=======
The weblogs are files similar to the CLF file but in a different format. They were generated with this command:

justniffer -f $1 -p "port 80 or port 8080 or port 3128" -l "%request.timestamp2(%s) %dest.port %response.code %response.size %source.port %request.size http://%request.header.host%request.url %connection.time %dest.ip %source.ip %response.header.content-type %request.header.referer %request.header.user-agent" |awk '{if ($11 ~ /\;/) print $1" "$2" "$3" "$4" "$5" "$6" "$7" "($8*1000)" "$9" "$10" "substr($11,1,match($11,/\;/)-1)" "$13" "$14" "substr($0,index($0,$15)); else print $1" "$2" "$3" "$4" "$5" "$6" "$7" "($8*1000)" "$9" "substr($0,index($0,$10))}'|awk '{printf "%.3f %s %s %s %s %s %s %.0f %s %s\n", $1, $2, $3, $4, $5, $6, $7, $8, $9, substr($0,index($0,$10))}'|grep -v "Mb\|rZl" > $FILE.weblog
# The last grep is to avoid some lines with binary data. Sometimes the botnet uses these ports but not for HTTP, so we delete those lines.


Netflows
========
The netflows are generated using the 2013-08-12_argus.conf file, the 2013-08-12_ra.conf file and the 2013-08-12_ralabel.conf file. We are using bidirectional argus records.
The commands used are these:
1- argus -F argus.conf -r file.pcap -w file.argus
2- ralabel -f ralabel.conf -r file.argus -w file.argus.labeled
3- mv file.argus.labeled file.argus (this is to add labels to the argus file)
4- ra -F ra.conf -Z b -nr file.argus > file.argus.netflow.labeled

If you need the netflows without the labels, just regenerate them without the ralabel command.

Pcap
====
The pcap capture files were made by VirtualBox, because the VMs were NATed. This means that all the captures start on 1970/1/1 because of a bug in VirtualBox. As a result, the pcap captures cannot be merged.

Labels
======
Labels were assigned using the ralabel program from the argus suite. The assignment rules are not being published, but can be requested by mail.



Generic info
------------
Binary used: 39UvZmv.exe
Md5:46b3df3eaf1312f80788abd43343a9d2  
- The password of the zip is: infected
Probable Name: ?
Virustotal link: https://www.virustotal.com/en/file/8330196e9f62ab96fde8d184d7629d73cd30127dc65050c7c55d586ce367c9c8/analysis/

Infected Machines:
Windows Name: Win8, IP: 10.0.2.22 (Label: Botnet-V1)
Windows Name: Win12, IP: 10.0.2.112 (Label: Botnet-V2)


Histogram of labels
===================
For Win8
--------
  14754 Background-ARP
  21815 From-Botnet-V1-UDP-Establishedd
 123305 From-Botnet-V1-UDP-Attempt
 314679 From-Botnet-V1-TCP-Established
1147621 From-Botnet-V1-SPAM
1341984 From-Botnet-V1-DNS
3528135 From-Botnet-V1-TCP-Attempt
18539704 Background

For Win12
--------
  19978 From-Botnet-V2-UDP-Establishedd
 103722 Background-ARP
 229546 From-Botnet-V2-UDP-Attempt
 296121 From-Botnet-V2-TCP-Established
1352063 From-Botnet-V2-SPAM
1574132 From-Botnet-V2-DNS
5723624 From-Botnet-V2-TCP-Attempt
19291438 Background


Timeline
========
Thu Sep  5 15:40:07 CEST 2013
Started the vm win8. 
Once Windows has booted up, the malware wakes up by itself automatically and starts to send a lot of data.


Thu Sep  5 15:50:06 CEST 2013
We infect Win12.

The malware binary deletes itself and starts to send a lot of encrypted data.

Tue Oct  1 13:37:58 CEST 2013
Win8 is stopped

Tue Oct  1 13:38:29 CEST 2013
Win12 is stopped



Traffic Analysis
================

In [5]:
# Load the Win12 netflow capture and replace every missing value with 0.
df = pd.read_csv('2013-10-01_capture-win12.netflow.csv', low_memory=False).fillna(0)


In [6]:
# Summary statistics (count, mean, std, quartiles, extremes) of the numeric columns.
df.describe()

Unnamed: 0,Dur,sTos,dTos,TotPkts,TotBytes
count,5422750.0,5422750.0,5422750.0,5422750.0,5422750.0
mean,1.150334,1.482076,0.0,4.443069,584.008
std,1.543644,16.80363,0.0,11.98677,2897.215
min,0.0,0.0,0.0,1.0,54.0
25%,0.0,0.0,0.0,1.0,62.0
50%,0.009785,0.0,0.0,2.0,124.0
75%,2.994563,0.0,0.0,2.0,184.0
max,9.195051,192.0,0.0,614.0,578465.0


In [7]:
# Ordinary least squares fit of TotBytes against the other numeric flow columns.
# NOTE: dTos is 0 for every row of this capture (see the df.describe() output),
# so that regressor carries no information: its coefficient comes out as 0 and
# its p-value as NaN.
linear_formula = 'TotBytes ~ Dur+sTos+dTos+TotPkts'

lm = smf.ols(linear_formula, data=df).fit()

In [8]:
# Fitted coefficients: the intercept plus one value per regressor in the formula.
lm.params

Intercept     96.136054
Dur         -165.355228
sTos          -0.625527
dTos           0.000000
TotPkts      152.825171
dtype: float64

In [9]:
# Two-sided p-values of the t-statistics of the fitted coefficients.
lm.pvalues

  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Intercept    0.000000e+00
Dur          0.000000e+00
sTos         2.568300e-26
dTos                  NaN
TotPkts      0.000000e+00
dtype: float64

In [10]:
# Coefficient of determination (R^2) of the OLS fit.
lm.rsquared

0.3688405476256026

In [11]:
# Confidence interval (lower, upper) for each coefficient; statsmodels defaults to 95%.
lm.conf_int()

Unnamed: 0,0,1
Intercept,93.70747,98.564638
Dur,-166.694463,-164.015992
sTos,-0.741036,-0.510017
dTos,0.0,0.0
TotPkts,152.652857,152.997485


In [12]:
# Pairwise Pearson correlation between the numeric columns.
# The frame is restricted to numeric dtypes first: since pandas 2.0,
# DataFrame.corr() no longer silently skips non-numeric columns and raises
# instead, so any non-numeric column in the capture must be excluded explicitly.
# A constant column (dTos here) has undefined correlation and shows up as NaN.
correlation_matrix = df.select_dtypes(include=[np.number]).corr()
print(correlation_matrix)

               Dur      sTos  dTos   TotPkts  TotBytes
Dur       1.000000  0.058019   NaN  0.346264  0.130627
sTos      0.058019  1.000000   NaN  0.040127  0.016632
dTos           NaN       NaN   NaN       NaN       NaN
TotPkts   0.346264  0.040127   NaN  1.000000  0.601638
TotBytes  0.130627  0.016632   NaN  0.601638  1.000000


In [None]:
# k-nearest-neighbours regression (k=5) of TotBytes on the four numeric
# flow features.
feature_columns = ['Dur', 'sTos', 'dTos', 'TotPkts']

# A bare `df.dropna()` returns a new frame and leaves `df` untouched, so the
# incomplete rows are filtered into a dedicated modelling frame instead.
knn_data = df[feature_columns + ['TotBytes']].dropna()

knn = neighbors.KNeighborsRegressor(n_neighbors=5)
X = knn_data[feature_columns]
Y = knn_data['TotBytes']
knn.fit(X, Y)

# Query grid: values 0.0, 0.1, ..., 99.9, with the same value in all four features.
T = pd.DataFrame(np.arange(0, 100, 0.1)[:, np.newaxis])
for column in (1, 2, 3):
    T[column] = T[0]

Y_ = knn.predict(T)

# For a regressor, cross_val_score reports the estimator's default score,
# which is R^2 (not a classification accuracy), so it is labelled as such.
score = cross_val_score(knn, X, Y, cv=5)
print('Unweighted R^2: %0.2f (+/- %0.2f)' % (score.mean(), score.std()))

In [None]:
# Same k-NN regression (k=5), but on features rescaled to a common [0, 1]
# range so that no single feature dominates the distance computation.
knn_n = neighbors.KNeighborsRegressor(n_neighbors=5)
X = df[['Dur', 'sTos', 'dTos', 'TotPkts']]
# preprocessing.normalize() rescales each *row* to unit norm, which erases the
# magnitude of a flow (e.g. 1 packet and 600 packets at zero duration become
# the same point).  Per-feature min-max scaling keeps magnitudes comparable
# and matches the [0, 1) query grid used below.
# NOTE(review): the scaler is fitted on the full data set before
# cross-validation; wrap scaler + regressor in a Pipeline if strict fold
# isolation is required.
normalized_X = preprocessing.MinMaxScaler().fit_transform(X)
Y = df['TotBytes']
knn_n.fit(normalized_X, Y)

# Query grid: values 0.00, 0.01, ..., 0.99, with the same value in all four features.
T = pd.DataFrame(np.arange(0, 1, 0.01)[:, np.newaxis])
for column in (1, 2, 3):
    T[column] = T[0]

Y_ = knn_n.predict(T)

# For a regressor, cross_val_score reports the estimator's default score,
# which is R^2 (not a classification accuracy), so it is labelled as such.
score = cross_val_score(knn_n, normalized_X, Y, cv=5)
print('Unweighted Normalized R^2: %0.2f (+/- %0.2f)' % (score.mean(), score.std()))