In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preparación de los datos
#### Descripción de las 'features'
- ts:     Hora de inicio del flujo
- te:     Hora de finalización del flujo
- td:     Duración del flujo
- sa:     Dirección IP origen
- da:     Dirección IP de destino
- sp:     Puerto de origen
- dp:     Puerto de destino
- pr:     Protocolo
- flg:    Banderas
- fwd:    Estado de reenvío
- stos:   Tipo de servicio de origen
- ipkt:   Paquetes de entrada
- ibyt:   Bytes de entrada
- opkt:   Paquetes de salida
- obyt:   Bytes de salida
- in:     Interfaz de entrada
- out:    Interfaz de salida
- sas:    Número de sistema autónomo BGP de origen
- das:    Número de sistema autónomo BGP de destino
- smk:    Máscara de red de origen
- dmk:    Máscara de red de destino
- dtos:   Tipo de servicio de destino
- dir:    Dirección del flujo
- nh:     Enrutador de siguiente salto
- nhb:    Enrutador de siguiente salto BGP
- svln:   VLAN de origen
- dvln:   VLAN de destino
- ismc:   Dirección MAC de origen de entrada
- odmc:   Dirección MAC de destino de salida
- idmc:   Dirección MAC de destino de entrada
- osmc:   Dirección MAC de origen de salida
- mpls1:  Etiqueta MPLS 1
- mpls2:  Etiqueta MPLS 2
- mpls3:  Etiqueta MPLS 3
- mpls4:  Etiqueta MPLS 4
- mpls5:  Etiqueta MPLS 5
- mpls6:  Etiqueta MPLS 6
- mpls7:  Etiqueta MPLS 7
- mpls8:  Etiqueta MPLS 8
- mpls9:  Etiqueta MPLS 9
- mpls10: Etiqueta MPLS 10
- cl:     Latencia del cliente
- sl:     Latencia del servidor
- al:     Latencia de la aplicación
- ra:     Dirección IP del sistema exportador (enrutador)
- eng:    Tipo / ID de motor
- exid:   ID del sistema exportador
- tr:     Marca de tiempo de flujo recibido
- type:   Tipo de registro: minería / no minería
- cryptocurrency: Tipo de criptomoneda
- id_Cryptocurrency: Id de la criptomoneda

In [9]:
# Load the dataset from the CSV file and preview its first five rows
data = pd.read_csv(filepath_or_buffer='./dataA.csv')
data.head(n=5)

Unnamed: 0,index,References,ts,te,td,sa,da,sp,dp,pr,...,cl,sl,al,ra,eng,exid,tr,cryptocurrency,id_Cryptocurrency,type
0,33812,134-csves/0.csv,2020-09-10 08:48:49,2020-09-10 08:48:59,9.557,120.55.220.55,192.168.0.13,443.0,56780.0,TCP,...,0.0,0.0,0.0,0.0.0.0,0/0,0.0,1969-12-31 19:00:00.000,,0,not_mine
1,227478,653-csves/0.csv,2020-09-15 19:33:50,2020-09-15 19:33:50,0.0,104.91.165.202,192.168.0.13,80.0,45158.0,TCP,...,0.0,0.0,0.0,0.0.0.0,0/0,0.0,1969-12-31 19:00:00.000,,0,not_mine
2,287057,834-csves/0.csv,2020-09-17 16:16:41,2020-09-17 16:19:42,180.22,192.168.0.13,23.52.112.10,58542.0,443.0,TCP,...,0.0,0.0,0.0,0.0.0.0,0/0,0.0,1969-12-31 19:00:00.000,,0,not_mine
3,387249,1121-csves/0.csv,2020-09-20 12:20:12,2020-09-20 12:20:12,0.0,192.168.0.13,13.35.111.67,58192.0,443.0,TCP,...,0.0,0.0,0.0,0.0.0.0,0/0,0.0,1969-12-31 19:00:00.000,,0,not_mine
4,236501,682-csves/0.csv,2020-09-16 14:09:03,2020-09-16 14:09:52,49.616,192.168.0.13,200.196.225.98,60108.0,443.0,TCP,...,0.0,0.0,0.0,0.0.0.0,0/0,0.0,1969-12-31 19:00:00.000,,0,not_mine


# Análisis inicial

In [10]:
# Print the index range, per-column non-null counts and dtypes of the loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 465084 entries, 0 to 465083
Data columns (total 53 columns):
index                465084 non-null int64
References           465084 non-null object
ts                   465084 non-null object
te                   464976 non-null object
td                   464976 non-null float64
sa                   464976 non-null object
da                   464976 non-null object
sp                   464976 non-null float64
dp                   464976 non-null float64
pr                   464976 non-null object
flg                  464976 non-null object
fwd                  464976 non-null float64
stos                 464976 non-null float64
ipkt                 464976 non-null float64
ibyt                 464976 non-null float64
opkt                 464976 non-null float64
obyt                 464976 non-null float64
in                   464976 non-null float64
out                  464976 non-null float64
sas                  464976 non-null float6

### Inspección datos nulos

In [12]:
# Boolean mask with the same shape as `data`: True where a cell holds a value, False where it is missing.
# NOTE(review): "avaliable" is misspelled; the name is kept because cells outside this view may reference it.
avaliable = data.notna()
# Count the True/False entries of every column, then transpose so each row describes one column of `data`
avaliable.apply(lambda column: column.value_counts()).T

Unnamed: 0,False,True
index,,465084.0
References,,465084.0
ts,,465084.0
te,108.0,464976.0
td,108.0,464976.0
sa,108.0,464976.0
da,108.0,464976.0
sp,108.0,464976.0
dp,108.0,464976.0
pr,108.0,464976.0


Columnas con valores nulos:
- te: 108 valores
- td: 108 valores
- sa: 108 valores
- da: 108 valores
- sp: 108 valores
- dp: 108 valores
- pr: 108 valores
- flg: 108 valores
- fwd: 108 valores
- stos: 108 valores
- ipkt: 108 valores
- ibyt: 108 valores
- opkt: 108 valores
- obyt: 108 valores
- in: 108 valores
- out: 108 valores
- sas: 108 valores
- das: 108 valores
- smk: 108 valores
- dmk: 108 valores
- dtos: 108 valores
- dir: 108 valores
- nh: 108 valores
- nhb: 108 valores
- svln: 108 valores
- dvln: 108 valores
- ismc: 108 valores
- odmc: 108 valores
- idmc: 108 valores
- osmc: 108 valores
- mpls1: 108 valores
- mpls2: 108 valores
- mpls3: 108 valores
- mpls4: 108 valores
- mpls5: 108 valores
- mpls6: 108 valores
- mpls7: 108 valores
- mpls8: 108 valores
- mpls9: 108 valores
- mpls10: 108 valores
- cl: 108 valores
- sl: 108 valores
- al: 108 valores
- ra: 108 valores
- eng: 108 valores
- exid: 108 valores
- tr: 108 valores
- cryptocurrency: 451995 valores

### Estructuración del conjunto de datos
Agrupo las columnas de tipo int64, float64 y object por separado, para hacer un análisis más organizado.

In [15]:
# Split the column names of `data` into three lists according to each column's dtype.
# Each line masks the dtype Series with an equality test and keeps the matching labels in column order.
data_numeric_int = data.dtypes[data.dtypes == "int64"].index.tolist()
data_numeric_float = data.dtypes[data.dtypes == "float64"].index.tolist()
data_object = data.dtypes[data.dtypes == "object"].index.tolist()

'Features' numéricas discretas

In [17]:
# Display the names of the int64 columns
data_numeric_int

['index', 'id_Cryptocurrency']

'Features' numéricas continuas

In [33]:
# Print the names of the float64 columns
print(data_numeric_float)

['td', 'sp', 'dp', 'fwd', 'stos', 'ipkt', 'ibyt', 'opkt', 'obyt', 'in', 'out', 'sas', 'das', 'smk', 'dmk', 'dtos', 'dir', 'svln', 'dvln', 'cl', 'sl', 'al', 'exid']


'Features' categóricas

In [34]:
# Print the names of the object-dtype columns
print(data_object)

['References', 'ts', 'te', 'sa', 'da', 'pr', 'flg', 'nh', 'nhb', 'ismc', 'odmc', 'idmc', 'osmc', 'mpls1', 'mpls2', 'mpls3', 'mpls4', 'mpls5', 'mpls6', 'mpls7', 'mpls8', 'mpls9', 'mpls10', 'ra', 'eng', 'tr', 'cryptocurrency', 'type']


# Inspección de datos no agrupados

In [36]:
# Categorical data: summary of the object-dtype columns (count, unique, top, freq),
# transposed so that each row corresponds to one column of `data`
data.loc[:, data_object].describe().transpose()

Unnamed: 0,count,unique,top,freq
References,465084,3487,1033-csves/1.csv,3313
ts,465084,79737,2020-09-15 12:51:23,192
te,464976,105209,2020-09-15 12:51:23,192
sa,464976,10226,192.168.0.13,220340
da,464976,10252,192.168.0.13,217974
pr,464976,4,TCP,346633
flg,464976,20,...A....,131167
nh,464976,1,0.0.0.0,464976
nhb,464976,1,0.0.0.0,464976
ismc,464976,1,00:00:00:00:00:00,464976


In [38]:
# Continuous data: summary statistics of the float64 columns,
# transposed so that each row corresponds to one column of `data`
data.loc[:, data_numeric_float].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
td,464976.0,39.263422,67.287066,0.0,0.0,1.196,49.00025,599.533
sp,464976.0,23549.590486,24079.914259,0.0,443.0,4444.0,47184.0,65531.0
dp,464976.0,22467.779939,23889.626954,53.0,443.0,1900.0,46292.0,64790.0
fwd,464976.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
stos,464976.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ipkt,464976.0,27.2138,364.232301,1.0,1.0,3.0,9.0,110863.0
ibyt,464976.0,20610.796146,489035.645227,8.0,39.0,96.0,1058.0,163454700.0
opkt,464976.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
obyt,464976.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
in,464976.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# Discrete data: summary statistics of the int64 columns,
# transposed so that each row corresponds to one column of `data`
data.loc[:, data_numeric_int].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
index,465084.0,219749.894886,133767.80344,0.0,103181.75,219452.5,335723.25,451994.0
id_Cryptocurrency,465084.0,0.102579,0.621102,0.0,0.0,0.0,0.0,5.0
