In [272]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from termcolor import colored

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Preparación de los datos

In [273]:
# Print a bold status banner, then read the dataset from the CSV file.
print(colored(text='Cargamos los datos', attrs=['bold']))
data = pd.read_csv(filepath_or_buffer='./data_identificada.csv')
# Show the first two rows as a quick sanity check of the parsed columns.
data.head(n=2)

[1mCargamos los datos[0m


Unnamed: 0,index,Name,Netflows,First_Protocol,Second_Protocol,Third_Protocol,p1_d,p2_d,p3_d,duration,...,second_dp,third_dp,p1_ip,p2_ip,p3_ip,p1_ib,p2_ib,p3_ib,Type,Cryptocurrency
0,2169,1113-csves/1.csv,400,TCP,UDP,,0.0,0.0,9.04175,4287.992,...,53.0,80.0,1.0,1.0,5.5,32.0,33.0,873.0,not_mine,Does not have
1,418,214-csves/1.csv,46,UDP,,,0.0,0.0,0.0,0.0,...,49129.0,47733.0,1.0,1.0,1.0,33.0,49.0,110.75,not_mine,Does not have


In [274]:
# Drop the second and third protocol columns from the working frame.
data = data.drop(columns=['Second_Protocol', 'Third_Protocol'])

# Eliminar características que no aportan valor

In [275]:
# Drop the 'index' and 'Name' columns from the working frame.
data = data.drop(columns=['index', 'Name'])

# Imputación de valores ausentes

In [276]:
# Collect the four port columns that go through imputation, in a fixed order.
punter = data[['second_sp', 'second_dp', 'third_sp', 'third_dp']]
# Replace NaN entries with the per-column median.
imputer = SimpleImputer(strategy="median", missing_values=np.nan)
values = imputer.fit_transform(punter)

In [277]:
# Swap the original port columns for their imputed versions.
temp = data.drop(['second_sp','second_dp','third_sp','third_dp'], axis = 1)
# Reuse the source index: pd.concat(axis=1) aligns rows by index label, so a
# freshly numbered RangeIndex would misalign (and introduce NaN rows) whenever
# `data` does not carry the default 0..n-1 index.
punter = pd.DataFrame(values, columns = punter.columns, index = punter.index)
data = pd.concat([temp,punter], axis = 1)
# The imputed columns now sit at the end of the frame.
data.head(2)

Unnamed: 0,Netflows,First_Protocol,p1_d,p2_d,p3_d,duration,max_d,min_d,#packets,Avg_bps,...,p3_ip,p1_ib,p2_ib,p3_ib,Type,Cryptocurrency,second_sp,second_dp,third_sp,third_dp
0,400,TCP,0.0,0.0,9.04175,4287.992,65.519,0.0,6750,360512,...,5.5,32.0,33.0,873.0,not_mine,Does not have,53.0,53.0,80.0,80.0
1,46,UDP,0.0,0.0,0.0,0.0,0.0,0.0,46,1296,...,1.0,33.0,49.0,110.75,not_mine,Does not have,49129.0,49129.0,47733.0,47733.0


# Exclusión de variables con varianza próxima a cero

# Eliminar la multicolinealidad

# OneHotEncoder

In [278]:
# Keep only the object-dtype columns; these are treated as categorical.
data_categoric = data.select_dtypes(include=['object'])
# Fit a one-hot encoder that drops the first category of each column.
# The encoded matrix is only displayed here, not stored.
one_hot = OneHotEncoder(drop="first")
one_hot.fit_transform(data_categoric)

<4733x9 sparse matrix of type '<class 'numpy.float64'>'
	with 11481 stored elements in Compressed Sparse Row format>

In [279]:
# Display the categories the fitted encoder found for each categorical column.
one_hot.categories_

[array(['ICMP', 'ICMP6', 'TCP', 'UDP'], dtype=object),
 array(['mine', 'not_mine'], dtype=object),
 array(['Bitcash', 'Bitcoin', 'Does not have', 'Etherium', 'Litecoin',
        'Monero'], dtype=object)]

In [280]:
# Build one indicator column per category of every categorical column.
# The dtype is pinned to uint8 so the indicators are 0/1 integers on every
# pandas version; pandas >= 2.0 defaults to bool, which would be written to
# CSV as True/False instead.
dataDummy = pd.get_dummies(data_categoric, dtype=np.uint8)
dataDummy.head()

Unnamed: 0,First_Protocol_ICMP,First_Protocol_ICMP6,First_Protocol_TCP,First_Protocol_UDP,Type_mine,Type_not_mine,Cryptocurrency_Bitcash,Cryptocurrency_Bitcoin,Cryptocurrency_Does not have,Cryptocurrency_Etherium,Cryptocurrency_Litecoin,Cryptocurrency_Monero
0,0,0,1,0,0,1,0,0,1,0,0,0
1,0,0,0,1,0,1,0,0,1,0,0,0
2,0,0,1,0,0,1,0,0,1,0,0,0
3,0,0,1,0,1,0,0,1,0,0,0,0
4,0,0,1,0,0,1,0,0,1,0,0,0


# Estandarización

In [281]:
# Keep only the float64/int64 columns; these are the features to standardise.
data_numeric = data.select_dtypes(include=['float64', 'int64'])

In [282]:
# Standardise every numeric column with StandardScaler.
# ColumnTransformer lives in sklearn.compose and has to be imported in the
# notebook's import cell, otherwise this cell raises NameError on a fresh kernel.
# Every column of data_numeric is listed for scaling, so remainder='passthrough'
# has nothing left to pass through here.
preprocessor = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), data_numeric.columns),
    ],
    remainder='passthrough',
)

In [283]:
# Fit the preprocessor on the numeric columns and transform them in one step.
# NOTE: this rebinds `values`, which previously held the imputer output.
values = preprocessor.fit_transform(data_numeric)
# Display the resulting array.
values

array([[ 1.5753182 , -0.70094869, -0.80447685, ..., -0.80602925,
        -0.48227582, -0.36157031],
       [-0.27272335, -0.70094869, -0.80447685, ...,  1.11078473,
         2.01069298,  2.94298939],
       [ 0.47380191, -0.15963957, -0.24099573, ..., -0.80329519,
        -0.48227582, -0.36157031],
       ...,
       [-0.50242342,  0.70358428,  0.64043234, ...,  1.3982912 ,
        -0.48002628, -0.3363976 ],
       [-0.50242342,  2.11540637,  2.09284154, ..., -0.67776254,
        -0.48002628, -0.3363976 ],
       [-0.50242342,  2.43640092,  2.42316607, ...,  1.48773419,
        -0.48002628, -0.3363976 ]])

In [284]:
# Wrap the scaled array back into a DataFrame with the original column names.
# The source index is carried over so that the later column-wise concat with
# the dummy columns aligns row-for-row instead of relying on both frames
# having a default 0..n-1 index.
data_standarizada = pd.DataFrame(
    values,
    columns=data_numeric.columns,
    index=data_numeric.index,
)
data_standarizada.head(1)

Unnamed: 0,Netflows,p1_d,p2_d,p3_d,duration,max_d,min_d,#packets,Avg_bps,Avg_pps,...,p1_ip,p2_ip,p3_ip,p1_ib,p2_ib,p3_ib,second_sp,second_dp,third_sp,third_dp
0,1.575318,-0.700949,-0.804477,-0.909496,0.054551,-0.864733,-0.625504,0.570941,1.236305,1.285332,...,-0.368592,-0.401682,-0.402089,-0.424797,-0.488919,-0.510645,-0.825878,-0.806029,-0.482276,-0.36157


### Concatenación de los conjuntos de datos


In [285]:
# Join the standardised numeric columns and the dummy columns side by side.
data_p = pd.concat(objs=[data_standarizada, dataDummy], axis='columns')
# List the columns of the combined frame.
data_p.columns

Index(['Netflows', 'p1_d', 'p2_d', 'p3_d', 'duration', 'max_d', 'min_d',
       '#packets', 'Avg_bps', 'Avg_pps', 'Avg_bpp', '#Bytes', '#sp', '#dp',
       'first_sp', 'first_dp', 'p1_ip', 'p2_ip', 'p3_ip', 'p1_ib', 'p2_ib',
       'p3_ib', 'second_sp', 'second_dp', 'third_sp', 'third_dp',
       'First_Protocol_ICMP', 'First_Protocol_ICMP6', 'First_Protocol_TCP',
       'First_Protocol_UDP', 'Type_mine', 'Type_not_mine',
       'Cryptocurrency_Bitcash', 'Cryptocurrency_Bitcoin',
       'Cryptocurrency_Does not have', 'Cryptocurrency_Etherium',
       'Cryptocurrency_Litecoin', 'Cryptocurrency_Monero'],
      dtype='object')

In [286]:
# Write the processed dataset to disk without the index column.
data_p.to_csv(path_or_buf='data_i.csv', index=False)

Referencia sobre imputación iterativa: https://machinelearningmastery.com/iterative-imputation-for-missing-values-in-machine-learning/