# Cargamos las librerías y los datasets

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
# Main tables: train and test, read from the local data/ folder.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Auxiliary tables; each one is later joined to train/test on the `id` column.
event_type = pd.read_csv('data/event_type.csv')
resource_type = pd.read_csv('data/resource_type.csv')
log_feature = pd.read_csv('data/log_feature.csv')
severity_type = pd.read_csv('data/severity_type.csv')

## Event type

La estrategia a seguir es codificar las variables categóricas; para esto crearemos una columna por cada categoría del dataset. Para event_type tenemos 53 categorías distintas, por lo que el dataset resultante será de (18552, 54). Esto nos permite tener una matriz en la cual cada id tendrá asociados sus distintos tipos de evento.

In [3]:
# Preview the first five rows of the raw event_type table.
event_type.head()

Unnamed: 0,id,event_type
0,6597,event_type 11
1,8011,event_type 15
2,2597,event_type 15
3,5022,event_type 15
4,5022,event_type 11


In [4]:
# Step 1: expand the categorical column into one indicator column per event type.
event_type_dummies = pd.get_dummies(event_type, columns=['event_type'])
# Step 2: collapse to a single row per id; summing the indicators gives, for
# every id, how many times each event type appears.
event_type_matrix = event_type_dummies.groupby(['id'], as_index=False).sum()
event_type_matrix.head()

Unnamed: 0,id,event_type_event_type 1,event_type_event_type 10,event_type_event_type 11,event_type_event_type 12,event_type_event_type 13,event_type_event_type 14,event_type_event_type 15,event_type_event_type 17,event_type_event_type 18,...,event_type_event_type 5,event_type_event_type 50,event_type_event_type 51,event_type_event_type 52,event_type_event_type 53,event_type_event_type 54,event_type_event_type 6,event_type_event_type 7,event_type_event_type 8,event_type_event_type 9
0,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Resource 

In [5]:
# Preview the first five rows of the raw resource_type table.
resource_type.head(5)

Unnamed: 0,id,resource_type
0,6597,resource_type 8
1,8011,resource_type 8
2,2597,resource_type 8
3,5022,resource_type 8
4,6852,resource_type 8


In [6]:
# One indicator column per resource type ...
resource_type_dummies = pd.get_dummies(resource_type, columns=['resource_type'])
# ... then one row per id, with the indicators summed within each id.
resource_type_matrix = resource_type_dummies.groupby(['id'], as_index=False).sum()
resource_type_matrix.head(5)

Unnamed: 0,id,resource_type_resource_type 1,resource_type_resource_type 10,resource_type_resource_type 2,resource_type_resource_type 3,resource_type_resource_type 4,resource_type_resource_type 5,resource_type_resource_type 6,resource_type_resource_type 7,resource_type_resource_type 8,resource_type_resource_type 9
0,1,0,0,0,0,0,0,1,0,1,0
1,2,0,0,1,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,1,0
3,4,0,0,1,0,0,0,0,0,0,0
4,5,0,0,1,0,0,0,0,0,0,0


In [7]:
# (rows, columns) of the per-id resource matrix: `id` plus one column per resource type.
resource_type_matrix.shape

(18552, 11)

### Log_feature

La estrategia a seguir es similar a la de los demás datasets: se creará una columna por cada categoría de log_feature, y el dataset resultante tendrá una dimensión de (18552, 388). La columna volume no se codifica; al agrupar por id se suma, de modo que cada id conserva su volumen total.

In [8]:
# Preview the first five rows of the raw log_feature table.
log_feature.head(5)

Unnamed: 0,id,log_feature,volume
0,6597,feature 68,6
1,8011,feature 68,7
2,2597,feature 68,1
3,5022,feature 172,2
4,5022,feature 56,1


In [9]:
# Only `log_feature` is expanded into indicator columns; `volume` is left as a
# plain numeric column by get_dummies.
log_feature_dummies = pd.get_dummies(log_feature, columns=['log_feature'])
# Aggregating per id sums every non-key column, so `volume` becomes the total
# volume for the id and each indicator becomes a count of that feature.
log_feature_matrix = log_feature_dummies.groupby(['id'], as_index=False).sum()

In [10]:
# Preview the first five rows of the per-id log-feature matrix.
log_feature_matrix.head(5)

Unnamed: 0,id,volume,log_feature_feature 1,log_feature_feature 10,log_feature_feature 100,log_feature_feature 101,log_feature_feature 102,log_feature_feature 103,log_feature_feature 104,log_feature_feature 105,...,log_feature_feature 90,log_feature_feature 91,log_feature_feature 92,log_feature_feature 93,log_feature_feature 94,log_feature_feature 95,log_feature_feature 96,log_feature_feature 97,log_feature_feature 98,log_feature_feature 99
0,1,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,17,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Severity type

Este dataset solo tiene un registro por id

In [11]:
# Preview the first five rows of the raw severity_type table.
severity_type.head(5)

Unnamed: 0,id,severity_type
0,6597,severity_type 2
1,8011,severity_type 2
2,2597,severity_type 2
3,5022,severity_type 1
4,6852,severity_type 1


In [12]:
# Same recipe as the other tables: indicator columns per severity type,
# then a per-id sum so the result has one row per id.
severity_type_dummies = pd.get_dummies(severity_type, columns=['severity_type'])
severity_type_matrix = severity_type_dummies.groupby(['id'], as_index=False).sum()

In [13]:
# Preview the first five rows of the per-id severity matrix.
severity_type_matrix.head(5)

Unnamed: 0,id,severity_type_severity_type 1,severity_type_severity_type 2,severity_type_severity_type 3,severity_type_severity_type 4,severity_type_severity_type 5
0,1,1,0,0,0,0
1,2,0,1,0,0,0
2,3,1,0,0,0,0
3,4,0,0,0,1,0
4,5,0,1,0,0,0


### Train y Test Locations

La estrategia a seguir es etiquetar cada categoría con un número, utilizando LabelEncoder de scikit-learn. Se concatenarán ambos datasets para que LabelEncoder etiquete las locaciones del mismo modo en train y en test.

In [14]:
# Stack train on top of test (row-wise). Columns present in only one of the
# two frames are filled with NaN for the other; column order is not sorted.
frames = [train, test]
completo = pd.concat(frames, axis=0, sort=False)

In [15]:
# A single encoder is fitted on the stacked frame so that the same location
# string maps to the same integer code in both train and test.
le = LabelEncoder()
completo['location'] = le.fit_transform(completo['location'])

# Keep only the rows with no missing value in any column; these become the
# training set.
# NOTE(review): this relies on the test rows having a NaN somewhere
# (presumably the label column) - confirm.
train = completo.dropna()

In [16]:
# Recover the test rows from the stacked frame without hard-coding the number
# of training rows (previously the literal 7381, which silently mis-splits the
# data if train.csv ever changes size).
#
# The last column of `completo` is the one dropped for test (assumed to be the
# label, which test.csv does not provide, so it is NaN for every test row
# after the concat). Test rows are therefore the rows where that column is null.
is_unlabeled = completo.iloc[:, -1].isna().values

# A positional boolean mask is used because the concatenated index contains
# repeated labels. `:-1` drops the (all-NaN) last column, as before.
test = completo.iloc[is_unlabeled, :-1]

### Generando un dataset con todas las variables 

In [17]:
# Entrenamiento
train = train.merge(event_type_matrix, on=['id'])
train = train.merge(resource_type_matrix, on=['id'])
train = train.merge(log_feature_matrix, on=['id'])
train = train.merge(severity_type_matrix, on=['id'])

In [18]:
# Prueba 
test = test.merge(event_type_matrix, on=['id'])
test = test.merge(resource_type_matrix, on=['id'])
test = test.merge(log_feature_matrix, on=['id'])
test = test.merge(severity_type_matrix, on=['id'])

In [19]:
# Sanity check: test should have exactly one column fewer than train.
len(test.columns) == len(train.columns) - 1

True

In [20]:
# Persist both assembled frames to the working directory, without the index.
for frame, out_path in ((train, 'train_completo.csv'), (test, 'test_completo.csv')):
    frame.to_csv(out_path, index=False)