## Examen: Aprendizaje Automatico

## JFME

### Organizacion general del proyecto (version simple)

1. main.py : funcionalidad principal
2. experiments.py : registro de experimentos
3. functions.py : funciones de proyecto
4. otros archivos complementarios

In [1]:
import numpy as np
import pandas as pd
import functions as fn
import pickle

## Read Input Data

In [2]:
data_train = pd.read_csv('files/train.csv')
data_test = pd.read_csv('files/test.csv')
ids_train = data_train['id']
data_train.drop('id', inplace=True, axis=1)
ids_test = data_test['id']
data_test.drop('id', inplace=True, axis=1)

## Preprocessing variables

In [3]:
feats = ['bone_length', 'rotting_flesh', 'hair_length', 'has_soul']

# ------------------------------------------------------------------------------------- D1: DATA SCALING -- #
mu_train, std_train = data_train[feats].mean(axis=0), data_train[feats].std(axis=0)
z_train = (data_train[feats] - mu_train)/std_train
data_train[feats] = z_train

mu_test, std_test = data_test[feats].mean(axis=0), data_test[feats].std(axis=0)
z_test = (data_test[feats] - mu_test)/std_test
data_test[feats] = z_test

In [4]:
# -- Dummy variables with color
data_train = pd.concat([data_train, pd.get_dummies(data_train['color'], prefix = 'color')], axis=1)
data_train = data_train.drop('color', 1)
data_test = pd.concat([data_test, pd.get_dummies(data_test['color'], prefix = 'color')], axis=1)
data_test = data_test.drop('color', 1)

In [5]:
# -- One-hot encode target variable
data_train['type'] = fn.variable_onehot(p_data=data_train['type'])

In [6]:
# -- Add Bias 
data_train['bias'] = 1
data_columns = list(data_train.columns)
data_columns.remove('bias')
data_train = data_train[['bias'] + data_columns]

data_test['bias'] = 1
data_columns = list(data_test.columns)
data_columns.remove('bias')
data_test = data_test[['bias'] + data_columns]

# -- Convert to np.array
train_data_ovr = fn.data_ovr(p_df=data_train, p_target='type')

### Training multiple models using One Vs Rest Heuristic

In [7]:
# ------------------------------------------------------------------------------- learning based in ovr -- #  
models_ovr = fn.ovr_learning(p_data_ovr=train_data_ovr)

mejora encontrada para:  data_0
mejora encontrada para:  data_1
mejora encontrada para:  data_2


In [8]:
### Results

In [9]:
# model inf
models_ovr['model_0']['train']['cost']
models_ovr['model_0']['val']['cost']
models_ovr['model_0']['fitted_cost']
models_ovr['model_0']['weights']
models_ovr['model_0']['params']

models_ovr['model_0']['train']['cost']
models_ovr['model_0']['val']['cost']
models_ovr['model_0']['fitted_cost']
models_ovr['model_0']['weights']
models_ovr['model_0']['params']

models_ovr['model_0']['train']['cost']
models_ovr['model_0']['val']['cost']
models_ovr['model_0']['fitted_cost']
models_ovr['model_0']['weights']
models_ovr['model_0']['params']

{'lambda': 0.25, 'alpha': 0.1}

In [10]:
# -- convert to np.array
test_data_ovr = np.array(data_test)

# -- prediction based in ovr
# vote weighting (ocurrences in train data)
oc = data_train['type'].value_counts()
vw = [np.round(oc[0]/oc.sum(), 4),
      np.round(oc[1]/oc.sum(), 4),
      np.round(oc[2]/oc.sum(), 4)]

In [11]:
# result
result = fn.ovr_predict(p_data_ovr=test_data_ovr, p_models_ovr=models_ovr, p_vote_w=vw)

# probabilistic results
result.head()

# check for balance of classes before summit results
result['decision'].value_counts()

0    227
2    183
1    119
Name: decision, dtype: int64

In [12]:
# define experiment tag
experiment = 'submission_v23'

# -------------------------------------------------------------------------------------- SUBMISSION FILE -- #
submission = pd.DataFrame({'id': ids_test, 'type': result['decision']})
type_dict_sub = {0: 'Ghoul', 1: 'Goblin', 2: 'Ghost'}
submission['type'] = submission['type'].map(type_dict_sub).astype(object)
submission.to_csv('files/submissions/' + experiment + '.csv', index=False)

# ------------------------------------------------------------------------------------------ PICKLE RICK -- #
pickle_rick = 'files/submissions/' + experiment + '.dat'
with open(pickle_rick, "wb") as f:
    pickle.dump(models_ovr, f)

### Experimento mas alto

In [None]:
# -- [v223]
# 221 - inv-weighted hypercost

# scalling = standarizacion
# dummies = color
# bias = si
# epochs = 10000
# sample split = 0.20
# tolerance = 1e-4
# hyper_m0 = {'lambda': 1.1, 'alpha': 0.01}
# hyper_m1 = {'lambda': 1.1, 'alpha': 0.01}
# hyper_m2 = {'lambda': 1.1, 'alpha': 0.01}
# funcion para eleccion de hyper = train*0.2 + val*0.8

# peso para voto = [0.3477, 0.3369, 0.3154]
# 0    219
# 2    199
# 1    111
# acc_test = 0.7710
# archivo = submission_v223.csv

In [18]:
 # read the file
p_data_file = 'files/submissions/v223/submission_v223.dat'
with open(p_data_file, 'rb') as handle:
    loaded_data = pickle.load(handle)

In [22]:
loaded_data['model_0']['weights']

array([[-0.93651695,  0.93096208, -0.05026663,  1.26367671,  1.11980552,
        -0.01226787, -0.46240712, -0.60463049, -0.59249564, -0.5292599 ,
        -0.12712157]])

In [24]:
loaded_data['model_1']['weights']

array([[-0.4493858 , -0.04434608, -0.54610138,  0.08303783,  0.00189016,
         0.01766091, -0.00620579,  0.04717678, -0.05580948, -0.03131267,
        -0.19821383]])

In [25]:
loaded_data['model_2']['weights']

array([[-0.69380248, -0.53084577,  0.80817991, -0.88950331, -0.82819126,
        -0.00586256,  0.08341573,  0.00269831, -0.15032823,  0.02644557,
        -0.21345502]])