In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from scipy.stats import multivariate_normal
from datetime import datetime

In [4]:
# Training data: the "normales" sheet of estaturas.xlsx. The scaler and the
# Gaussian parameters are later fitted on this frame only.
normales = pd.read_excel("estaturas.xlsx", sheet_name="normales")
normales

Unnamed: 0,Estatura(metros),Edad(años)
0,1.77,26.0
1,1.74,31.0
2,1.72,24.0
3,1.78,34.0
4,1.65,32.0
5,1.66,29.0
6,1.64,27.0
7,1.85,34.0
8,1.85,26.0
9,1.75,21.0


In [49]:
# Held-out normal samples; later split between the CV set (rows 0-3) and the
# test set (rows 4-7).
valtest = pd.read_excel("estaturas.xlsx", sheet_name="valtest(normales)")
valtest 

Unnamed: 0,Estatura(metros),Edad(años)
0,1.82,25.0
1,1.8,27.0
2,1.6,31.0
3,1.6,35.0
4,1.82,30.0
5,1.76,32.0
6,1.79,31.0
7,1.61,31.0


In [50]:
# Known anomalies. The sheet uses shorter column headers, so they are renamed
# to match the headers of the normal-sample frames before concatenation.
anomalias = (
    pd.read_excel("estaturas.xlsx", sheet_name="valtest(anomalias)")
    .rename(columns={"Estatura": "Estatura(metros)", "Edad": "Edad(años)"})
)
anomalias

Unnamed: 0,Estatura(metros),Edad(años)
0,0.25,2019-04-02 00:00:00
1,175.1,2019-05-02 00:00:00
2,0.15,250.0
3,150.0,14.0


In [51]:
# Some ages in the anomalies sheet were read as dates instead of numbers.
# Convert those entries to an age in years relative to a FIXED reference year:
# using datetime.now().year made the result (and everything scaled from it)
# change with the calendar year in which the notebook is executed.
# Only entries that are still dates are converted, so re-running this cell is
# a no-op instead of re-parsing already-converted numbers as timestamps.
REFERENCE_YEAR = 2022  # reproduces the recorded output (2019 dates -> age 3)
anomalias["Edad(años)"] = [
    float(REFERENCE_YEAR - edad.year) if isinstance(edad, datetime) else edad
    for edad in anomalias["Edad(años)"]
]
anomalias

Unnamed: 0,Estatura(metros),Edad(años)
0,0.25,3.0
1,175.1,3.0
2,0.15,250.0
3,150.0,14.0


In [63]:
# Cross-validation set (unscaled): normal rows 0-3 followed by anomaly rows 0-1.
# .loc slices are label-based and include both end points.
pre_cv = pd.concat([valtest.loc[0:3], anomalias.loc[0:1]], ignore_index=True)
pre_cv

Unnamed: 0,Estatura(metros),Edad(años)
0,1.82,25.0
1,1.8,27.0
2,1.6,31.0
3,1.6,35.0
4,0.25,3.0
5,175.1,3.0


In [65]:
# Ground truth for the CV set as a column vector: the four normal rows are
# class 0, the two anomalies appended after them are class 1.
cv_labels = np.repeat([0, 1], [4, 2]).reshape(-1, 1)
cv_labels

array([[0],
       [0],
       [0],
       [0],
       [1],
       [1]])

In [53]:
# Test set (unscaled): normal rows 4-7 followed by anomaly rows 2-3, i.e. the
# rows that were not used for the CV set.
pre_test = pd.concat([valtest.loc[4:7], anomalias.loc[2:3]], ignore_index=True)
pre_test

Unnamed: 0,Estatura(metros),Edad(años)
0,1.82,30.0
1,1.76,32.0
2,1.79,31.0
3,1.61,31.0
4,0.15,250.0
5,150.0,14.0


In [66]:
# Ground truth for the test set as a column vector: the four normal rows are
# class 0, the two anomalies appended after them are class 1.
test_labels = np.repeat([0, 1], [4, 2]).reshape(-1, 1)
test_labels

array([[0],
       [0],
       [0],
       [0],
       [1],
       [1]])

In [54]:
# Standardise using statistics learned from the normal training samples only;
# the same fitted scaler is reused for the CV and test sets.
scaler = StandardScaler().fit(normales)
train = scaler.transform(normales)
train

array([[ 0.62277299, -0.60419651],
       [ 0.21359569,  0.17304657],
       [-0.05918917, -0.91509374],
       [ 0.75916542,  0.63939242],
       [-1.01393619,  0.32849519],
       [-0.87754376, -0.13785066],
       [-1.15032862, -0.44874789],
       [ 1.71391244,  0.63939242],
       [ 1.71391244, -0.60419651],
       [ 0.34998813, -1.38143959],
       [ 0.07720326, -1.07054236],
       [ 0.89555785, -0.75964513],
       [ 1.30473515, -0.91509374],
       [-1.01393619, -0.75964513],
       [ 0.89555785, -0.75964513],
       [-0.05918917,  0.63939242],
       [-0.33197403, -0.60419651],
       [-2.10507564, -0.75964513],
       [-0.87754376, -0.60419651],
       [ 0.34998813, -0.44874789],
       [ 0.62277299,  0.32849519],
       [ 1.03195028,  1.57208412],
       [ 1.71391244, -0.75964513],
       [ 1.30473515, -0.91509374],
       [ 0.34998813,  1.26118689],
       [ 0.07720326,  0.63939242],
       [-2.37786051, -0.13785066],
       [ 0.48638056, -0.29329928],
       [-0.60475889,

In [55]:
# Scale the CV set with the scaler fitted on the training data (no refit).
cv = scaler.transform(pre_cv)
cv

array([[ 1.30473515e+00, -7.59645125e-01],
       [ 1.03195028e+00, -4.48747892e-01],
       [-1.69589835e+00,  1.73046573e-01],
       [-1.69589835e+00,  7.94841038e-01],
       [-2.01088766e+01, -4.17951468e+00],
       [ 2.36471279e+03, -4.17951468e+00]])

In [56]:
# Scale the test set with the scaler fitted on the training data (no refit).
test = scaler.transform(pre_test)
test

array([[ 1.30473515e+00,  1.75979566e-02],
       [ 4.86380558e-01,  3.28495189e-01],
       [ 8.95557853e-01,  1.73046573e-01],
       [-1.55950592e+00,  1.73046573e-01],
       [-2.14728009e+01,  3.42162935e+01],
       [ 2.02236779e+03, -2.46957990e+00]])

# Implementando modelo

### Distribución normal multivariante

In [88]:
def estimate_gaussian(dataset):
    """Estimate the parameters of a multivariate Gaussian from samples.

    Parameters
    ----------
    dataset : array-like with one sample per row and one feature per column.

    Returns
    -------
    tuple
        ``(mu, sigma)``: the per-feature mean and the feature covariance
        matrix as computed by ``np.cov`` with its default normalisation.
    """
    # rowvar=False tells np.cov that columns (not rows) are the variables.
    covariance = np.cov(dataset, rowvar=False)
    center = np.mean(dataset, axis=0)
    return center, covariance

In [89]:
def multivariate_gaussian(dataset, mu, sigma):
    """Evaluate the multivariate normal density at each sample.

    Parameters
    ----------
    dataset : array-like of points at which to evaluate the density
        (one sample per row).
    mu : mean of the distribution.
    sigma : covariance matrix of the distribution.

    Returns
    -------
    The probability density of every sample, as returned by
    ``scipy.stats.multivariate_normal.pdf``.
    """
    return multivariate_normal.pdf(dataset, mean=mu, cov=sigma)

In [90]:
# Fit the Gaussian on the scaled training data only.
mu, sigma = estimate_gaussian(train)
mu, sigma

(array([-1.52550928e-15, -6.59849534e-17]),
 array([[1.01923077, 0.0976939 ],
        [0.0976939 , 1.01923077]]))

# Experimentos

In [91]:
def model_printer(thresholds, data, labels, mu, sigma):
    """Score the Gaussian anomaly detector at each candidate threshold.

    A sample is predicted as positive (anomalous) when its density under the
    Gaussian defined by ``mu`` and ``sigma`` is strictly below the threshold.
    Despite its name, this function prints nothing; it returns the rows.

    Parameters
    ----------
    thresholds : iterable of float
        Density thresholds to evaluate.
    data : array-like
        Samples to score, one per row.
    labels : array-like
        Ground-truth labels for ``data``, passed to ``f1_score`` as ``y_true``.
    mu, sigma
        Mean and covariance of the fitted Gaussian.

    Returns
    -------
    list of list
        One ``["mn", threshold, f1]`` row per threshold, in input order.
    """
    # The densities do not depend on the threshold, so evaluate them once
    # instead of once per loop iteration.
    mn_probs = multivariate_gaussian(data, mu, sigma)
    return [
        ["mn", prob, f1_score(labels, mn_probs < prob, average='binary')]
        for prob in thresholds
    ]

In [94]:
# Coarse sweep on the CV set: thresholds from 0 up to (not including) 1 in
# steps of 0.05, ranked by F1 score.
results = model_printer(np.arange(0, 1, 0.05), cv, cv_labels, mu, sigma)
pd.DataFrame(results, columns = ["model", "threshold", "f1_score"]).sort_values("f1_score", ascending = False)

Unnamed: 0,model,threshold,f1_score
1,mn,0.05,0.571429
10,mn,0.5,0.5
18,mn,0.9,0.5
17,mn,0.85,0.5
16,mn,0.8,0.5
15,mn,0.75,0.5
14,mn,0.7,0.5
13,mn,0.65,0.5
12,mn,0.6,0.5
11,mn,0.55,0.5


In [100]:
# Finer sweep on the CV set: 50 thresholds from 0 up to (not including) 0.05.
# Rows are ranked by F1 score, with ties broken by the larger threshold first.
results = model_printer(np.arange(0, 0.05, 0.05/50), cv, cv_labels, mu, sigma)
res = pd.DataFrame(results, columns = ["model", "threshold", "f1_score"]).sort_values(["f1_score", "threshold"], ascending = [False, False])
res

Unnamed: 0,model,threshold,f1_score
24,mn,0.024,1.0
23,mn,0.023,1.0
22,mn,0.022,1.0
21,mn,0.021,1.0
20,mn,0.02,1.0
19,mn,0.019,1.0
18,mn,0.018,1.0
17,mn,0.017,1.0
16,mn,0.016,1.0
15,mn,0.015,1.0


Nos quedaremos con los thresholds que obtuvieron el mejor F1 (1.0) en el conjunto de CV para evaluarlos en el conjunto de test.

In [109]:
# Keep only the thresholds that reached a perfect F1 on the CV set and
# re-score them on the held-out test set.
thresholds = res.loc[res["f1_score"] == 1, "threshold"].tolist()
results = model_printer(thresholds, test, test_labels, mu, sigma)
(
    pd.DataFrame(results, columns=["model", "threshold", "f1_score"])
    .sort_values(["f1_score", "threshold"], ascending=[False, False])
)

Unnamed: 0,model,threshold,f1_score
0,mn,0.024,1.0
1,mn,0.023,1.0
2,mn,0.022,1.0
3,mn,0.021,1.0
4,mn,0.02,1.0
5,mn,0.019,1.0
6,mn,0.018,1.0
7,mn,0.017,1.0
8,mn,0.016,1.0
9,mn,0.015,1.0


El umbral más restrictivo es de 0.001 y el menos restrictivo es de 0.024. Cualquiera de ellos podría elegirse según la necesidad del negocio, dependiendo de si es más importante reducir los falsos positivos o los falsos negativos.

Pero vale la pena recalcar que, idealmente, se necesitan muchos más valores de CV y de testing para validar de mejor manera la hipótesis.