In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cvxpy as cp
import seaborn as sns
import emm
import warnings
warnings.filterwarnings('ignore')

# Plot styles
plt.style.use('seaborn-notebook')

import matplotlib.pyplot as plt

SMALL_SIZE = 12
MEDIUM_SIZE = 16
BIGGER_SIZE = 22

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [2]:
# Generate example data
m = 5000
# Target distribution
mu0 = np.array([115]); sig0 = np.array([10])
mu1 = np.array([135]); sig1 = np.array([15])
X0 = np.random.normal(loc=mu0, scale=sig0, size=m // 2)
X1 = np.random.normal(loc=mu1, scale=sig1, size=m // 2)
y0 = np.zeros(m // 2)
y1 = np.ones(m // 2)
X = np.concatenate([X0,X1])
y = np.concatenate([y0,y1])

target = pd.DataFrame({'feature' : X})
target['Outcome'] = y

# Corpus distribution
mu = np.array([125]); sig = np.array([15])
corpus = np.random.normal(loc=mu, scale=sig, size=m)
corpus = pd.DataFrame({'feature': corpus})

In [27]:
histLoss0 = emm.losses.CorpusKLLoss(mean=mu0[0], std=sig0[0])
histLoss1 = emm.losses.CorpusKLLoss(mean=mu1[0], std=sig1[0])
lam = 0.1
margsKL = {
            0 : {
                     'feature' : {'fun' : [histLoss0.fun], 'loss' : [histLoss0]},
                 'regularizer' : {'reg' : emm.regularizers.EntropyRegularizer(),
                                  'lam' : lam}
                },
            1 : {
                     'feature' : {'fun' : [histLoss1.fun], 'loss' : [histLoss1]},
                 'regularizer' : {'reg' : emm.regularizers.EntropyRegularizer(),
                                  'lam' : lam}
                }
          }
                              
margsLS = {
            0 : {
                     'feature' : {'fun' : ['mean'], 'loss' : [emm.losses.LeastSquaresLoss(mu0[0])]},
                 'regularizer' : {'reg' : emm.regularizers.EntropyRegularizer(),
                                  'lam' : lam}
                                  
                },
            1 : {
                     'feature' :  {'fun' : ['mean'], 'loss' : [emm.losses.LeastSquaresLoss(mu1[0])]},
                 'regularizer' : {'reg' : emm.regularizers.EntropyRegularizer(),
                                  'lam' : lam}
                }
           }
    
margsEq = {
            0 : {
                     'feature' : {'fun' : ['mean'], 'loss' : [emm.losses.EqualityLoss(mu0[0])]},
                 'regularizer' : {'reg' : emm.regularizers.EntropyRegularizer(),
                                  'lam' : lam}
                },
            1 : {
                     'feature' :  {'fun' : ['mean'], 'loss' : [emm.losses.EqualityLoss(mu1[0])]},
                 'regularizer' : {'reg' : emm.regularizers.EntropyRegularizer(),
                                  'lam' : lam}
                }
           }
    

    
margs = [margsKL, margsLS, margsEq]

from sklearn.linear_model import LogisticRegression
param_grid_log =  {"classifier" : [LogisticRegression()],
                 "classifier__C": np.logspace(-3,3,7), 
                   "classifier__penalty":["l1","l2"]}
from sklearn.tree import DecisionTreeClassifier
param_grid_tree =  {'classifier' : [DecisionTreeClassifier()],
     'classifier__max_depth' : [2,4,6,7,9]}
param_grid = [param_grid_log, param_grid_tree]
   
(rwcs, js, scores) = emm.metrics.multiple_models(target,corpus,margs, 
                                        param_grid, verbose=True)

CVX took 1.33032 seconds
CVX took 2.91266 seconds
Fitting 5 folds for each of 14 candidates, totalling 70 fits
Target data: the best parameters are given by 
 LogisticRegression(C=0.01)
 the best mean cross-validation accuracy 79.9 +/- 1.28792% on training dataset 

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Target data: the best parameters are given by 
 DecisionTreeClassifier(max_depth=4)
 the best mean cross-validation accuracy 80.725 +/- 1.44568% on training dataset 

CVX took 0.46269 seconds
CVX took 0.48979 seconds
Fitting 5 folds for each of 14 candidates, totalling 70 fits
Target data: the best parameters are given by 
 LogisticRegression(C=0.001)
 the best mean cross-validation accuracy 80.075 +/- 1.61941% on training dataset 

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Target data: the best parameters are given by 
 DecisionTreeClassifier(max_depth=2)
 the best mean cross-validation accuracy 79.65 +/- 1.11355% on training dataset 

CVX took 0

In [28]:
print(js)

[0.05629240634440292, 0.11763757497764345, 0.11765585987475284]


In [29]:
l = []
for d in scores:
    l += [d['accuracy_score']]
    
df = pd.DataFrame(l)
display(df)
display(df.RR/df.SR)

Unnamed: 0,RR,RS,SS,SR
0,0.8,0.762321,0.766241,0.798
1,0.787,0.789738,0.787675,0.79
2,0.793,0.744226,0.736209,0.791
3,0.807,0.733628,0.740189,0.804
4,0.802,0.756395,0.75784,0.798
5,0.789,0.7402,0.733047,0.784


0    1.002506
1    0.996203
2    1.002528
3    1.003731
4    1.005013
5    1.006378
dtype: float64

In [35]:
zip(['-18','18-30','30-40','40-60','60+'],[  0.05,   0.1,   0.25,   0.35,  0.25])

<zip at 0x26fed9e3900>