In [8]:
import pandas as pd
import pickle as pkl

from sklearn.linear_model import Lasso, LassoCV, LogisticRegressionCV, LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, KFold, cross_val_predict
from sklearn.metrics import r2_score, explained_variance_score, normalized_mutual_info_score, \
    mutual_info_score, mean_absolute_error, make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import IsolationForest

from skll.metrics import spearman, pearson 


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning so library chatter does not clutter the cell outputs.
# NOTE(review): this blanket filter also hides deprecation/convergence warnings.
warnings.filterwarnings('ignore')

from pylab import rcParams
# Default figure size for all plots in this notebook (width, height in inches).
rcParams['figure.figsize'] = (6, 6)

Custom model implementations and functions are stored in `src/dairyml.py`

In [9]:
from dairyml import PerfectClassifierMeanRegressor, plot_r2, BoundedLasso, BoundedLassoPlusLogReg, plot_coefficients, DummyRegressorCustom

## Import the Data
Load the data from the pickle file created in `preproccess.ipynb`

In [10]:
# Read the (X, Y) pair stored by the preprocessing notebook.
# NOTE(review): pickle can execute arbitrary code on load - only open trusted files.
with open("../pkl/data/data_outliers_removed", "rb") as pkl_file:
    X, Y = pkl.load(pkl_file)

## Baseline Models

We will use the below splitter for cross-validation: 10 folds, with shuffling

In [12]:
# Cross-validation splitter passed to the cross_validate calls below:
# 10 shuffled folds, seeded so the fold assignment is repeatable.
splitter = KFold(
    n_splits=10,
    shuffle=True,
    random_state=7,
)

Choose statistics used to evaluate the models

In [14]:
# Metrics reported for each model, keyed by the column name used in the results table.
# Every metric function is wrapped with make_scorer so cross_validate can apply it.
# NOTE(review): mutual_info_score is documented for discrete labelings - confirm it
# is meaningful for the lactose targets used here.
# NOTE(review): make_scorer defaults to greater_is_better=True, so MAE is reported
# unsigned; fine for reporting, but do not reuse this scorer for model selection.
scoring = {
    label: make_scorer(metric_fn)
    for label, metric_fn in [
        ('r2', r2_score),
        ('SRC', spearman),
        ('PCC', pearson),
        ('MI', mutual_info_score),
        ('MAE', mean_absolute_error),
    ]
}

Load the existing results table from disk (or create an empty one if none exists yet) to store the evaluation results

In [16]:
# Reuse the results table written by an earlier run when it exists;
# otherwise start an empty table with one column per metric in `scoring`.
try:
    overall_results = pd.read_csv('../reports/model_results.csv', index_col=0)
except FileNotFoundError:
    overall_results = pd.DataFrame(columns=list(scoring))

### Mean

The prediction for each food is the mean of the training lactose values

> The first will just assume that the lactose content in any sample is the mean (or median if you have wild outliers)

In [18]:
# Baseline: DummyRegressor with the 'mean' strategy, cross-validated on the shared splitter.
dummy_mean = DummyRegressor(strategy='mean')
dummy_mean_raw_results = cross_validate(
    dummy_mean, X, Y, cv=splitter, scoring=scoring
)
dummy_mean_scores = {}  # not read by any visible cell

# Average each metric over the folds and store it, rounded to 2 decimals.
for metric in scoring:
    fold_scores = dummy_mean_raw_results['test_' + metric]
    overall_results.loc['Dummy Mean', metric] = np.round(np.mean(fold_scores), 2)
overall_results

Unnamed: 0,r2,SRC,PCC,MI,MAE
Dummy Mean,-0.02,0,0,0,1.94


### Median of all values

The prediction for each food is the median of the training lactose values
> The first will just assume that the lactose content in any sample is the mean (or median if you have wild outliers)

In [20]:
# Baseline: DummyRegressor with the 'median' strategy, cross-validated on the shared splitter.
dummy_median_all = DummyRegressor(strategy='median')
dummy_median_all_raw_results = cross_validate(
    dummy_median_all, X, Y, cv=splitter, scoring=scoring
)
dummy_median_all_scores = {}  # not read by any visible cell

# Average each metric over the folds and store it, rounded to 2 decimals.
for metric in scoring:
    fold_scores = dummy_median_all_raw_results['test_' + metric]
    overall_results.loc['Dummy Median All', metric] = np.round(np.mean(fold_scores), 2)
overall_results

Unnamed: 0,r2,SRC,PCC,MI,MAE
Dummy Mean,-0.02,0,0,0,1.94
Dummy Median All,-0.32,0,0,0,1.68


### Median of nonzero values

The prediction for each food is the median of the _non-zero_ training lactose values
> the first says all have lactose with the median value of the lactose foods as the prediction for all samples

In [21]:
# Baseline: custom dummy regressor using the 'median_nonzero' strategy.
# NOTE(review): `constant` is computed from all of Y, not per training fold -
# confirm DummyRegressorCustom recomputes it in fit(); otherwise test-fold
# targets leak into the prediction.
dummy_median_nonzero = DummyRegressorCustom(
    strategy='median_nonzero',
    constant=np.median(Y[Y > 0]),
)
dummy_median_nonzero_raw_results = cross_validate(
    dummy_median_nonzero, X, Y, cv=splitter, scoring=scoring
)
dummy_median_nonzero_scores = {}  # not read by any visible cell

# Average each metric over the folds and store it, rounded to 2 decimals.
for metric in scoring:
    fold_scores = dummy_median_nonzero_raw_results['test_' + metric]
    overall_results.loc['Dummy Median Nonzero', metric] = np.round(np.mean(fold_scores), 2)
overall_results

Unnamed: 0,r2,SRC,PCC,MI,MAE
Dummy Mean,-0.02,0,0,0,1.94
Dummy Median All,-0.32,0,0,0,1.68
Dummy Median Nonzero,-0.08,0,0,0,1.77


### Perfect classifier plus mean regressor
- This classifier predicts 0 where the true value is 0, or the mean of the training lactose values where the true value is nonzero

- See dairyml.py for implementation

(This class is not compliant with sklearn conventions; it was the easiest implementation I could think of that includes perfect classification.)

In [22]:
# Plain metric functions (not wrapped in make_scorer) handed to
# PerfectClassifierMeanRegressor.cross_val; the keys mirror those of `scoring`.
scoring_pcmr = dict(
    r2=r2_score,
    SRC=spearman,
    PCC=pearson,
    MI=mutual_info_score,
    MAE=mean_absolute_error,
)

In [23]:
# Baseline: perfect zero/nonzero classification combined with a mean regressor.
# The class runs its own cross-validation on the data passed to fit().
pcmr = PerfectClassifierMeanRegressor()
pcmr.fit(X, Y)
dummy_pcmr_raw_results = pcmr.cross_val(scoring=scoring_pcmr)
dummy_median_scores = {}  # unused copy-paste leftover; name kept in case other cells read it

# Iterate the metric names that were actually passed to cross_val (scoring_pcmr),
# rather than relying on `scoring` happening to share the same keys.
for score_name in scoring_pcmr:
    fold_scores = dummy_pcmr_raw_results[score_name]
    overall_results.loc['Perfect Clasif., Mean Regr.', score_name] = np.round(np.mean(fold_scores), 2)
overall_results

Unnamed: 0,r2,SRC,PCC,MI,MAE
Dummy Mean,-0.02,0.0,-0.0,-0.0,1.94
Dummy Median All,-0.32,0.0,-0.0,-0.0,1.68
Dummy Median Nonzero,-0.08,0.0,-0.0,-0.0,1.77
"Perfect Clasif., Mean Regr.",0.13,0.73,0.41,0.53,1.53


### Save results to CSV

In [25]:
# Write the results table to the same CSV path that is read when the table is loaded.
overall_results.to_csv(path_or_buf='../reports/model_results.csv')