In [1]:
import pandas as pd
import pickle as pkl

from sklearn.linear_model import Lasso, LassoCV, LogisticRegressionCV, LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, KFold, cross_val_predict
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error, make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import IsolationForest
from mutual_info import mutual_info_regression

from skll.metrics import spearman, pearson 


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning so the cell outputs stay readable.
# NOTE(review): this also hides deprecation and convergence warnings — consider narrowing the filter.
warnings.filterwarnings(action='ignore')
from pylab import rcParams
# Default figure size: a 6 x 6 square.
rcParams['figure.figsize'] = (6, 6)

Custom model implementations and functions are stored in `src/dairyml.py`

In [2]:
from dairyml import *

## Import the Data
Load the data from the pickle files created in `preproccess.ipynb`

In [3]:
# Load the features X and the target Y that are stored together in one pickle.
# NOTE(review): pickle.load can execute arbitrary code — only load files produced by this project.
with open("../pkl/data/data_outliers_removed", "rb") as data_file:
    X, Y = pkl.load(data_file)

In [4]:
# Target values as a single-column 2-D array (one row per sample); this is the
# form handed to cross_validate in the baseline cells.
Y_reshaped = Y.values.reshape(-1, 1)

## Baseline Models

The object below splits the data into 10 folds with shuffling. The random state is fixed for repeatability.

In [5]:
# Shared 10-fold splitter; shuffling uses a fixed seed so every model is
# evaluated on the same folds from run to run.
splitter = KFold(
    n_splits=10,
    shuffle=True,
    random_state=7,
)

Choose statistics used to evaluate the models
- R^2
- SRC = Spearman Rank Correlation
- PCC = Pearson Correlation
- MI = Mutual information
- MAE = Mean absolute error

### Choose scoring measures for cross-validation

In [6]:
from scoring import *

Load the results DataFrame that stores the testing results from CSV; if the file doesn't exist yet, create an empty DataFrame instead.

In [7]:
# Reuse the results table written by an earlier run when it exists;
# otherwise start from an empty DataFrame.
results_csv = '../reports/cv_results.csv'
try:
    overall_results = pd.read_csv(results_csv, index_col=0)
except FileNotFoundError:
    overall_results = pd.DataFrame()

### Mean

The prediction for each food is the mean of the training lactose values

In [8]:
# Baseline that always predicts the mean of the targets it was fitted on.
dummy_mean = DummyRegressor(strategy='mean')

# Evaluate the baseline on each fold of `splitter` with the metrics in `scoring`.
dummy_mean_raw_results = cross_validate(
    dummy_mean, X, Y_reshaped,
    cv=splitter,
    scoring=scoring,
)

# Record the fold scores in the results table under the label 'Dummy Mean'
# (scores_to_df is a project helper).
overall_results = scores_to_df(
    overall_results, 'Dummy Mean', dummy_mean_raw_results, refit=None
)

In [9]:
# cross_validate only fits clones of the estimator, so `dummy_mean` itself is
# still unfitted at this point. Fit it on the full dataset so the pickled
# baseline can make predictions as soon as it is loaded.
dummy_mean.fit(X, Y_reshaped)

# Persist the fitted baseline.
with open("../models/baseline_mean.model", "wb") as f:
    pkl.dump(dummy_mean, f)

### Median of all values

The prediction for each food is the median of the training lactose values

In [10]:
# Baseline that always predicts the median of the targets it was fitted on.
dummy_median_all = DummyRegressor(strategy='median')

# Evaluate the baseline on each fold of `splitter` with the metrics in `scoring`.
dummy_median_all_raw_results = cross_validate(
    dummy_median_all, X, Y_reshaped,
    cv=splitter,
    scoring=scoring,
)
dummy_median_all_scores = {}  # not used by any visible cell

# Record the fold scores in the results table under the label 'Dummy Median All'
# (scores_to_df is a project helper).
overall_results = scores_to_df(
    overall_results, 'Dummy Median All', dummy_median_all_raw_results, refit=None
)

In [11]:
# cross_validate only fits clones of the estimator, so `dummy_median_all`
# itself is still unfitted at this point. Fit it on the full dataset so the
# pickled baseline can make predictions as soon as it is loaded.
dummy_median_all.fit(X, Y_reshaped)

# Persist the fitted baseline.
with open("../models/baseline_median.model", "wb") as f:
    pkl.dump(dummy_median_all, f)

### Median of nonzero values

The prediction for each food is the median of the _non-zero_ training lactose values

In [12]:
# Median of the strictly positive target values, taken over the whole dataset.
# NOTE(review): this uses every sample, including those later held out in each
# CV fold — if DummyRegressorCustom predicts this constant as-is, the scores
# are not based on training values only. Confirm against dairyml.py.
nonzero_median = np.median(Y_reshaped[Y_reshaped > 0])
dummy_median_nonzero = DummyRegressorCustom(
    strategy='median_nonzero',
    constant=nonzero_median,
)

# Evaluate the baseline on each fold of `splitter` with the metrics in `scoring`.
dummy_median_nonzero_raw_results = cross_validate(
    dummy_median_nonzero, X, Y_reshaped,
    cv=splitter,
    scoring=scoring,
)
dummy_median_nonzero_scores = {}  # not used by any visible cell

# Record the fold scores in the results table under the label
# 'Dummy Median Nonzero' (scores_to_df is a project helper).
overall_results = scores_to_df(
    overall_results, 'Dummy Median Nonzero', dummy_median_nonzero_raw_results, refit=None
)

In [13]:
# cross_validate only fits clones of the estimator, so `dummy_median_nonzero`
# itself is still unfitted at this point. Fit it on the full dataset (the same
# inputs cross_validate slices its folds from) so the pickled baseline can make
# predictions as soon as it is loaded.
dummy_median_nonzero.fit(X, Y_reshaped)

# Persist the fitted baseline.
with open("../models/baseline_median_nonzero.model", "wb") as f:
    pkl.dump(dummy_median_nonzero, f)

### Perfect classifier plus mean regressor
- This classifier predicts 0 where the true value is 0, or the mean of the training lactose values where the true value is nonzero
- The classification into 0 or nonzero is perfect (taken from the testing data)

- See dairyml.py for implementation

(This class is not compliant with sklearn conventions; it was the easiest implementation I could think of that includes perfect classification.)

In [14]:
# Metric label -> metric function, passed to PerfectClassifierMeanRegressor.cross_val.
# These are plain metric callables rather than sklearn scorer objects.
scoring_pcmr = {
    'r2': r2_score,
    'SRC': spearman,
    'PCC': pearson,
    # both inputs are reshaped to column vectors before the mutual-information call
    'MI': lambda x, y: mutual_info_regression(x.reshape(-1, 1), y.reshape(-1, 1)),
    'MAE': mean_absolute_error,
}

In [15]:
# Fit the model on the full dataset, then run its own cross-validation routine
# with the plain metric functions in `scoring_pcmr`.
pcmr = PerfectClassifierMeanRegressor()
pcmr.fit(X, Y)
dummy_pcmr_raw_results = pcmr.cross_val(scoring=scoring_pcmr)
dummy_median_scores = {}  # not used by any visible cell

# Record the scores in the results table under the label
# 'Perfect Clasif., Mean Regr.' (scores_to_df is a project helper).
overall_results = scores_to_df(
    overall_results,
    'Perfect Clasif., Mean Regr.',
    dummy_pcmr_raw_results,
    refit=None,
)

In [16]:
# Persist the model; it was fitted on the full dataset in the previous cell.
with open("../models/baseline_pcmr.model", "wb") as model_file:
    pkl.dump(pcmr, model_file)

### Save results to CSV

In [17]:
# Write the results table back to the same CSV it was loaded from, so later
# runs pick it up.
overall_results.to_csv(path_or_buf='../reports/cv_results.csv')