# Baseline Models and Classifiers

This notebook adapts the feature engineering from the original paper to our windowed approach.  We use linear & other
 simple regression models here to serve as a baseline for the deep learning models.

In [54]:
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet, LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score

# change working directory to base, to make all imports and file paths work
import os
#os.chdir(os.pardir)
os.chdir(r"C:\Users\PC-1\Documents\GitHub\Projects\battery_island")
print("Current directory: %s" % os.getcwd())

Current directory: C:\Users\PC-1\Documents\GitHub\Projects\battery_island


In [55]:
df = pd.read_csv("./data/processed/rebuild_windowed_features_50_5_1.csv")

In [56]:
df.columns.values

array(['cell_key', 'cell_batch', 'cell_num', 'minimum_dQ_window',
       'variance_dQ_window', 'skewness_dQ_window', 'kurtosis_dQ_window',
       'slope_lin_fit_window', 'intercept_lin_fit_window',
       'discharge_capacity_1', 'diff_discharge_capacity_max_1',
       'mean_discharge_time', 'minimum_IR_window', 'diff_IR_window',
       'target_remaining', 'target_current', 'target_classifier'],
      dtype=object)

In [57]:
print(df["target_classifier"].value_counts())

0.0    4148
1.0    2326
Name: target_classifier, dtype: int64


In [58]:
df.describe()

Unnamed: 0,cell_batch,cell_num,minimum_dQ_window,variance_dQ_window,skewness_dQ_window,kurtosis_dQ_window,slope_lin_fit_window,intercept_lin_fit_window,discharge_capacity_1,diff_discharge_capacity_max_1,mean_discharge_time,minimum_IR_window,diff_IR_window,target_remaining,target_current,target_classifier
count,6474.0,6474.0,6474.0,6474.0,6474.0,6474.0,6474.0,6474.0,6474.0,6474.0,6474.0,6474.0,6474.0,6474.0,6474.0,6474.0
mean,1.0,21.669601,-4.033639,-9.876923,-0.426947,-0.406053,-0.000205,1.053025,1.052759,0.0027,13.735394,0.016433,0.000147,419.687519,465.78962,0.359283
std,0.0,13.910576,1.233619,1.36131,1.010082,1.128265,0.000287,0.034846,0.035436,0.064285,0.778931,0.002014,0.000592,267.329307,267.28278,0.479827
min,1.0,0.0,-15.232677,-13.121696,-8.627348,-7.86258,-0.003821,0.922702,0.922034,0.0,11.05912,0.0,-0.004035,2.0,50.0,0.0
25%,1.0,7.0,-4.395173,-10.836907,-0.853847,-0.995698,-0.000229,1.042337,1.042055,0.0,13.487345,0.016255,-2.5e-05,199.0,245.0,0.0
50%,1.0,23.0,-3.796769,-9.986102,-0.154603,-0.465569,-7.7e-05,1.064146,1.064172,0.0,13.97464,0.016636,8.2e-05,396.0,441.0,0.0
75%,1.0,33.0,-3.289231,-8.909548,0.184834,0.16471,-4.3e-05,1.076654,1.076539,0.000205,14.282609,0.01699,0.000222,602.0,650.0,1.0
max,1.0,45.0,-0.639437,-5.993006,1.246655,3.428991,0.002369,1.197957,1.539054,1.817914,14.772992,0.019886,0.017966,1176.0,1221.0,1.0


In [59]:
df.head()

Unnamed: 0,cell_key,cell_batch,cell_num,minimum_dQ_window,variance_dQ_window,skewness_dQ_window,kurtosis_dQ_window,slope_lin_fit_window,intercept_lin_fit_window,discharge_capacity_1,diff_discharge_capacity_max_1,mean_discharge_time,minimum_IR_window,diff_IR_window,target_remaining,target_current,target_classifier
0,b1c0,1,0,-6.581155,-11.155981,0.605746,1.831074,-0.0005664953,1.098772,1.070689,0.468365,14.437256,0.01656,-0.000113,1140.0,50.0,1.0
1,b1c0,1,0,-4.355252,-12.00152,0.064097,0.48557,-0.000821151,1.105443,1.073992,0.465063,14.443377,0.01656,8e-06,1135.0,55.0,1.0
2,b1c0,1,0,-0.639437,-7.331691,0.204566,-1.054242,-0.001063728,1.11156,1.539054,0.0,14.446563,0.016539,-0.002412,1130.0,60.0,1.0
3,b1c0,1,0,-4.803327,-12.056189,-0.225991,0.029918,1.280844e-05,1.076131,1.072405,0.004677,14.466869,0.016444,8e-06,1125.0,65.0,1.0
4,b1c0,1,0,-4.361737,-11.800815,-0.059717,-0.917141,-9.811717e-07,1.07655,1.076654,0.000429,14.466661,0.016444,2.9e-05,1120.0,70.0,1.0


# Preprocessing and feature selection

In [60]:
batch_1_2_keys = df['cell_key'][df['cell_batch']!=3].unique()
train_keys = batch_1_2_keys[1::2]
test_keys = batch_1_2_keys[0::2]
train_ind = df[df['cell_key'].isin(train_keys)].index
test_ind = df[df['cell_key'].isin(test_keys)].index
secondary_test_ind = df[df['cell_batch']==3].index

splits = [train_ind, test_ind, secondary_test_ind]
# secondary_test_ind currently empty, need to fix this code.
splits.pop()

Int64Index([], dtype='int64')

In [61]:
# Define feature and target columns for regression models

varmod_features = ["variance_dQ_window"]
dismod_features = [
    "variance_dQ_window",
    "minimum_dQ_window",
    "skewness_dQ_window",
    "kurtosis_dQ_window",
    "discharge_capacity_1",
    "diff_discharge_capacity_max_1",
]
fullmod_features = [
    "minimum_dQ_window",
    "variance_dQ_window",
    "slope_lin_fit_window",
    "intercept_lin_fit_window",
    "discharge_capacity_1",
    "mean_discharge_time",
    "minimum_IR_window",
    "diff_IR_window",
]
targetmod = ["target_remaining"]  # , "target_current"

# Define feature and target columns for classifiers

varclf_features = ["variance_dQ_window"]
fullclf_features = [
    "minimum_dQ_window",
    "variance_dQ_window",
    "discharge_capacity_1",
    "diff_IR_window",
]
targetclf = ["target_classifier"]

In [62]:
def get_split(data, features, target, split):
    X = data.iloc[split,:].loc[:,features]
    y = data.iloc[split,:].loc[:,target]
    return X, y

def eval_model(model, data, features, target, splits):
    ''' TODO: Redo splits allocations. Last index set is empty and don't trust the way in which sets were split, not
    very scientific. '''
    mse = list()
    mae = list()
    mpe = list()
    for split in splits:
        #print(split.astype(str))
        print(split)
        X, y = get_split(data, features, target, split)
        pred = model.predict(X)
        mse.append(mean_squared_error(pred, y))
        mae.append(float(np.mean(np.abs(y-pred.reshape(-1,1)))))
        mpe.append(float(np.mean(np.abs((y - pred.reshape(-1,1))) / y * 100)))
    return mse, mae, mpe

def eval_classifier(model, data, features, target, splits):
    acc = list()    
    for split in splits:
        X, y = get_split(data, features, target, split)
        pred = model.predict(X)
        acc.append(accuracy_score(pred, y.values.ravel()))
    return acc

# Variance Model

In [63]:
# Train Elastic net
x_train, y_train = get_split(df, varmod_features, targetmod, train_ind)

alphas = np.linspace(0.0001,1,30)
parameters = {
    "alpha": alphas,
    "l1_ratio": [0.01, 0.25, 0.5, 0.75, 1.]
}
enet = ElasticNet(random_state=54)
regr = GridSearchCV(enet, parameters, cv=4)
print("Elastic Net: %s" % regr.fit(x_train, y_train).score(x_train, y_train))

"""
Because an elastic net with alpha = 0 is technically a linear regression
and elastic net produces inaccuracies with a small alpha,
we also train a linear regression model.
Linear regression performs slighty better at RMSE,
Elastic net performs slightly better at MPE.
We decide to take the linear regression scores.
"""
lin_reg = LinearRegression()
print("Linear Regression: %s" % lin_reg.fit(x_train, y_train).score(x_train, y_train))

varmod_mse, varmod_mae, varmod_mpe = eval_model(lin_reg, df, varmod_features, targetmod, splits)


# Add Random Forest
rf_params = {
    "max_depth": [2, 3],
    "n_estimators": [10, 100]
}
rfst = RandomForestRegressor(random_state=54)
rfst_grid = GridSearchCV(rfst, rf_params, cv=4)
print("Random Forest: %s" % rfst_grid.fit(x_train, y_train).score(x_train, y_train))

varmod_rf_mse, varmod_rf_mae, varmod_rf_mpe = eval_model(rfst_grid, df, varmod_features, targetmod, splits)
print('varmod_rf_mse', varmod_rf_mse)
print('varmod_rf_mae', varmod_rf_mae)
print('varmod_rf_mpe', varmod_mpe)

Elastic Net: 0.31910735222047126
Linear Regression: 0.3191073522205553
Int64Index([ 228,  229,  230,  231,  232,  233,  234,  235,  236,  237,
            ...
            6354, 6355, 6356, 6357, 6358, 6359, 6360, 6361, 6362, 6363],
           dtype='int64', length=3123)
Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            6464, 6465, 6466, 6467, 6468, 6469, 6470, 6471, 6472, 6473],
           dtype='int64', length=3351)


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


Random Forest: 0.36307435911524233
Int64Index([ 228,  229,  230,  231,  232,  233,  234,  235,  236,  237,
            ...
            6354, 6355, 6356, 6357, 6358, 6359, 6360, 6361, 6362, 6363],
           dtype='int64', length=3123)
Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            6464, 6465, 6466, 6467, 6468, 6469, 6470, 6471, 6472, 6473],
           dtype='int64', length=3351)
varmod_rf_mse [43386.12771826679, 52694.903597590775]
varmod_rf_mae [166.87005638157137, 183.03692475141034]
varmod_rf_mpe [134.38089696330604, 153.1970658424267]


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


# Discharge Model

In [64]:
# Train Elastic net
x_train, y_train = get_split(df, dismod_features, targetmod, train_ind)

alphas = np.linspace(0.1,1,20)
parameters = {
    "alpha": alphas,
    "l1_ratio": [0.01, 0.25, 0.5, 0.75, 1.]
}
enet = ElasticNet(random_state=54)
regr = GridSearchCV(enet, parameters, cv=4)
print("Elastic Net: %s" % regr.fit(x_train, y_train).score(x_train, y_train))

dismod_mse, dismod_mae, dismod_mpe = eval_model(regr, df, dismod_features, targetmod, splits)

Elastic Net: 0.623430876917282
Int64Index([ 228,  229,  230,  231,  232,  233,  234,  235,  236,  237,
            ...
            6354, 6355, 6356, 6357, 6358, 6359, 6360, 6361, 6362, 6363],
           dtype='int64', length=3123)
Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            6464, 6465, 6466, 6467, 6468, 6469, 6470, 6471, 6472, 6473],
           dtype='int64', length=3351)


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


# Full Model

In [65]:
# Train Elastic net model
# raising the alpha minimum to 0.59 silences the convergence warnings,
# but decreases the score significantly - what's wrong here? 

x_train, y_train = get_split(df, fullmod_features, targetmod, train_ind)

alphas = np.linspace(0.001,1,20)
parameters = {
    "alpha": alphas,
    "l1_ratio": [0.001, 0.75, 1.]
}
enet = ElasticNet(random_state=54)
regr = GridSearchCV(enet, parameters, cv=4)
print("Elastic Net: %s" % regr.fit(x_train, y_train).score(x_train, y_train))

fullmod_mse, fullmod_mae, fullmod_mpe = eval_model(regr, df, fullmod_features, targetmod, splits)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Elastic Net: 0.7520921839465965
Int64Index([ 228,  229,  230,  231,  232,  233,  234,  235,  236,  237,
            ...
            6354, 6355, 6356, 6357, 6358, 6359, 6360, 6361, 6362, 6363],
           dtype='int64', length=3123)
Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            6464, 6465, 6466, 6467, 6468, 6469, 6470, 6471, 6472, 6473],
           dtype='int64', length=3351)


  model = cd_fast.enet_coordinate_descent(
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


# Evaluate all linear regression models

In [66]:
pd.DataFrame({"Model":["Variance model", "Discharge model", "Full model"],
              "MAE - Train": [varmod_mae[0],dismod_mae[0],fullmod_mae[0]],
              "MAE - Primary test": [varmod_mae[1],dismod_mae[1],fullmod_mae[1]],
              #"MAE - Secondary test": [varmod_mae[2],dismod_mae[2],fullmod_mae[2]],
              "MSE - Train": [varmod_mse[0],dismod_mse[0],fullmod_mse[0]],
              "MSE - Primary test": [varmod_mse[1],dismod_mse[1],fullmod_mse[1]],
              #"MSE - Secondary test": [varmod_mse[2],dismod_mse[2],fullmod_mse[2]],
              "MPE - Train": [varmod_mpe[0],dismod_mpe[0],fullmod_mpe[0]],
              "MPE - Primary test": [varmod_mpe[1],dismod_mpe[1],fullmod_mpe[1]],
              #"MPE - Secondary test": [varmod_mpe[2],dismod_mpe[2],fullmod_mpe[2]]
             })

Unnamed: 0,Model,MAE - Train,MAE - Primary test,MSE - Train,MSE - Primary test,MPE - Train,MPE - Primary test
0,Variance model,176.059119,192.360618,46381.074152,56627.238495,134.380897,153.197066
1,Discharge model,120.746249,137.240702,25651.151438,33066.161719,103.098096,118.896832
2,Full model,94.570483,113.500848,16886.995089,24651.316252,113.58626,133.179104


# Variance Classifier

In [67]:
# Train Logistic Regression
x_train, y_train = get_split(df, varclf_features, targetclf, train_ind)

parameters = {"C": [0.01,0.1,0.5,0.75,1]}

logreg = LogisticRegression(solver="liblinear", random_state=54)
clf = GridSearchCV(logreg, parameters, cv=4)
print("Logreg: %s" % clf.fit(x_train, y_train.values.ravel()).score(x_train, y_train.values.ravel()))

varclf_acc = eval_classifier(clf, df, varclf_features, targetclf, splits)

Logreg: 0.7431956452129362


# Full Classifier

In [68]:
# Train Logistic Regression
# Why is the full classifier worse than the variance classifier?
x_train, y_train = get_split(df, fullclf_features, targetclf, train_ind)

parameters = {"C": [0.01,0.1,0.5,0.75,1]}

logreg = LogisticRegression(solver="liblinear", random_state=54)
clf = GridSearchCV(logreg, parameters, cv=4)
print("Logreg: %s" % clf.fit(x_train, y_train.values.ravel()).score(x_train, y_train.values.ravel()))

fullclf_acc = eval_classifier(clf, df, fullclf_features, targetclf, splits)

Logreg: 0.7454370797310279


# Evaluate all classifiers

In [69]:
pd.DataFrame({"Classifier":["Variance classifier", "Full classifier"],
              "Acc - Train": [varclf_acc[0],fullclf_acc[0]],
              "Acc - Primary test": [varclf_acc[1],fullclf_acc[1]],})
              #"Acc - Secondary test": [varclf_acc[2],fullclf_acc[2]]})

Unnamed: 0,Classifier,Acc - Train,Acc - Primary test
0,Variance classifier,0.743196,0.734706
1,Full classifier,0.745437,0.741868
