# Linear Models and Classifiers

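This notebook attempts to reproduce the linear regression and classification results reported in the original paper, following the published description of the features, data splits and models as closely as possible.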

In [12]:
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet, LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score

In [13]:
# Load the feature table from the external data directory.
df = pd.read_csv("../data/external/rebuild_features.csv")

In [14]:
# Show the column names of the loaded table.
df.columns.values

array(['cell_key', 'minimum_dQ_100_10', 'variance_dQ_100_10',
       'skewness_dQ_100_10', 'kurtosis_dQ_100_10', 'slope_lin_fit_2_100',
       'intercept_lin_fit_2_100', 'discharge_capacity_2',
       'diff_discharge_capacity_max_2', 'mean_charge_time_2_6',
       'minimum_IR_2_100', 'diff_IR_100_2', 'minimum_dQ_5_4',
       'variance_dQ_5_4', 'cycle_life', 'cycle_550_clf'], dtype=object)

In [15]:
# Summary statistics of the table's columns.
df.describe()

Unnamed: 0,minimum_dQ_100_10,variance_dQ_100_10,skewness_dQ_100_10,kurtosis_dQ_100_10,slope_lin_fit_2_100,intercept_lin_fit_2_100,discharge_capacity_2,diff_discharge_capacity_max_2,mean_charge_time_2_6,minimum_IR_2_100,diff_IR_100_2,minimum_dQ_5_4,variance_dQ_5_4,cycle_life,cycle_550_clf
count,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0
mean,-3.309622,-8.856155,-1.798408,0.181338,-3.3e-05,1.07561,1.071192,0.023001,10.432973,0.014917,-0.000179,-8.054585,-14.490349,798.387097,0.653226
std,0.505783,0.937557,1.113668,0.209144,0.000116,0.009705,0.009061,0.16776,0.744873,0.004534,0.000436,2.131843,2.014785,372.742979,0.477874
min,-5.142544,-11.825562,-7.669214,-0.651928,-0.001092,1.049389,1.042137,0.000459,8.964706,0.0,-0.003992,-14.112938,-18.273646,148.0,0.0
25%,-3.615959,-9.463472,-2.307431,0.121294,-2e-05,1.070539,1.066903,0.003423,10.04377,0.015301,-0.00023,-9.494888,-15.940693,498.75,0.0
50%,-3.233402,-8.710854,-1.537733,0.207657,-6e-06,1.076238,1.071413,0.004417,10.130237,0.01621,-7.5e-05,-8.177757,-14.72899,736.5,1.0
75%,-2.950006,-8.218369,-1.134104,0.247337,5e-06,1.082333,1.077769,0.005953,10.329087,0.016868,-3e-06,-6.204012,-12.952985,946.5,1.0
max,-1.984958,-6.296653,0.648827,1.474969,3.5e-05,1.101465,1.094639,1.817914,13.40915,0.020022,0.000438,-4.260407,-9.40195,2237.0,1.0


# Preprocessing and feature selection

In [16]:
# Count the rows whose cell_key has "1", "2" or "3" as its second character
# (presumably the batch number -- confirm against the key format).
numBat1 = sum(1 for key in df.cell_key if key[1] == "1")
numBat2 = sum(1 for key in df.cell_key if key[1] == "2")
numBat3 = sum(1 for key in df.cell_key if key[1] == "3")
numBat = numBat1 + numBat2 + numBat3

In [17]:
# Positional row indices of the three evaluation sets.
# Primary test: every even position among the first numBat1 + numBat2 rows,
# plus position 83 (NOTE(review): reason for the hard-coded 83 is not visible
# here -- confirm against the reference implementation).
test_ind = np.append(np.arange(0, numBat1 + numBat2, 2), 83)
# Training: the odd positions in the same range, stopping one row early.
train_ind = np.arange(1, numBat1 + numBat2 - 1, 2)
# Secondary test: the last numBat3 rows.
secondary_test_ind = np.arange(numBat - numBat3, numBat)

# Order matters: downstream tables read index 0 as train, 1 as primary test
# and 2 as secondary test.
splits = [train_ind, test_ind, secondary_test_ind]

In [18]:
# ---- Regression models: predictor columns and target ----

# Variance model: a single predictor.
varmod_features = ["variance_dQ_100_10"]

# Discharge model: the variance predictor plus further dQ statistics and
# discharge-capacity columns.
dismod_features = [
    *varmod_features,
    "minimum_dQ_100_10",
    "skewness_dQ_100_10",
    "kurtosis_dQ_100_10",
    "discharge_capacity_2",
    "diff_discharge_capacity_max_2",
]

# Full model.
fullmod_features = [
    "minimum_dQ_100_10", "variance_dQ_100_10",
    "slope_lin_fit_2_100", "intercept_lin_fit_2_100",
    "discharge_capacity_2", "mean_charge_time_2_6",
    "minimum_IR_2_100", "diff_IR_100_2",
]

# Regression target, kept as a one-element list so selections stay 2-D.
targetmod = ["cycle_life"]

# ---- Classifiers: predictor columns and target ----

# Variance classifier: a single predictor.
varclf_features = ["variance_dQ_5_4"]

# Full classifier.
fullclf_features = [
    "minimum_dQ_5_4",
    *varclf_features,
    "discharge_capacity_2",
    "diff_IR_100_2",
]

# Classification target, also a one-element list.
targetclf = ["cycle_550_clf"]

In [19]:
def get_split(data, features, target, split):
    """Return the predictor and target columns for one set of rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both predictor and target columns.
    features : list of str
        Column labels used as predictors.
    target : list of str
        Column label(s) of the target.
    split : array-like of int
        Positional row indices (as understood by ``DataFrame.iloc``).

    Returns
    -------
    tuple
        ``(X, y)`` -- the selected rows restricted to ``features`` and to
        ``target`` respectively.
    """
    # Positional row selection first, then label-based column selection.
    rows = data.iloc[split, :]
    return rows.loc[:, features], rows.loc[:, target]

def eval_model(model, data, features, target, split):
    """Score a fitted regressor on every index set passed in ``split``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    data : pandas.DataFrame
        Table holding both predictor and target columns.
    features : list of str
        Predictor column labels.
    target : list of str
        Target column label(s).
    split : iterable of array-like of int
        One array of positional row indices per evaluation set. The name is
        singular for backward compatibility; it holds *all* sets to score.

    Returns
    -------
    tuple of (list of float, list of float)
        ``(rmse, mpe)`` with one entry per index set, in input order: the
        root-mean-squared error and the mean absolute percentage error.
    """
    rmse = []
    mpe = []
    # Iterate over the argument itself. The previous version looped over the
    # notebook-level ``splits`` variable and silently ignored what was passed.
    for indices in split:
        X, y = get_split(data, features, target, indices)
        # Work on flat float arrays so both metrics see matching 1-D shapes,
        # regardless of whether the estimator returns (n,) or (n, 1), and so
        # no DataFrame has to be reduced through np.mean / float().
        y_true = np.asarray(y, dtype=float).ravel()
        y_pred = np.asarray(model.predict(X), dtype=float).ravel()
        rmse.append(float(np.sqrt(mean_squared_error(y_true, y_pred))))
        mpe.append(float(np.mean(np.abs(y_true - y_pred) / y_true * 100)))
    return rmse, mpe

def eval_classifier(model, data, features, target, splits):
    """Compute a fitted classifier's accuracy on each index set.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict(X)``.
    data : pandas.DataFrame
        Table holding both predictor and target columns.
    features : list of str
        Predictor column labels.
    target : list of str
        Target column label(s).
    splits : iterable of array-like of int
        One array of positional row indices per evaluation set.

    Returns
    -------
    list
        Accuracy for every entry of ``splits``, in input order.
    """
    def _accuracy(indices):
        # Labels are flattened to 1-D before being compared with predictions.
        X, y = get_split(data, features, target, indices)
        return accuracy_score(model.predict(X), y.values.ravel())

    return [_accuracy(indices) for indices in splits]

# Variance Model

In [20]:
# Train Elastic net
x_train, y_train = get_split(df, varmod_features, targetmod, train_ind)

# Hyper-parameter grid searched with 4-fold cross-validation.
alphas = np.linspace(0.0001,1,30)
parameters = {
    "alpha": alphas,
    "l1_ratio": [0.01, 0.25, 0.5, 0.75, 1.]
}
enet = ElasticNet(random_state=54)
regr = GridSearchCV(enet, parameters, cv=4)
regr.fit(x_train, y_train)
print("Elastic Net: %s" % regr.score(x_train, y_train))

# An elastic net with alpha = 0 is technically a linear regression, and the
# elastic net produces inaccuracies with a small alpha, so a plain linear
# regression is trained as well.
# Linear regression performs slightly better at RMSE, the elastic net
# slightly better at MPE. The linear regression scores are the ones reported.
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
print("Linear Regression: %s" % lin_reg.score(x_train, y_train))

varmod_rmse, varmod_mpe = eval_model(lin_reg, df, varmod_features, targetmod, splits)

Elastic Net: 0.778771357057023
Linear Regression: 0.779697297534494


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


# Discharge Model

In [21]:
# Discharge model: elastic net tuned by 4-fold cross-validated grid search.
x_train, y_train = get_split(df, dismod_features, targetmod, train_ind)

alphas = np.linspace(0.1, 1, 20)
parameters = {"alpha": alphas, "l1_ratio": [0.01, 0.25, 0.5, 0.75, 1.]}

enet = ElasticNet(random_state=54)
regr = GridSearchCV(enet, parameters, cv=4)
regr.fit(x_train, y_train)

# Score on the training rows.
train_score = regr.score(x_train, y_train)
print("Elastic Net: %s" % train_score)

# RMSE / MPE on the train, primary test and secondary test sets.
dismod_rmse, dismod_mpe = eval_model(regr, df, dismod_features, targetmod, splits)

Elastic Net: 0.8466638373843819


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


# Full Model

In [22]:
# Full model: elastic net tuned by 4-fold cross-validated grid search.
#
# TODO: raising the alpha minimum to 0.59 silences the convergence warnings
# but decreases the score significantly -- cause still unknown.
# NOTE(review): the predictors are passed in unscaled and ElasticNet is left
# at its default iteration limit; either may be related -- confirm.
x_train, y_train = get_split(df, fullmod_features, targetmod, train_ind)

alphas = np.linspace(0.001, 1, 20)
parameters = {"alpha": alphas, "l1_ratio": [0.001, 0.75, 1.]}

enet = ElasticNet(random_state=54)
regr = GridSearchCV(enet, parameters, cv=4)
regr.fit(x_train, y_train)

# Score on the training rows.
train_score = regr.score(x_train, y_train)
print("Elastic Net: %s" % train_score)

# RMSE / MPE on the train, primary test and secondary test sets.
fullmod_rmse, fullmod_mpe = eval_model(regr, df, fullmod_features, targetmod, splits)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Elastic Net: 0.9207206035327872


  model = cd_fast.enet_coordinate_descent(
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


# Evaluate all linear regression models

In [23]:
# Summary table: one row per regression model. Each *_rmse / *_mpe list holds
# one value per entry of `splits` (train, primary test, secondary test).
pd.DataFrame(
    [
        ["Variance model", *varmod_rmse, *varmod_mpe],
        ["Discharge model", *dismod_rmse, *dismod_mpe],
        ["Full model", *fullmod_rmse, *fullmod_mpe],
    ],
    columns=[
        "Model",
        "RMSE - Train",
        "RMSE - Primary test",
        "RMSE - Secondary test",
        "MPE - Train",
        "MPE - Primary test",
        "MPE - Secondary test",
    ],
)

Unnamed: 0,Model,RMSE - Train,RMSE - Primary test,RMSE - Secondary test,MPE - Train,MPE - Primary test,MPE - Secondary test
0,Variance model,151.66533,166.800507,183.023307,21.615414,22.232938,12.874134
1,Discharge model,126.531512,211.11698,225.425486,18.271368,22.573476,12.387553
2,Full model,90.982228,131.272456,265.357683,11.536047,19.161455,20.083662


# Variance Classifier

In [24]:
# Variance classifier: logistic regression, with the inverse regularisation
# strength C chosen by 4-fold cross-validated grid search.
x_train, y_train = get_split(df, varclf_features, targetclf, train_ind)

parameters = {"C": [0.01, 0.1, 0.5, 0.75, 1]}

logreg = LogisticRegression(solver="liblinear", random_state=54)
clf = GridSearchCV(logreg, parameters, cv=4)

# The target is a one-column frame; it is flattened to 1-D for fit and score.
clf.fit(x_train, y_train.values.ravel())
train_accuracy = clf.score(x_train, y_train.values.ravel())
print("Logreg: %s" % train_accuracy)

# Accuracy on the train, primary test and secondary test sets.
varclf_acc = eval_classifier(clf, df, varclf_features, targetclf, splits)

Logreg: 0.8048780487804879


# Full Classifier

In [25]:
# Full classifier: logistic regression, with the inverse regularisation
# strength C chosen by 4-fold cross-validated grid search.
#
# TODO: why is the full classifier worse than the variance classifier?
x_train, y_train = get_split(df, fullclf_features, targetclf, train_ind)

parameters = {"C": [0.01, 0.1, 0.5, 0.75, 1]}

logreg = LogisticRegression(solver="liblinear", random_state=54)
clf = GridSearchCV(logreg, parameters, cv=4)

# The target is a one-column frame; it is flattened to 1-D for fit and score.
clf.fit(x_train, y_train.values.ravel())
train_accuracy = clf.score(x_train, y_train.values.ravel())
print("Logreg: %s" % train_accuracy)

# Accuracy on the train, primary test and secondary test sets.
fullclf_acc = eval_classifier(clf, df, fullclf_features, targetclf, splits)

Logreg: 0.6585365853658537


# Evaluate all classifiers

In [26]:
# Summary table: one row per classifier. Each *_acc list holds one accuracy
# per entry of `splits` (train, primary test, secondary test).
pd.DataFrame(
    [
        ["Variance classifier", *varclf_acc],
        ["Full classifier", *fullclf_acc],
    ],
    columns=[
        "Classifier",
        "Acc - Train",
        "Acc - Primary test",
        "Acc - Secondary test",
    ],
)

Unnamed: 0,Classifier,Acc - Train,Acc - Primary test,Acc - Secondary test
0,Variance classifier,0.804878,0.767442,0.95
1,Full classifier,0.658537,0.627907,0.6
