In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
def make_preds(X, model_coeffs,model_intercepts,model_classes):
    """
    Predict class labels from the saved parameters of a logistic regression.

    The linear decision scores ``X @ model_coeffs.T + model_intercepts`` are
    computed with NumPy and the label with the highest score is returned for
    each sample.  No sigmoid is applied: it is strictly increasing, so it
    cannot change which score is largest, but in floating point it saturates
    to exactly 1.0 for large scores (creating spurious ties) and overflows in
    ``np.exp`` for very negative ones.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix to classify.
    model_coeffs : array-like, shape (n_rows, n_features)
        Coefficient matrix.  ``n_rows`` equals the number of classes, or 1
        for a two-class model.
    model_intercepts : array-like, shape (n_rows,)
        Intercept belonging to each coefficient row.
    model_classes : array-like, shape (n_classes,)
        Class labels, ordered consistently with the coefficient rows.

    Returns
    -------
    numpy.ndarray, shape (n_samples,)
        Predicted label for every row of ``X``.
    """
    X = np.asarray(X)
    model_coeffs = np.asarray(model_coeffs)
    model_classes = np.asarray(model_classes)

    z = np.matmul(X, model_coeffs.transpose()) + model_intercepts

    if z.shape[1] == 1:
        # Two-class model: the single score column refers to the second
        # label, so a positive score selects index 1 and anything else
        # index 0.  Taking argmax over one column would always give 0.
        best = (z[:, 0] > 0).astype(int)
    else:
        best = z.argmax(axis=1)
    return model_classes[best]

# Define the paths for the datasets and the model weights. These point to a local test setup; update them to match the repository layout before running this elsewhere.

In [3]:
# Both directories live under one local project folder; each path keeps its
# trailing slash because later cells append file names with plain `+`.
project_root = '/Users/andrew/school_assignments/datascience_project/'
dataset_path = project_root + 'saved_datasets/'
model_weight_path = project_root + 'model_weights/'

# Training the model:

In [4]:
# Load the pickled E. coli table; later cells read its "UniRep", "dG" and
# "location" columns.
ecoli_pickle_file = dataset_path + 'ecoli_noSP_data_and_dG.pkl'
ecoli_data = pd.read_pickle(ecoli_pickle_file)

In [5]:
# Build the feature matrix from the E. coli table loaded above (the file name
# says "noSP" -- presumably "no signal peptide"; confirm with the dataset author).
# Each row is the UniRep vector with the dG value appended as one extra column.
unirep_features = np.stack(ecoli_data["UniRep"].to_numpy())
dg_column = ecoli_data["dG"].to_numpy()[:, np.newaxis]
X = np.hstack((unirep_features, dg_column))

# Target labels come from the 'location' column.
y = np.array(ecoli_data['location'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### For example, let's try a simple logistic regression:

In [6]:
# Logistic regression with the liblinear solver; every other hyper-parameter
# is left at its scikit-learn default.
logreg_settings = {'solver': 'liblinear'}
model = LogisticRegression(**logreg_settings)

In [7]:
# fit() returns the estimator itself, so scoring on the held-out split can be
# chained directly onto the training call.
score = model.fit(X_train, y_train).score(X_test, y_test)
print(score)

0.9829476248477467


### Then let's save the trained parameters of the model as a pkl file (pickle can serialize a dictionary of NumPy arrays directly)

In [8]:
# Gather the fitted attributes that make_preds needs, so predictions can be
# reproduced later without the estimator object.
param_dict = dict(
    classes=model.classes_,
    coefficients=model.coef_,
    intercepts=model.intercept_,
)

In [9]:
# Write the parameter dictionary into the model-weights directory.
params_file = model_weight_path + 'ecoli_LogReg_params.pkl'
with open(params_file, 'wb') as out_file:
    pickle.dump(param_dict, out_file)

# Re-load weights and make predictions:

In [10]:
# Read the parameter dictionary back from disk.
# NOTE: unpickling can execute arbitrary code, so only load files you trust
# (this one was written by the cell above).
saved_params_file = model_weight_path + 'ecoli_LogReg_params.pkl'
with open(saved_params_file, 'rb') as in_file:
    param_dict = pickle.load(in_file)

In [11]:
# Predict the held-out samples using only the re-loaded parameters.
preds = make_preds(
    X_test,
    model_coeffs=param_dict['coefficients'],
    model_intercepts=param_dict['intercepts'],
    model_classes=param_dict['classes'],
)

### Check the percent agreement between the manual NumPy predictions and the predictions from the model object.

In [12]:
# Count how many NumPy-based predictions match the fitted estimator's own
# predictions, then report that as a percentage of the test set.
n_matching = np.sum(preds == model.predict(X_test))
percent_consistent = 100 * n_matching / preds.shape[0]
print('Percent consistency in prediction methods is: ' + str(percent_consistent) + ' %')
print('Yayyy, we did it all in numpy!  That was easy!')

Percent consistency in prediction methods is: 100.0 %
Yayyy, we did it all in numpy!  That was easy!
