In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_curve, roc_auc_score

In [2]:
# Load the held-out test table stored under the 'TEST' key of the HDF5 file.
df_test = pd.read_hdf('../../data/test.h5', key='TEST')

In [3]:
# Keep only the rows that carry an 'insf_Mv' label, then renumber them from 0
# so they line up positionally with the feature matrix built further down.
has_label = df_test['insf_Mv'].notnull()
df_test = df_test.loc[has_label].reset_index(drop=True)

In [4]:
# df_test = df_test.drop([i for i in df_test.index if not df_test.loc[i, 'tokenized_sentences']]).reset_index()

In [4]:
# Load the pickled word2vec model used to embed the tokenized sentences.
# A context manager guarantees the file handle is closed after loading
# (the bare open() call previously leaked it).
# NOTE(security): pickle.load can execute arbitrary code -- only load model
# files that come from a trusted source.
with open("../Final_Models/W2Vmodel.pickle", "rb") as w2v_file:
    W2Vmodel = pickle.load(w2v_file)

In [5]:
def get_w2v_features(w2v_model, sentence_group):
    """Average the word vectors of every in-vocabulary word of a sentence group.

    Parameters
    ----------
    w2v_model : word2vec-style model
        Must expose ``vector_size`` and a ``wv`` attribute that supports
        ``wv[word]`` lookup and provides either ``wv.key_to_index``
        (gensim 4.x) or ``wv.vocab`` (gensim 3.x) for membership tests.
    sentence_group : iterable of iterables of str, or None
        The tokenized sentences of one document. May be empty.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``w2v_model.vector_size``: the mean of the
        vectors of all words known to the model, or all zeros when the group
        is empty / None or contains no known word.
    """
    feature_vec = np.zeros(w2v_model.vector_size, dtype="float32")
    if sentence_group is None:
        return feature_vec

    keyed_vectors = w2v_model.wv
    # Membership is tested directly against the model's vocabulary mapping
    # instead of copying it into a new set on every call.
    vocab = getattr(keyed_vectors, "key_to_index", None)
    if vocab is None:
        vocab = keyed_vectors.vocab

    # Walk the sentences directly: np.concatenate raises ValueError on an
    # empty sentence group, which would abort the whole feature extraction.
    n_words = 0
    for sentence in sentence_group:
        for word in sentence:
            if word in vocab:
                # Look the vector up through .wv; indexing the model object
                # itself is deprecated.
                feature_vec = feature_vec + keyed_vectors[word]
                n_words += 1

    # Only divide when at least one word contributed, so unknown-only or
    # empty input yields the zero vector rather than NaNs.
    if n_words > 0:
        feature_vec = feature_vec / n_words
    return feature_vec

In [6]:
# One averaged word vector per row of df_test['tokenized_sentences'].
w2v_features_test = [
    get_w2v_features(W2Vmodel, sen_group)
    for sen_group in df_test['tokenized_sentences']
]
# Collect the per-row vectors into a single array (one row per document).
test_w2v = np.array([np.array(vec) for vec in w2v_features_test])



In [8]:
# test_w2v

In [9]:
# test = pd.concat([pd.DataFrame(test_w2v), df_test[['prot_Mv', 'prot_Ao', 'insf_Mv', 'insf_Ao',
#                               'est_Mv', 'est_Ao']]], axis=1)

In [7]:
# Place the word-vector features in the leading columns, followed by every
# original df_test column (both frames share the same 0..n-1 index).
W = pd.DataFrame(test_w2v)
test = pd.concat([W, df_test], axis='columns')

In [8]:
# The word-vector features are the leading columns of `test` (W was placed
# first in the concat). Take their count from W instead of hard-coding 200,
# so the slice stays correct if the embedding size ever changes.
n_features = W.shape[1]
X_test, y_test = test.iloc[:, :n_features], test['insf_Mv']

In [9]:
# Load the pickled classifier that is scored on the test features below.
with open('../Final_Models/ML_model_insf_Mv_SVM_pipeline.pickle', mode='rb') as f:
    model = pickle.load(f)

In [10]:
def Find_Optimal_Cutoff(target, predicted):
    """Pick the ROC threshold where true-positive and true-negative rates meet.

    The ROC curve of ``predicted`` against ``target`` is computed, and the
    threshold whose ``tpr - (1 - fpr)`` is closest to zero is selected.

    Parameters
    ----------
    target : array-like
        True labels, one per observation.
    predicted : array-like
        Predicted scores, one per observation.

    Returns
    -------
    list
        One-element list holding the selected threshold.
    """
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(target, predicted)

    # 'tf' is zero where the true-positive rate equals 1 - false-positive rate.
    roc = pd.DataFrame({
        'tf': true_pos_rate - (1 - false_pos_rate),
        'threshold': cutoffs,
    })

    # Position of the row whose 'tf' has the smallest absolute value.
    nearest_zero = roc['tf'].abs().argsort()[:1]
    best_row = roc.iloc[nearest_zero]

    return list(best_row['threshold'])

In [11]:
# Score the test features with the loaded model. Column 1 of the result is
# what the cells below pass to the cutoff search and to roc_auc_score --
# presumably the positive-class probability; confirm against model.classes_.
prob = model.predict_proba(X_test)

In [12]:
# Threshold on prob[:, 1] at which tpr and 1 - fpr are closest on the test ROC.
# NOTE(review): the recorded result (~1e-07) is essentially zero, which
# suggests the predicted scores are heavily skewed -- worth verifying.
Find_Optimal_Cutoff(y_test,prob[:,1])

[1.0069507054049262e-07]

In [13]:
# Area under the ROC curve of prob[:, 1] against the 'insf_Mv' test labels.
# NOTE(review): the recorded value (~0.596) is only modestly above the 0.5
# expected from random scores.
roc_auc_score(y_test,prob[:,1])

0.5957924089024942