In [1]:
import matplotlib.pyplot as plt
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from numpy import array
from pandas import Series
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc, precision_recall_curve

Using TensorFlow backend.


In [2]:
# load doc into memory
def load_doc(filename):
    """Read a whole text file into memory.

    Parameters
    ----------
    filename : str
        Path of the file to read; it is opened read-only in text mode.

    Returns
    -------
    str
        The complete contents of the file.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises,
    # so a failed read cannot leak the file descriptor.
    with open(filename, 'r') as handle:
        return handle.read()

In [3]:
# Load the prepared training sequences, one sequence of words per line.
in_filename = 'email_sequences.txt'
doc = load_doc(in_filename)
# Drop blank lines (e.g. the empty string produced by a trailing newline):
# they would tokenize to an empty sequence, and the 2-D slicing used to
# separate inputs from targets needs every sequence to have the same length.
lines = [line for line in doc.split('\n') if line.strip()]

In [4]:
# Build the word -> integer mapping from the corpus.
# NOTE(review): num_words caps which words appear in the encoded sequences,
# while word_index still lists every word seen - confirm the corpus vocabulary
# stays below the cap, otherwise vocab_size overstates what can occur.
tokenizer = Tokenizer(num_words=12345)
tokenizer.fit_on_texts(lines)
# vocabulary size: one more than the number of indexed words
# (presumably because index 0 is reserved by Keras - confirm)
vocab_size = 1 + len(tokenizer.word_index)
# integer encode every line as a list of word indices
sequences = tokenizer.texts_to_sequences(lines)

In [None]:
# Turn the encoded sequences into a 2-D array: one row per sequence.
sequences = array(sequences)
# Inputs: every column except the last one.
X = sequences[:, :-1]
seq_length = X.shape[1]
# Targets: the last column, one-hot encoded over the vocabulary.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [None]:
# Hold out the first 20000 examples for validation; the rest is for training.
x_val, partial_x_train = X[:20000], X[20000:]
y_val, partial_y_train = y[:20000], y[20000:]

In [None]:
# Shrink the training split to rows 20000..59999 (at most 40000 examples)
# to avoid over-allocating memory and hitting a MemoryError.
train_rows = slice(20000, 60000)
partial_x_train = X[train_rows]
partial_y_train = y[train_rows]

In [None]:
def roc_curve_acc(Y_test, Y_pred, method):
    """Plot a ROC curve with its AUC shown in the legend.

    Parameters
    ----------
    Y_test : array-like
        True labels: either 1-D binary labels or a 2-D one-hot /
        indicator matrix.
    Y_pred : array-like
        Predictions or scores with the same shape as ``Y_test``.
    method : str
        Name of the model, used in the legend label.

    Notes
    -----
    ``roc_curve`` only accepts 1-D binary targets, so both inputs are
    flattened first. For a 2-D indicator matrix this gives the
    micro-averaged curve (every sample/class cell is one binary decision);
    for 1-D binary input the flattening changes nothing.
    The curve is drawn on the current matplotlib figure; nothing is returned.
    """
    y_true = np.ravel(Y_test)
    y_score = np.ravel(Y_pred)
    # thresholds are not needed for the plot
    false_positive_rate, true_positive_rate, _ = roc_curve(y_true, y_score)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, color='darkorange',
             label='%s AUC = %0.3f' % (method, roc_auc))
    plt.legend(loc='lower right')
    # diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], 'b--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

In [None]:
# Hyper-parameters shared by the random forest estimators fitted in this notebook.
parameters = {'bootstrap': True,
              'min_samples_leaf': 3,
              'n_estimators': 100,
              'min_samples_split': 10,
              'max_features': 'sqrt',
              'max_depth': 6,
              'max_leaf_nodes': None,
              # fixed seed so bootstrap sampling and feature sub-sampling
              # give the same forest on every Restart & Run All
              'random_state': 42}

In [None]:
# Fit the random forest classifier on the reduced training split.
# Only the classifier is trained: a regressor bound to the same name would be
# overwritten immediately, and every later cell (feature importances,
# classification report, ROC curve) uses the classifier.
RF = RandomForestClassifier(**parameters)
RF.fit(partial_x_train, partial_y_train)

In [None]:
# Importance of each input column, as reported by the fitted forest.
feature_importance = RF.feature_importances_
# The features are the columns of X, i.e. the word positions inside each
# input sequence, so label them by position (the targets are not feature names).
features_list = np.array(['word position %d' % i
                          for i in range(len(feature_importance))])
# argsort is ascending, so the 20 most important features are the LAST 20;
# keeping them in ascending order puts the most important bar at the top.
sorted_idx = np.argsort(feature_importance)[-20:]

plt.figure(figsize=(5,7))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), features_list[sorted_idx])
plt.xlabel('Importance')
plt.title('Feature importances')
plt.show()

In [None]:
# Predict on the held-out validation inputs with the fitted forest.
# NOTE(review): predict() presumably returns hard class decisions rather than
# probabilities - confirm that is what the report and ROC plot should consume.
Y_pred = RF.predict(x_val)

In [None]:
# Text summary of the validation predictions against the true labels.
rf_report = classification_report(y_val, Y_pred)
print("Random Forest Classifier report \n", rf_report)

In [None]:
# Draw the ROC curve for the validation predictions; "RF" is the legend label.
roc_curve_acc(y_val, Y_pred, "RF")