In [17]:
from sklearn.metrics import roc_curve, auc
# Models
from sklearn.svm import LinearSVC #SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
# Other Packages
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.externals import joblib
import random
from timeit import default_timer as timer
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
# Display settings: show every column, and at most 100 rows, when a frame is rendered
pd.options.display.max_columns = None
pd.options.display.max_rows = 100



In [18]:
def prepare_data (data, sample_size, random_state=1):
    """Sample documents from ``data`` and split their rows into train and test.

    The split is done on document level: all rows that share a value in the
    'doc' column end up in the same partition.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'doc', 'chem' and 'word.is.lower'. Columns
        'doc' through 'chem' are kept unchanged; columns 'word.is.lower'
        through the last column get their missing values filled.
    sample_size : int
        Number of distinct documents to draw.
    random_state : int, optional
        Seed used both for drawing the documents and for the train/test
        split. Defaults to 1, the value that was previously hard-coded.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Rows of the sampled documents, partitioned by document using the
        default proportions of ``train_test_split``.
    """
    # Draw the documents from the unique ids so every document is equally
    # likely, independent of how many rows it has.
    unique_docs = pd.Series(data['doc'].unique())
    random_docs = unique_docs.sample(n=sample_size, random_state=random_state).values.tolist()
    data_sample = data[data['doc'].isin(random_docs)]

    # Left part: 'doc' .. 'chem'; right part: 'word.is.lower' .. last column.
    # Any column located between 'chem' and 'word.is.lower' is not carried over.
    left_part = data_sample.loc[:, 'doc':'chem']
    right_part = data_sample.loc[:, 'word.is.lower':data.columns[-1]]

    # Replace missing values with the most frequent value of each column
    # (first row of mode()).
    # NOTE(review): the mode is computed over train and test rows together,
    # i.e. before the split — confirm this is acceptable for the evaluation.
    right_part = right_part.fillna(right_part.mode().iloc[0])
    data_sample = pd.concat([left_part, right_part], axis=1, sort=False)

    # Split the sampled document ids, then select the matching rows
    docs_train, docs_test = train_test_split(random_docs, random_state=random_state)
    train = data_sample[data_sample['doc'].isin(docs_train)]
    test = data_sample[data_sample['doc'].isin(docs_test)]

    return train, test

In [19]:
def n_estimators (x_train, y_train, x_test=None, y_test=None, random_state=None):
    """Plot train and test AUC of a random forest for several tree counts.

    For every value in [1, 2, 4, 8, 16, 32, 64, 100, 200] a
    RandomForestClassifier is fitted on the training data. The AUC of its
    ``predict`` output is computed on the training and on the test data, and
    both curves are plotted against the number of trees.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test : optional
        Test features and labels. When omitted, the notebook-level variables
        ``x_test`` / ``y_test`` are used, which is what the two-argument call
        has always relied on.
    random_state : optional
        Seed forwarded to RandomForestClassifier. ``None`` (default) leaves
        the forest unseeded, as before.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    from matplotlib.legend_handler import HandlerLine2D

    # Backward compatibility: fall back to the notebook globals
    if x_test is None:
        x_test = globals()['x_test']
    if y_test is None:
        y_test = globals()['y_test']

    def _auc_score(y_true, y_pred):
        # Area under the ROC curve spanned by the given predictions.
        # NOTE(review): predictions are hard class labels from predict(), not
        # predict_proba scores — confirm that this coarse ROC is intended.
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        return auc(fpr, tpr)

    tree_counts = [1, 2, 4, 8, 16, 32, 64, 100, 200]
    train_results = []
    test_results = []

    for tree_count in tree_counts:
        rf = RandomForestClassifier(n_estimators=tree_count, n_jobs=-1,
                                    random_state=random_state)
        rf.fit(x_train, y_train)
        train_results.append(_auc_score(y_train, rf.predict(x_train)))
        test_results.append(_auc_score(y_test, rf.predict(x_test)))

    line1, = plt.plot(tree_counts, train_results, 'b', label='Train AUC')
    line2, = plt.plot(tree_counts, test_results, 'r', label='Test AUC')
    plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.ylabel('AUC score')
    plt.xlabel('n_estimators')
    plt.show()

In [20]:
def max_depth (x_train, y_train, x_test=None, y_test=None, random_state=None):
    """Plot train and test AUC of a random forest for tree depths 1 to 32.

    For every depth a RandomForestClassifier is fitted on the training data.
    The AUC of its ``predict`` output is computed on the training and on the
    test data, and both curves are plotted against the depth.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test : optional
        Test features and labels. When omitted, the notebook-level variables
        ``x_test`` / ``y_test`` are used, which is what the two-argument call
        has always relied on.
    random_state : optional
        Seed forwarded to RandomForestClassifier. ``None`` (default) leaves
        the forest unseeded, as before.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    from matplotlib.legend_handler import HandlerLine2D

    # Backward compatibility: fall back to the notebook globals
    if x_test is None:
        x_test = globals()['x_test']
    if y_test is None:
        y_test = globals()['y_test']

    def _auc_score(y_true, y_pred):
        # Area under the ROC curve spanned by the given predictions.
        # NOTE(review): predictions are hard class labels from predict(), not
        # predict_proba scores — confirm that this coarse ROC is intended.
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        return auc(fpr, tpr)

    # Integer depths 1..32; the former np.linspace produced floats (1.0, 2.0,
    # ...), which recent scikit-learn versions reject for max_depth.
    depths = list(range(1, 33))
    train_results = []
    test_results = []

    for depth in depths:
        rf = RandomForestClassifier(max_depth=depth, n_jobs=-1,
                                    random_state=random_state)
        rf.fit(x_train, y_train)
        train_results.append(_auc_score(y_train, rf.predict(x_train)))
        test_results.append(_auc_score(y_test, rf.predict(x_test)))

    line1, = plt.plot(depths, train_results, 'b', label='Train AUC')
    line2, = plt.plot(depths, test_results, 'r', label='Test AUC')
    plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.ylabel('AUC score')
    # The x axis shows the tree depth (the label used to say 'n_estimators')
    plt.xlabel('max_depth')
    plt.show()

In [21]:
def min_samples_split (x_train, y_train, x_test=None, y_test=None, random_state=None):
    """Plot train and test AUC of a random forest for several split fractions.

    ``min_samples_split`` is varied over ten evenly spaced fractions from 0.1
    to 1.0. For every value a RandomForestClassifier is fitted on the training
    data. The AUC of its ``predict`` output is computed on the training and on
    the test data, and both curves are plotted against the fraction.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test : optional
        Test features and labels. When omitted, the notebook-level variables
        ``x_test`` / ``y_test`` are used, which is what the two-argument call
        has always relied on.
    random_state : optional
        Seed forwarded to RandomForestClassifier. ``None`` (default) leaves
        the forest unseeded, as before.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    from matplotlib.legend_handler import HandlerLine2D

    # Backward compatibility: fall back to the notebook globals
    if x_test is None:
        x_test = globals()['x_test']
    if y_test is None:
        y_test = globals()['y_test']

    def _auc_score(y_true, y_pred):
        # Area under the ROC curve spanned by the given predictions.
        # NOTE(review): predictions are hard class labels from predict(), not
        # predict_proba scores — confirm that this coarse ROC is intended.
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        return auc(fpr, tpr)

    # Floats are interpreted by scikit-learn as fractions of the sample count
    split_fractions = np.linspace(0.1, 1.0, 10, endpoint=True)
    train_results = []
    test_results = []

    for split_fraction in split_fractions:
        rf = RandomForestClassifier(min_samples_split=split_fraction,
                                    random_state=random_state)
        rf.fit(x_train, y_train)
        train_results.append(_auc_score(y_train, rf.predict(x_train)))
        test_results.append(_auc_score(y_test, rf.predict(x_test)))

    line1, = plt.plot(split_fractions, train_results, 'b', label='Train AUC')
    line2, = plt.plot(split_fractions, test_results, 'r', label='Test AUC')
    plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.ylabel('AUC score')
    # The x axis shows the split fraction (the label used to say 'n_estimators')
    plt.xlabel('min_samples_split')
    plt.show()

In [22]:
def min_samples_leaf (x_train, y_train, x_test=None, y_test=None, random_state=None):
    """Plot train and test AUC of a random forest for several leaf fractions.

    ``min_samples_leaf`` is varied over five evenly spaced fractions from 0.1
    to 0.5. For every value a RandomForestClassifier is fitted on the training
    data. The AUC of its ``predict`` output is computed on the training and on
    the test data, and both curves are plotted against the fraction.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test : optional
        Test features and labels. When omitted, the notebook-level variables
        ``x_test`` / ``y_test`` are used, which is what the two-argument call
        has always relied on.
    random_state : optional
        Seed forwarded to RandomForestClassifier. ``None`` (default) leaves
        the forest unseeded, as before.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    from matplotlib.legend_handler import HandlerLine2D

    # Backward compatibility: fall back to the notebook globals
    if x_test is None:
        x_test = globals()['x_test']
    if y_test is None:
        y_test = globals()['y_test']

    def _auc_score(y_true, y_pred):
        # Area under the ROC curve spanned by the given predictions.
        # NOTE(review): predictions are hard class labels from predict(), not
        # predict_proba scores — confirm that this coarse ROC is intended.
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        return auc(fpr, tpr)

    # Floats are interpreted by scikit-learn as fractions of the sample count
    leaf_fractions = np.linspace(0.1, 0.5, 5, endpoint=True)
    train_results = []
    test_results = []

    for leaf_fraction in leaf_fractions:
        rf = RandomForestClassifier(min_samples_leaf=leaf_fraction,
                                    random_state=random_state)
        rf.fit(x_train, y_train)
        train_results.append(_auc_score(y_train, rf.predict(x_train)))
        test_results.append(_auc_score(y_test, rf.predict(x_test)))

    line1, = plt.plot(leaf_fractions, train_results, 'b', label='Train AUC')
    line2, = plt.plot(leaf_fractions, test_results, 'r', label='Test AUC')
    plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.ylabel('AUC score')
    # The x axis shows the leaf fraction (the label used to say 'n_estimators')
    plt.xlabel('min_samples_leaf')
    plt.show()

In [23]:
def max_features (x_train, y_train, x_test=None, y_test=None, random_state=None):
    """Plot train and test AUC of a random forest for every feature count.

    ``max_features`` is varied from 1 up to the number of columns of
    ``x_train``. For every value a RandomForestClassifier is fitted on the
    training data. The AUC of its ``predict`` output is computed on the
    training and on the test data, and both curves are plotted against the
    feature count.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``. ``x_train`` must
        expose ``shape`` with the feature count in ``shape[1]``.
    x_test, y_test : optional
        Test features and labels. When omitted, the notebook-level variables
        ``x_test`` / ``y_test`` are used, which is what the two-argument call
        has always relied on.
    random_state : optional
        Seed forwarded to RandomForestClassifier. ``None`` (default) leaves
        the forest unseeded, as before.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    from matplotlib.legend_handler import HandlerLine2D

    # Backward compatibility: fall back to the notebook globals
    if x_test is None:
        x_test = globals()['x_test']
    if y_test is None:
        y_test = globals()['y_test']

    def _auc_score(y_true, y_pred):
        # Area under the ROC curve spanned by the given predictions.
        # NOTE(review): predictions are hard class labels from predict(), not
        # predict_proba scores — confirm that this coarse ROC is intended.
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        return auc(fpr, tpr)

    # Bound the range by the features actually passed in. The global `train`
    # frame used before has more columns than x_train, so max_features would
    # have exceeded the number of available features.
    feature_counts = list(range(1, x_train.shape[1] + 1))
    train_results = []
    test_results = []

    for feature_count in feature_counts:
        rf = RandomForestClassifier(max_features=feature_count,
                                    random_state=random_state)
        rf.fit(x_train, y_train)
        train_results.append(_auc_score(y_train, rf.predict(x_train)))
        test_results.append(_auc_score(y_test, rf.predict(x_test)))

    line1, = plt.plot(feature_counts, train_results, 'b', label='Train AUC')
    # Was `ttest_results`, an undefined name that raised NameError
    line2, = plt.plot(feature_counts, test_results, 'r', label='Test AUC')
    plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.ylabel('AUC score')
    # The x axis shows the feature count (the label used to say 'n_estimators')
    plt.xlabel('max_features')
    plt.show()

In [24]:
# Names of the label columns; each one is tuned on its own in the loops below
labels = [
    'chapter', 'subchapter', 'version', 'directive', 'signal', 'company',
    'date', 'date_oldversiondate', 'date_printdate', 'date_revisiondate', 'date_validdate',
    'usecase', 'usecase_con', 'usecase_pro',
    'chem',
]

In [25]:
# Load the input frame from a pickle file; the path is relative, so it is
# resolved against the current working directory.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if its origin is trusted.
data = pd.read_pickle('data_model_allfeatw13_3_web.pkl')

In [26]:
# Sample 500 documents and split their rows into a train and a test frame
train, test = prepare_data(data, 500)

# Resolve column positions on `train`, because they are consumed by
# train.iloc / test.iloc below. prepare_data rebuilds the frame from two
# column slices, so positions looked up on `data` are only valid when no
# column was dropped in between.
colidx = train.columns.get_loc

# Positional column ranges of the individual feature groups.
# NOTE(review): the end of each range is exclusive, so the column named as the
# end label (e.g. '299' for r_web_w) is not part of the range — confirm this
# is intended.
r_paper_w = np.r_[colidx('word.is.lower'):colidx('+6_word.is.stop')]
r_own_w = np.r_[colidx('word.is.title'):colidx('+6_is.page.3')]
r_own_date_spe_w = np.r_[colidx('word.is.print.date.trigger'):colidx('+6_word.is.oldversion.date.trigger')]
r_own_date_full_w = np.r_[r_own_w, r_own_date_spe_w]
r_web_w = np.r_[colidx('0'):colidx('299')]

# (description, column positions) of the feature set used for tuning
feat = ('Own+Paper+Wordembedding', np.r_[r_own_w, r_paper_w, r_web_w])
# Separate random_data training/test split in features and labels
x_train = train.iloc[:, feat[1]]
x_test = test.iloc[:, feat[1]]

In [27]:
# Plot the n_estimators tuning curve once per label column
for l in labels:
    # y_test has to live at notebook level: n_estimators is only handed the
    # training data and picks up x_test / y_test from the notebook namespace.
    y_train, y_test = train.loc[:, l], test.loc[:, l]

    print ('************' + l + '************')
    n_estimators(x_train, y_train)
    

************chapter************


KeyboardInterrupt: 

In [28]:
# Plot the max_depth tuning curve once per label column
for l in labels:
    # y_test has to live at notebook level: max_depth is only handed the
    # training data and picks up x_test / y_test from the notebook namespace.
    y_train, y_test = train.loc[:, l], test.loc[:, l]

    print ('************' + l + '************')
    max_depth(x_train, y_train)

************chapter************




KeyboardInterrupt: 

In [None]:
# Plot the min_samples_split tuning curve once per label column
for l in labels:
    # y_test has to live at notebook level: min_samples_split is only handed
    # the training data and picks up x_test / y_test from the notebook namespace.
    y_train, y_test = train.loc[:, l], test.loc[:, l]

    print ('************' + l + '************')
    min_samples_split(x_train, y_train)

In [None]:
# Plot the min_samples_leaf tuning curve once per label column
for l in labels:
    # y_test has to live at notebook level: min_samples_leaf is only handed
    # the training data and picks up x_test / y_test from the notebook namespace.
    y_train, y_test = train.loc[:, l], test.loc[:, l]

    print ('************' + l + '************')
    min_samples_leaf(x_train, y_train)

In [None]:
# Plot the max_features tuning curve once per label column
for l in labels:
    # y_test has to live at notebook level: max_features is only handed the
    # training data and picks up x_test / y_test from the notebook namespace.
    y_train, y_test = train.loc[:, l], test.loc[:, l]

    print ('************' + l + '************')
    max_features(x_train, y_train)