In [161]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import random
from sklearn.model_selection import GridSearchCV

In [162]:
def train_validate_evaluate(classifier, dataset, feature):
    """Fit one classifier on one dataset/feature pair and report validation metrics.

    Parameters
    ----------
    classifier : str
        Estimator code: 'NB' (GaussianNB), 'LR' (LogisticRegression),
        'DT' (DecisionTreeClassifier) or 'RF' (RandomForestClassifier).
    dataset : str
        Name of the sub-directory of ``./data/SA/`` that holds the ``.npy`` file.
    feature : str
        'bow' loads ``labeled.npy``, 'bool-bow' loads ``bool-labeled.npy``;
        any other value loads ``lexiconlabeled.npy``.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``"<dataset> <classifier> <feature>"`` with the
        columns dataset, classifier, model, accuracy, precision, recall, f1_score.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported codes.
    """
    # Factories (not instances) so only the requested estimator is constructed.
    # Every estimator that accepts a seed gets one, so repeated runs agree.
    estimator_factories = {
        'NB': lambda: GaussianNB(),
        'LR': lambda: LogisticRegression(random_state=0),
        'DT': lambda: DecisionTreeClassifier(random_state=0),
        'RF': lambda: RandomForestClassifier(random_state=0),
    }
    if classifier not in estimator_factories:
        # Fail before touching the disk instead of hitting an unbound `clf` later.
        raise ValueError(
            "Unknown classifier %r; expected one of %s"
            % (classifier, sorted(estimator_factories))
        )

    if feature == 'bow':
        item = ''
    elif feature == 'bool-bow':
        item = 'bool-'
    else:
        item = 'lexicon'
    path = './data/SA/' + dataset + '/' + item + 'labeled.npy'

    # The stored array is transposed after loading; the last column is then
    # used as the label and every other column as a feature.
    data = np.transpose(np.load(path))
    X = data[:, :-1]
    y = data[:, -1]

    # NOTE(review): test_size=0.8 trains on 20% of the rows and validates on
    # the remaining 80% -- confirm this is intended and not a swapped ratio.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.8, random_state=15)

    clf = estimator_factories[classifier]()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    # The metrics use scikit-learn's default averaging, so the labels are
    # presumably binary -- verify against the data files.
    accuracy = metrics.accuracy_score(y_val, y_pred)
    precision = metrics.precision_score(y_val, y_pred)
    recall = metrics.recall_score(y_val, y_pred)
    f1_score = metrics.f1_score(y_val, y_pred)

    metrics_data = {'dataset': [dataset], 'classifier': [classifier], 'model': [feature],
                    'accuracy': [accuracy], 'precision': [precision], 'recall': [recall],
                    'f1_score': [f1_score]}
    df = pd.DataFrame(data=metrics_data)
    df.index = [dataset + ' ' + classifier + ' ' + feature]
    return df

In [96]:
# Experiment grid: each list is one axis of the evaluation sweep.

# Estimator codes understood by train_validate_evaluate.
classifiers = [
    'NB',
    'LR',
    'DT',
    'RF',
]

# Sub-directories of ./data/SA/ to evaluate.
datasets = [
    'books',
    'dvd',
    'electronics',
    'kitchen',
    'all',
]

# Feature encodings; each one selects a different .npy file per dataset.
features = [
    'bow',
    'bool-bow',
]

In [97]:
# Evaluate every classifier/dataset/feature combination and stack the
# one-row result frames into a single summary table.
# Rows are collected in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop re-copies all previous rows on every step.
results = []
for classifier in classifiers:
    for dataset in datasets:
        for feature in features:
            results.append(train_validate_evaluate(classifier, dataset, feature))

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
df = pd.concat(results, axis=0) if results else pd.DataFrame()

In [165]:
# Hyper-parameter grid explored by the random-forest grid search.
parameters = {
    # 500 distinct tree counts drawn from [1, 9999]. A dedicated, seeded RNG
    # keeps the grid identical across kernel restarts without touching the
    # global `random` state.
    # NOTE(review): 500 x 2 candidates with up to ~10k trees each is a very
    # expensive search -- consider a much smaller grid.
    'n_estimators': random.Random(15).sample(range(1, 10000), 500),
    # 'auto' was dropped: for classifiers it was an alias of 'sqrt' (a
    # duplicate candidate) and scikit-learn >= 1.3 rejects it.
    'max_features': ('sqrt', 'log2'),
}
# Metric labels; the grid-search routine appends '_macro' to build the scorer
# name, so the F1 entry must be 'f1' ('f1-score_macro' is not a valid scorer).
scores = ['precision', 'recall', 'accuracy', 'f1']

In [166]:
def _scoring_name(score):
    """Translate a short metric label into a scorer name GridSearchCV accepts."""
    if score == 'accuracy':
        # Accuracy has no averaged variant; 'accuracy_macro' is not a scorer.
        return 'accuracy'
    if score == 'f1-score':
        # The F1 scorers are registered under the 'f1' prefix.
        score = 'f1'
    return '%s_macro' % score


def test_rf_hyperparameters(dataset, feature):
    """Grid-search random-forest hyper-parameters once per metric and print the results.

    For every label in the module-level ``scores`` list, a ``GridSearchCV`` over
    the module-level ``parameters`` grid is fitted on the training split; the
    best parameters, the per-candidate mean/std test scores and a
    classification report on the validation split are printed.

    Parameters
    ----------
    dataset : str
        Name of the sub-directory of ``./data/SA/`` that holds the ``.npy`` file.
    feature : str
        'bow' loads ``labeled.npy``, 'bool-bow' loads ``bool-labeled.npy``;
        any other value loads ``lexiconlabeled.npy``.

    Returns
    -------
    None
        All results are written to standard output.
    """
    if feature == 'bow':
        item = ''
    elif feature == 'bool-bow':
        item = 'bool-'
    else:
        item = 'lexicon'
    path = './data/SA/' + dataset + '/' + item + 'labeled.npy'

    # The stored array is transposed after loading; the last column is then
    # used as the label and every other column as a feature.
    data = np.transpose(np.load(path))
    X = data[:, :-1]
    y = data[:, -1]

    # NOTE(review): test_size=0.8 trains on 20% of the rows and validates on
    # the remaining 80% -- confirm this is intended and not a swapped ratio.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.8, random_state=15)

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
        # Seed the forest so repeated searches rank the candidates identically.
        clf = GridSearchCV(RandomForestClassifier(random_state=0), parameters,
                           scoring=_scoring_name(score), n_jobs=10)
        clf.fit(X_train, y_train)
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            # Report the spread as two standard deviations.
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_val, clf.predict(X_val)
        print(classification_report(y_true, y_pred))
        print()
        print()

In [167]:
# Run the random-forest grid search on the 'books' dataset with the 'bow'
# features. The recorded run below was stopped with a KeyboardInterrupt, so
# no search results are shown.
test_rf_hyperparameters('books', 'bow')

# Tuning hyper-parameters for precision



KeyboardInterrupt: 

In [154]:
# Display the hyper-parameter grid currently bound to `parameters`.
# NOTE(review): the recorded output comes from an earlier execution
# (In [154]) and shows a different 'n_estimators' list than the cell that
# defines the grid -- re-run the notebook top to bottom to refresh it.
parameters

{'n_estimators': [0, 9], 'max_features': ('auto', 'sqrt', 'log2')}