In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [2]:
def train_validate_evaluate(classifier, dataset, feature, test_size=0.8):
    """Fit one classifier on one dataset/feature file and score it on a hold-out split.

    The array stored at ``./data/SA/<dataset>/<prefix>labeled.npy`` is loaded
    and transposed; after the transpose every column except the last is used
    as a feature and the last column is used as the label.

    Parameters
    ----------
    classifier : str
        Classifier code: 'NB' (GaussianNB), 'LR' (LogisticRegression),
        'DT' (DecisionTreeClassifier) or 'RF' (RandomForestClassifier).
    dataset : str
        Name of the sub-directory of ``./data/SA/`` to read from.
    feature : str
        'bow' selects ``labeled.npy``, 'bool-bow' selects ``bool-labeled.npy``;
        any other value selects ``lexiconlabeled.npy``.
    test_size : float, default 0.8
        Passed straight to ``train_test_split``.
        NOTE(review): the default holds out 80% of the rows for validation,
        i.e. the model is trained on only 20% — confirm this is intended.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``'<dataset> <classifier> <feature>'`` with the
        columns dataset, classifier, model, accuracy, precision, recall and
        f1_score.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported codes.
    """
    # Validate before touching the disk so a typo fails fast and with a clear
    # message, instead of surfacing later as an unbound `clf`.
    supported = ('NB', 'LR', 'DT', 'RF')
    if classifier not in supported:
        raise ValueError(
            'Unknown classifier %r; expected one of %s' % (classifier, ', '.join(supported))
        )

    if feature == 'bow':
        item = ''
    elif feature == 'bool-bow':
        item = 'bool-'
    else:
        item = 'lexicon'
    path = './data/SA/' + dataset + '/' + item + 'labeled.npy'

    data = np.transpose(np.load(path))
    X = data[:, :-1]
    y = data[:, -1]
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=15
    )

    if classifier == 'NB':
        clf = GaussianNB()
    elif classifier == 'LR':
        clf = LogisticRegression(random_state=0)
    elif classifier == 'DT':
        clf = DecisionTreeClassifier(random_state=0)
    else:
        # Seeded like the other estimators so repeated runs give the same scores.
        clf = RandomForestClassifier(random_state=0)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    # All four scores are computed with scikit-learn's default scorer settings.
    accuracy = metrics.accuracy_score(y_val, y_pred)
    precision = metrics.precision_score(y_val, y_pred)
    recall = metrics.recall_score(y_val, y_pred)
    f1 = metrics.f1_score(y_val, y_pred)

    metrics_data = {'dataset': [dataset], 'classifier': [classifier], 'model': [feature],
                    'accuracy': [accuracy], 'precision': [precision], 'recall': [recall],
                    'f1_score': [f1]}
    df = pd.DataFrame(data=metrics_data)
    df.index = [dataset + ' ' + classifier + ' ' + feature]
    return df

In [3]:
# Experiment grid: every classifier is evaluated on every dataset with every
# feature representation.
classifiers = [
    'NB',  # Gaussian naive Bayes
    'LR',  # logistic regression
    'DT',  # decision tree
    'RF',  # random forest
]
datasets = [
    'books',
    'dvd',
    'electronics',
    'kitchen',
    'all',
]
features = [
    'bow',
    'bool-bow',
]

In [21]:
# Run the full classifier x dataset x feature grid. The one-row result frames
# are collected first and concatenated once: calling pd.concat inside the loop
# copies the growing frame on every iteration, and seeding it with an empty
# DataFrame triggers a pandas deprecation warning.
result_frames = [
    train_validate_evaluate(classifier, dataset, feature)
    for classifier in classifiers
    for dataset in datasets
    for feature in features
]
# pd.concat rejects an empty list, so keep the original "empty frame" outcome
# for an empty grid.
df = pd.concat(result_frames, axis=0) if result_frames else pd.DataFrame()

KeyboardInterrupt: 

In [None]:
# Average every metric per classifier across all dataset/feature runs.
# numeric_only=True keeps the string columns ('dataset', 'model') out of the
# aggregation; pandas >= 2.0 raises a TypeError when asked to average them.
df.groupby('classifier').mean(numeric_only=True).plot.bar(figsize=(12,8), rot=48)
plt.title('Comparación de clasificadores', fontsize=20)
plt.xlabel('Clasificador', fontsize=15)
plt.ylabel('Porcentaje', fontsize=15)
plt.grid()
plt.savefig('./results/classifier_comparison.png')

In [None]:
# Average every metric per feature representation across all classifier/dataset
# runs. numeric_only=True keeps the string columns ('dataset', 'classifier')
# out of the aggregation; pandas >= 2.0 raises a TypeError when asked to
# average them.
df.groupby('model').mean(numeric_only=True).plot.bar(figsize=(12,8), rot=48)
plt.title('Comparación de modelos', fontsize=20)
plt.xlabel('Modelo', fontsize=15)
plt.ylabel('Porcentaje', fontsize=15)
plt.grid()
plt.savefig('./results/model_comparison.png')

In [None]:
# Average every metric per dataset across all classifier/feature runs.
# numeric_only=True keeps the string columns ('classifier', 'model') out of the
# aggregation; pandas >= 2.0 raises a TypeError when asked to average them.
df.groupby('dataset').mean(numeric_only=True).plot.bar(figsize=(12,8), rot=48)
plt.title('Comparación de datasets', fontsize=20)
# The bars are grouped by dataset here, so the axis must not reuse the
# 'Clasificador' label from the classifier chart.
plt.xlabel('Dataset', fontsize=15)
plt.ylabel('Porcentaje', fontsize=15)
plt.grid()
plt.savefig('./results/datasets_comparison.png')

In [8]:
# Candidate forest sizes: 50 distinct values drawn from 1..999, sorted ascending.
# A dedicated, seeded generator makes the grid identical after every kernel
# restart without disturbing the global `random` state.
n_estimators = np.sort(random.Random(15).sample(range(1, 1000), 50))
parameters = {
    'n_estimators': n_estimators,
    # 'auto' is deliberately not listed: for RandomForestClassifier it was an
    # alias of 'sqrt' (so it only duplicated work) and it was removed in
    # scikit-learn 1.3, where it makes every fit fail.
    'max_features': ('sqrt', 'log2'),
}
# Each entry is turned into a '<name>_macro' scorer by the tuning routine.
scores = ['precision', 'recall']

In [9]:
def test_rf_hyperparameters(dataset, feature):
    """Grid-search random-forest hyper-parameters for one dataset/feature file.

    The array stored at ``./data/SA/<dataset>/<prefix>labeled.npy`` is loaded
    and transposed; after the transpose every column except the last is used
    as a feature and the last column is used as the label. For every entry of
    the module-level ``scores`` list a ``GridSearchCV`` over the module-level
    ``parameters`` grid is fitted on the training split, its results are
    printed, and a classification report on the validation split is printed.

    Parameters
    ----------
    dataset : str
        Name of the sub-directory of ``./data/SA/`` to read from.
    feature : str
        'bow' selects ``labeled.npy``, 'bool-bow' selects ``bool-labeled.npy``;
        any other value selects ``lexiconlabeled.npy``.

    Returns
    -------
    list
        The fitted ``GridSearchCV`` objects, in the same order as ``scores``.
    """
    if feature == 'bow':
        item = ''
    elif feature == 'bool-bow':
        item = 'bool-'
    else:
        item = 'lexicon'
    path = './data/SA/' + dataset + '/' + item + 'labeled.npy'

    data = np.transpose(np.load(path))
    X = data[:, :-1]
    y = data[:, -1]
    # NOTE(review): test_size=0.8 leaves only 20% of the rows for the grid
    # search itself — confirm this is intended.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.8, random_state=15)

    clfs = []
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
        # The forest is seeded so that differences between grid points come
        # from the hyper-parameters rather than from run-to-run randomness.
        clf = GridSearchCV(RandomForestClassifier(random_state=0), parameters,
                           scoring='%s_macro' % score, n_jobs=-1)
        clf.fit(X_train, y_train)
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            # Reported spread is two standard deviations of the fold scores.
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_val, clf.predict(X_val)
        print(classification_report(y_true, y_pred))
        print()
        clfs.append(clf)
    return clfs

In [None]:
# Tune the random forest on each single-domain dataset, first on the plain
# bag-of-words features and then on the boolean variant (`_b` suffix).
# One assignment per call so that results already computed are kept if the
# cell is interrupted part-way through.
clfs_books = test_rf_hyperparameters(dataset='books', feature='bow')
clfs_books_b = test_rf_hyperparameters(dataset='books', feature='bool-bow')

clfs_dvd = test_rf_hyperparameters(dataset='dvd', feature='bow')
clfs_dvd_b = test_rf_hyperparameters(dataset='dvd', feature='bool-bow')

clfs_electronics = test_rf_hyperparameters(dataset='electronics', feature='bow')
clfs_electronics_b = test_rf_hyperparameters(dataset='electronics', feature='bool-bow')

clfs_kitchen = test_rf_hyperparameters(dataset='kitchen', feature='bow')
clfs_kitchen_b = test_rf_hyperparameters(dataset='kitchen', feature='bool-bow')

In [10]:
# Tune on the combined 'all' dataset: boolean bag-of-words first, then the
# plain bag-of-words features.
clfs_all_b = test_rf_hyperparameters(dataset='all', feature='bool-bow')
clfs_all = test_rf_hyperparameters(dataset='all', feature='bow')

# Tuning hyper-parameters for precision



KeyboardInterrupt: 

In [None]:
# Grid-search results for 'all' / 'bool-bow', first entry of the returned list.
clf = clfs_all_b[0]
mean_scores = np.asarray(clf.cv_results_['mean_test_score'], dtype=float)
# NaN-aware extrema: candidates whose fit failed are reported as NaN and must
# not poison the normalisation.
lowest = np.nanmin(mean_scores)
score_range = np.nanmax(mean_scores) - lowest
# Min-max scale the scores to [0, 1] for use as marker sizes. When every
# candidate scored the same the range is 0, so fall back to a uniform size
# instead of dividing by zero.
s = (mean_scores - lowest) / score_range if score_range > 0 else np.ones_like(mean_scores)
plt.scatter(clf.cv_results_['param_n_estimators'],
            clf.cv_results_['param_max_features'],
            s = s*100)
plt.xlabel('n_estimators')
plt.ylabel('max_features')

In [None]:
# Grid-search results for 'all' / 'bool-bow', second entry of the returned list.
clf = clfs_all_b[1]
mean_scores = np.asarray(clf.cv_results_['mean_test_score'], dtype=float)
# NaN-aware extrema: candidates whose fit failed are reported as NaN and must
# not poison the normalisation.
lowest = np.nanmin(mean_scores)
score_range = np.nanmax(mean_scores) - lowest
# Min-max scale the scores to [0, 1] for use as marker sizes. When every
# candidate scored the same the range is 0, so fall back to a uniform size
# instead of dividing by zero.
s = (mean_scores - lowest) / score_range if score_range > 0 else np.ones_like(mean_scores)
plt.scatter(clf.cv_results_['param_n_estimators'],
            clf.cv_results_['param_max_features'],
            s = s*100)
plt.xlabel('n_estimators')
plt.ylabel('max_features')

In [None]:
# Grid-search results for 'all' / 'bow', first entry of the returned list.
clf = clfs_all[0]
mean_scores = np.asarray(clf.cv_results_['mean_test_score'], dtype=float)
# NaN-aware extrema: candidates whose fit failed are reported as NaN and must
# not poison the normalisation.
lowest = np.nanmin(mean_scores)
score_range = np.nanmax(mean_scores) - lowest
# Min-max scale the scores to [0, 1] for use as marker sizes. When every
# candidate scored the same the range is 0, so fall back to a uniform size
# instead of dividing by zero.
s = (mean_scores - lowest) / score_range if score_range > 0 else np.ones_like(mean_scores)
plt.scatter(clf.cv_results_['param_n_estimators'],
            clf.cv_results_['param_max_features'],
            s = s*100)
plt.xlabel('n_estimators')
plt.ylabel('max_features')

In [None]:
# Grid-search results for 'all' / 'bow', second entry of the returned list.
clf = clfs_all[1]
mean_scores = np.asarray(clf.cv_results_['mean_test_score'], dtype=float)
# NaN-aware extrema: candidates whose fit failed are reported as NaN and must
# not poison the normalisation.
lowest = np.nanmin(mean_scores)
score_range = np.nanmax(mean_scores) - lowest
# Min-max scale the scores to [0, 1] for use as marker sizes. When every
# candidate scored the same the range is 0, so fall back to a uniform size
# instead of dividing by zero.
s = (mean_scores - lowest) / score_range if score_range > 0 else np.ones_like(mean_scores)
plt.scatter(clf.cv_results_['param_n_estimators'],
            clf.cv_results_['param_max_features'],
            s = s*100)
plt.xlabel('n_estimators')
plt.ylabel('max_features')