In [1]:
import nltk
import random
#from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf
    
short_pos = open("short_reviews/positive.txt","r").read()
short_neg = open("short_reviews/negative.txt","r").read()

# move this up here
documents = []
df = []

#  j is adject, r is adverb, and v is verb
#allowed_word_types = ["J","R","V"]
allowed_word_types = ["J"]

for p in short_pos.split('\n'):
    documents.append( (p, "pos") ) # Apenda os documentos com a variavel resposta
    # Primeiro vou remover palavras que contem numeros
    words = word_tokenize(re.sub(r'\w*\d\w*', '', p).strip())
    pos = nltk.pos_tag(words) # Coloca etiqueta do tipo de palavra
    ww = []
    for w in pos:
        if w[1][0] in allowed_word_types:
            ww.append(w[0].lower())
    df.append( (' '.join(ww), "pos") )
    
for p in short_neg.split('\n'):
    documents.append( (p, "neg") ) # Apenda os documentos com a variavel resposta
    # Primeiro vou remover palavras que contem numeros
    words = word_tokenize(re.sub(r'\w*\d\w*', '', p).strip())
    pos = nltk.pos_tag(words) # Coloca etiqueta do tipo de palavra
    ww = []
    for w in pos:
        if w[1][0] in allowed_word_types:
            ww.append(w[0].lower())
    df.append( (' '.join(ww), "neg") )
    
df = pd.DataFrame(df, columns = ['sentence', 'y_true'])

save_documents = open("pickled_algos/documents.pickle","wb")
pickle.dump(documents, save_documents)
save_documents.close()


save_documents = open("pickled_algos/df.pickle","wb")
pickle.dump(df, save_documents)
save_documents.close()

In [2]:
train, test = train_test_split(df, test_size = 0.1, random_state = 0, stratify = df['y_true'])

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import lightgbm as lgb

from sklearn.externals import joblib

from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.utils import parallel_backend

seed = 123

parameters = {
    'transformer__tfidf__max_features': [1000],
    #'transformer__tfidf__ngram_range': [(1, 1), (1, 2)],
    'transformer__tfidf__ngram_range': [(1, 2)],
    'transformer__tfidf__max_df': [.6],
    'transformer__tfidf__min_df': [20],
    'clf__max_iter': [5000],
    'clf__C': [.1, 1, 10, 100],
    'clf__penalty': ['l1', 'l2'],
    'clf__fit_intercept': [True],
    'clf__class_weight': ['balanced', None],
    #'clf__solver': ['lbfgs'],
    'clf__random_state': [seed]
}

pipeline = Pipeline([
    ('transformer', ColumnTransformer([('tfidf', TfidfVectorizer(analyzer = 'word'), 'sentence')], remainder = 'passthrough')),
    ('clf', LogisticRegression() )
])

cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 5, random_state = seed)
gs = GridSearchCV(pipeline, parameters, cv = cv, scoring = 'f1_macro', n_jobs = 5, verbose = 1, refit = True)

#Rodando GridSearch
with parallel_backend('multiprocessing'):
    gs.fit(train.drop(columns = 'y_true'), train['y_true'])

save_classifier = open("pickled_algos/gs.pickle","wb")
pickle.dump(gs, save_classifier)
save_classifier.close()

Fitting 50 folds for each of 16 candidates, totalling 800 fits


[Parallel(n_jobs=5)]: Using backend MultiprocessingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   12.3s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:   31.5s
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:  1.2min
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed:  2.2min
[Parallel(n_jobs=5)]: Done 800 out of 800 | elapsed:  2.2min finished


In [4]:
def feature_names_tfidf(X, column_tfidf, **params_tfidf):
    tfidf = TfidfVectorizer(**params_tfidf)
    tfidf.fit(X[column_tfidf])
    tfidf_feature_names = {'tdidf__' + x for x in tfidf.get_feature_names()}
    feature_names = list(tfidf_feature_names) + list(X.columns.drop(column_tfidf))
    return feature_names

featurenames = feature_names_tfidf(X = train.drop(columns = 'y_true'), column_tfidf = 'sentence',
    max_df = list(gs.best_params_.values())[list(gs.best_params_).index('transformer__tfidf__max_df')],
    min_df = list(gs.best_params_.values())[list(gs.best_params_).index('transformer__tfidf__min_df')],
    max_features = list(gs.best_params_.values())[list(gs.best_params_).index('transformer__tfidf__max_features')],
    ngram_range = list(gs.best_params_.values())[list(gs.best_params_).index('transformer__tfidf__ngram_range')]
)

k = gs.best_estimator_.named_steps['clf']
Features = pd.DataFrame(k.coef_.tolist())
Features.columns = featurenames

In [5]:
gs.best_params_

{'clf__C': 1,
 'clf__class_weight': None,
 'clf__fit_intercept': True,
 'clf__max_iter': 5000,
 'clf__penalty': 'l2',
 'clf__random_state': 123,
 'transformer__tfidf__max_df': 0.6,
 'transformer__tfidf__max_features': 1000,
 'transformer__tfidf__min_df': 20,
 'transformer__tfidf__ngram_range': (1, 2)}

In [6]:
predtr = gs.best_estimator_.predict(train.drop(columns = 'y_true'))
predte = gs.best_estimator_.predict(test.drop(columns = 'y_true'))

In [10]:
407+257

664

In [8]:
668/1067

0.6260543580131209

In [9]:
pd.crosstab(predte, test['y_true'])

y_true,neg,pos
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
neg,407,276
pos,127,257
