# Pré-processamento 20 Newsgroups

In [109]:
from time import time
from sklearn.datasets import load_files

# Read the corpus from ../smallerdataset (load_files treats each sub-folder
# name as a category) and report how long the import took.
t0 = time()
dataset = load_files(container_path='../smallerdataset', shuffle=True)
print('Import Time: %0.3fs' % (time() - t0))

# Raw documents (bytes, decoded later during pre-processing) and their labels.
X = dataset.data
y = dataset.target

# Peek at the first document plus the label array, then confirm sizes match.
print(X[0], y)
print(len(X), len(y))

Import Time: 0.476s
b"From: mpretzel@cs.utexas.edu (Benjamin W. Allums)\nSubject: Re: Mac II SCSI & PMMU socket question\n\nIn article <1qkmb2$n0d@jethro.Corp.Sun.COM> khc@marantz.Corp.Sun.COM writes:\n\n>1. The Mac II is supposed to have a socket for the MC68851 PMMU chip. Could\n>anyone let me know where that socket is on the motherboard. I have obtained\n>a PMMU chip (16 Mhz) from a surplus store, and would like to install it onto\n>my Mac II (circa 1987). But I cannot see the socket myself when I tried to\n>install it.\n\nThe original Mac II had an Apple MMU chip installed which performs a subset\nof the 68851's functions.  If you look underneath your front left floppy\nbay you will find three chips, all approximately the same size.  One will\nbe the 68020, the next the 68881, and the third, approximately the same\nsize, will be the Apple chip.  It is easy to spot because it has a 'hump'\nin the middle of it.\n\n\nExample:\n\n\n                         -----------\n                

In [52]:
import re
import nltk
# nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def pre_process_text(text):
    """Normalize a raw document into a space-separated string of lemmas.

    Pipeline: decode non-``str`` input as ISO-8859-1, replace every character
    that is not an ASCII letter with a space, lowercase, tokenize with NLTK,
    drop English stop words and tokens of one or two characters, and finally
    lemmatize each remaining token with WordNet.

    Parameters
    ----------
    text : str or bytes
        Raw document. Any value that is not a ``str`` is decoded with
        ``.decode('ISO-8859-1')``.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (empty string when no
        token survives the filters).
    """
    if not isinstance(text, str):
        text = text.decode('ISO-8859-1')

    # Remove special characters and digits, lowercase, and collapse runs of
    # whitespace into single spaces.
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = ' '.join(text.lower().split())

    # Tokenize sentence by sentence.
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    # Build the stop-word set once; the previous version rebuilt it for every
    # token, which made the filter needlessly quadratic.
    stop_words = set(stopwords.words('english'))

    # Drop stop words and tokens of two characters or fewer.
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Lemmatize and rebuild the pre-processed text.
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [46]:
# Quick sanity check: run the pre-processing on a short bytes literal, which
# goes through the non-str decode path of pre_process_text.
pre_process_text(b'Hello! How are u, my friend? You should be fine by now, shouldn\'t you? It is raining cats and dogs outside')

	Texto limpo.
 hello how are u my friend you should be fine by now shouldn t you it is raining cats and dogs outside
	Transformando em tokens.
 ['hello', 'how', 'are', 'u', 'my', 'friend', 'you', 'should', 'be', 'fine', 'by', 'now', 'shouldn', 't', 'you', 'it', 'is', 'raining', 'cats', 'and', 'dogs', 'outside']
	Retirando stopwords.
 ['hello', 'u', 'friend', 'fine', 'raining', 'cats', 'dogs', 'outside']
	Retirando palavras menores que 2.
 ['hello', 'friend', 'fine', 'raining', 'cats', 'dogs', 'outside']
	Lemmatizing.
 ['hello', 'friend', 'fine', 'raining', 'cat', 'dog', 'outside']


'hello friend fine raining cat dog outside'

In [110]:
# Replace every raw document in X with its pre-processed text and print the
# elapsed wall-clock time in seconds.
t0 = time()
X = list(map(pre_process_text, X))
print(time() - t0)

10.60123610496521


# BOW

In [111]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts with max_features=35000.
vectorizer_bow = CountVectorizer(
    max_features=35000,
)

# Time only the fit + transform step.
t0 = time()
X_bow = vectorizer_bow.fit_transform(X)
print(time() - t0)

0.45781707763671875


# TFIDF

In [193]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams. The corpus was already lowercased during
# pre-processing, hence lowercase=False.
vectorizer_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.7,
    max_features=57000,
    stop_words='english',
    lowercase=False,
    norm='l2',
)

# Time only the fit + transform step.
t0 = time()
X_tfidf = vectorizer_tfidf.fit_transform(X)
print(time() - t0)

1.4903137683868408


In [194]:
# Display the dimensions of the TF-IDF matrix.
X_tfidf.shape

(2928, 57000)

# Hashing

In [232]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import make_pipeline

# Hashed raw term counts (no normalisation, non-binary, no alternating sign).
hasher = HashingVectorizer(
    n_features=1040000,
    stop_words='english',
    binary=False,
    norm=None,
    alternate_sign=False,
)

# Chain the hasher with a TF-IDF re-weighting step and apply it to the corpus.
vectorizer = make_pipeline(hasher, TfidfTransformer())
X_hs = vectorizer.fit_transform(X)

X_hs.shape

(2928, 1040000)

# Sampling

In [233]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# 80/20 hold-out split for each representation. The same random_state is used
# for all three calls.
# NOTE(review): the vectorizers above were fitted on the full corpus before
# this split, so test documents influenced the vocabulary/IDF — confirm this
# is acceptable for the comparison.
X_train_bw, X_test_bw, y_train_bw, y_test_bw = train_test_split(
    X_bow, y, test_size=0.2, random_state=42
)
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)
X_train_hs, X_test_hs, y_train_hs, y_test_hs = train_test_split(
    X_hs, y, test_size=0.2, random_state=42
)

In [234]:
# One line per representation: train X, test X, train y, test y shapes.
print(*(part.shape for part in (X_train_bw, X_test_bw, y_train_bw, y_test_bw)))
print(*(part.shape for part in (X_train_tf, X_test_tf, y_train_tf, y_test_tf)))
print(*(part.shape for part in (X_train_hs, X_test_hs, y_train_hs, y_test_hs)))

(2342, 35000) (586, 35000) (2342,) (586,)
(2342, 57000) (586, 57000) (2342,) (586,)
(2342, 1040000) (586, 1040000) (2342,) (586,)


# Naive Bayes Multinomial

In [115]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default hyper-parameters; this single estimator
# object is re-fitted on each feature representation in the cells below.
clf_nb = MultinomialNB()

### BOW

In [136]:
# Naive Bayes on bag-of-words: fit on the training split, predict the test
# split, then print the confusion matrix followed by the per-class report.
y_pred_nb_bw = clf_nb.fit(X_train_bw, y_train_bw).predict(X_test_bw)
print(
    confusion_matrix(y_test_bw, y_pred_nb_bw),
    classification_report(y_test_bw, y_pred_nb_bw),
    sep='\n',
)

[[151  43  11]
 [  4 185  16]
 [  0   7 169]]
             precision    recall  f1-score   support

          0       0.97      0.74      0.84       205
          1       0.79      0.90      0.84       205
          2       0.86      0.96      0.91       176

avg / total       0.88      0.86      0.86       586



### TFIDF

In [196]:
# Naive Bayes on TF-IDF: fit on the training split, predict the test split,
# then print the confusion matrix followed by the per-class report.
y_pred_nb_tfidf = clf_nb.fit(X_train_tf, y_train_tf).predict(X_test_tf)
print(
    confusion_matrix(y_test_tf, y_pred_nb_tfidf),
    classification_report(y_test_tf, y_pred_nb_tfidf),
    sep='\n',
)

[[185  18   2]
 [ 13 181  11]
 [  4   8 164]]
             precision    recall  f1-score   support

          0       0.92      0.90      0.91       205
          1       0.87      0.88      0.88       205
          2       0.93      0.93      0.93       176

avg / total       0.90      0.90      0.90       586



### Hashing

In [235]:
# Naive Bayes on hashed features: fit on the training split, predict the test
# split, then print the confusion matrix followed by the per-class report.
y_pred_nb_hs = clf_nb.fit(X_train_hs, y_train_hs).predict(X_test_hs)
print(
    confusion_matrix(y_test_hs, y_pred_nb_hs),
    classification_report(y_test_hs, y_pred_nb_hs),
    sep='\n',
)

[[186  18   1]
 [ 14 181  10]
 [  5   6 165]]
             precision    recall  f1-score   support

          0       0.91      0.91      0.91       205
          1       0.88      0.88      0.88       205
          2       0.94      0.94      0.94       176

avg / total       0.91      0.91      0.91       586



# Stochastic Gradient Descent

In [214]:
from sklearn.linear_model import SGDClassifier
# Linear model trained with stochastic gradient descent. random_state is fixed
# (same seed as the train/test split) so that re-running the notebook gives
# the same scores; without it the result changes from run to run.
clf_sgd = SGDClassifier(alpha=.0001, max_iter=50, penalty="l2", random_state=42)

### BOW

In [208]:
# SGD on bag-of-words: fit on the training split, predict the test split,
# then print the confusion matrix followed by the per-class report.
y_pred_sgd_bw = clf_sgd.fit(X_train_bw, y_train_bw).predict(X_test_bw)
print(
    confusion_matrix(y_test_bw, y_pred_sgd_bw),
    classification_report(y_test_bw, y_pred_sgd_bw),
    sep='\n',
)

[[180  21   4]
 [ 23 168  14]
 [  8  19 149]]
             precision    recall  f1-score   support

          0       0.85      0.88      0.87       205
          1       0.81      0.82      0.81       205
          2       0.89      0.85      0.87       176

avg / total       0.85      0.85      0.85       586



### TFIDF

In [215]:
# SGD on TF-IDF: fit on the training split, predict the test split, then
# print the confusion matrix followed by the per-class report.
y_pred_sgd_tfidf = clf_sgd.fit(X_train_tf, y_train_tf).predict(X_test_tf)
print(
    confusion_matrix(y_test_tf, y_pred_sgd_tfidf),
    classification_report(y_test_tf, y_pred_sgd_tfidf),
    sep='\n',
)

[[189  13   3]
 [ 11 179  15]
 [  3   8 165]]
             precision    recall  f1-score   support

          0       0.93      0.92      0.93       205
          1       0.90      0.87      0.88       205
          2       0.90      0.94      0.92       176

avg / total       0.91      0.91      0.91       586



### Hashing

In [236]:
# SGD on hashed features: fit on the training split, predict the test split,
# then print the confusion matrix followed by the per-class report.
y_pred_sgd_hs = clf_sgd.fit(X_train_hs, y_train_hs).predict(X_test_hs)
print(
    confusion_matrix(y_test_hs, y_pred_sgd_hs),
    classification_report(y_test_hs, y_pred_sgd_hs),
    sep='\n',
)

[[188  15   2]
 [ 17 173  15]
 [  3   7 166]]
             precision    recall  f1-score   support

          0       0.90      0.92      0.91       205
          1       0.89      0.84      0.87       205
          2       0.91      0.94      0.92       176

avg / total       0.90      0.90      0.90       586



# SVC (SVM) - Linear

In [222]:
from sklearn.svm import LinearSVC
# Linear SVM solved in the dual. The dual solver shuffles the data, so
# random_state is fixed (same seed as the train/test split) to make the
# reported scores reproducible across runs.
clf_svm = LinearSVC(penalty='l2', dual=True, tol=1e-3, random_state=42)

### BOW

In [131]:
# Linear SVM on bag-of-words: fit on the training split, predict the test
# split, then print the confusion matrix followed by the per-class report.
y_pred_svm_bw = clf_svm.fit(X_train_bw, y_train_bw).predict(X_test_bw)
print(
    confusion_matrix(y_test_bw, y_pred_svm_bw),
    classification_report(y_test_bw, y_pred_svm_bw),
    sep='\n',
)

[[184  17   4]
 [ 15 169  21]
 [  8  10 158]]
             precision    recall  f1-score   support

          0       0.89      0.90      0.89       205
          1       0.86      0.82      0.84       205
          2       0.86      0.90      0.88       176

avg / total       0.87      0.87      0.87       586



### TFIDF

In [224]:
# Linear SVM on TF-IDF: fit on the training split, predict the test split,
# then print the confusion matrix followed by the per-class report.
y_pred_svm_tfidf = clf_svm.fit(X_train_tf, y_train_tf).predict(X_test_tf)
print(
    confusion_matrix(y_test_tf, y_pred_svm_tfidf),
    classification_report(y_test_tf, y_pred_svm_tfidf),
    sep='\n',
)

[[189  13   3]
 [ 12 178  15]
 [  3   7 166]]
             precision    recall  f1-score   support

          0       0.93      0.92      0.92       205
          1       0.90      0.87      0.88       205
          2       0.90      0.94      0.92       176

avg / total       0.91      0.91      0.91       586



### Hashing

In [237]:
# Linear SVM on hashed features: fit on the training split, predict the test
# split, then print the confusion matrix followed by the per-class report.
y_pred_svm_hs = clf_svm.fit(X_train_hs, y_train_hs).predict(X_test_hs)
print(
    confusion_matrix(y_test_hs, y_pred_svm_hs),
    classification_report(y_test_hs, y_pred_svm_hs),
    sep='\n',
)

[[189  13   3]
 [ 16 177  12]
 [  2   8 166]]
             precision    recall  f1-score   support

          0       0.91      0.92      0.92       205
          1       0.89      0.86      0.88       205
          2       0.92      0.94      0.93       176

avg / total       0.91      0.91      0.91       586



# Passive-Aggressive

In [226]:
from sklearn.linear_model import PassiveAggressiveClassifier
# Passive-Aggressive classifier. Training shuffles the data, so random_state
# is fixed (same seed as the train/test split) to make the reported scores
# reproducible across runs.
clf_pa = PassiveAggressiveClassifier(max_iter=50, random_state=42)

### BOW

In [227]:
# Passive-Aggressive on bag-of-words: fit on the training split, predict the
# test split, then print the confusion matrix followed by the per-class report.
y_pred_pa_bw = clf_pa.fit(X_train_bw, y_train_bw).predict(X_test_bw)
print(
    confusion_matrix(y_test_bw, y_pred_pa_bw),
    classification_report(y_test_bw, y_pred_pa_bw),
    sep='\n',
)

[[185  18   2]
 [ 16 169  20]
 [  6   8 162]]
             precision    recall  f1-score   support

          0       0.89      0.90      0.90       205
          1       0.87      0.82      0.85       205
          2       0.88      0.92      0.90       176

avg / total       0.88      0.88      0.88       586



### TFIDF

In [228]:
# Passive-Aggressive on TF-IDF: fit on the training split, predict the test
# split, then print the confusion matrix followed by the per-class report.
y_pred_pa_tfidf = clf_pa.fit(X_train_tf, y_train_tf).predict(X_test_tf)
print(
    confusion_matrix(y_test_tf, y_pred_pa_tfidf),
    classification_report(y_test_tf, y_pred_pa_tfidf),
    sep='\n',
)

[[188  14   3]
 [ 11 179  15]
 [  3   7 166]]
             precision    recall  f1-score   support

          0       0.93      0.92      0.92       205
          1       0.90      0.87      0.88       205
          2       0.90      0.94      0.92       176

avg / total       0.91      0.91      0.91       586



### Hashing

In [238]:
# Passive-Aggressive on hashed features: fit on the training split, predict
# the test split, then print the confusion matrix followed by the per-class
# report.
y_pred_pa_hs = clf_pa.fit(X_train_hs, y_train_hs).predict(X_test_hs)
print(
    confusion_matrix(y_test_hs, y_pred_pa_hs),
    classification_report(y_test_hs, y_pred_pa_hs),
    sep='\n',
)

[[190  14   1]
 [ 17 174  14]
 [  4   7 165]]
             precision    recall  f1-score   support

          0       0.90      0.93      0.91       205
          1       0.89      0.85      0.87       205
          2       0.92      0.94      0.93       176

avg / total       0.90      0.90      0.90       586

