In [163]:
import nltk
import requests
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

%matplotlib inline

from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally before it is read
# through `stopwords.words(...)` further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luizreis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading Data

In [61]:
# Labelled training set. Later cells read its "urls" column (pages to fetch)
# and its "target" column (class labels).
csv = pd.read_csv("conjunto-treinamento-classificador.csv")

## Feature Extraction

In [65]:
def get_html_text(url, timeout=10):
    """Download a web page and return its text content with the markup removed.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a single unresponsive host would block the
        whole feature-extraction pass indefinitely.

    Returns
    -------
    str
        The text of the parsed document as produced by
        ``BeautifulSoup.get_text()``.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the timeout expires.

    Notes
    -----
    The HTTP status code is not checked: the body of an error response
    is parsed like any other page.
    """
    page = requests.get(url, timeout=timeout)
    # Name the parser explicitly so the extracted text does not depend on
    # which optional parser libraries happen to be installed, and so that
    # BeautifulSoup does not warn about guessing one.
    soup = BeautifulSoup(page.content, "html.parser")
    return soup.get_text()

In [172]:
# English stop words, held in a set so the membership test below is cheap.
en_stopwords = set(stopwords.words("english"))

def extract_features(url):
    """Fetch ``url`` and return its lowercase word tokens without stop words.

    The page text is obtained through ``get_html_text``. Every character
    that is not an ASCII letter is turned into a space, the result is
    lowercased and split on whitespace, and tokens present in
    ``en_stopwords`` are dropped.

    Returns a list of strings, in page order, duplicates included.
    """
    raw_text = get_html_text(url)

    # Digits, punctuation and non-ASCII letters all act as token separators.
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
    tokens = letters_only.lower().split()
    return [token for token in tokens if token not in en_stopwords]

In [173]:
# One space-joined string of cleaned tokens per URL. This fetches every page
# listed in the "urls" column, so the cell is network-bound.
bag = csv["urls"].apply(lambda url: ' '.join(extract_features(url)))

## Preprocessing

In [128]:
# Word-level count vectorizer with the vocabulary size capped through
# max_features; the remaining options are passed explicitly as None.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# Learn the vocabulary from the cleaned documents and materialise the
# document-term counts as a dense array.
bag_of_words = vectorizer.fit_transform(bag).toarray()
bag_of_words

array([[ 6,  0,  0, ...,  1,  0,  0],
       [12,  0,  0, ...,  0,  0,  0],
       [ 8,  0,  0, ...,  0,  0,  0],
       ...,
       [ 5,  3,  1, ...,  0,  0,  0],
       [ 3,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [144]:
# Fit the TF-IDF weighting on the raw counts ...
tf_transformer = TfidfTransformer(use_idf=True)
tf_transformer.fit(bag_of_words)

# ... then re-express those same counts as a dense weighted matrix.
bag_of_words_tf = tf_transformer.transform(bag_of_words).toarray()
bag_of_words_tf

array([[0.02907851, 0.        , 0.        , ..., 0.00815886, 0.        ,
        0.        ],
       [0.06108942, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.04080557, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.01894922, 0.015159  , 0.00659799, ..., 0.        , 0.        ,
        0.        ],
       [0.00599405, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [145]:
# Multinomial Naive Bayes classifier, constructed with its default settings.
clf = MultinomialNB()

In [151]:
# Train on the TF-IDF matrix, using the "target" column as class labels.
model = clf.fit(bag_of_words_tf, csv["target"].tolist())

In [152]:
# Predict on the very same matrix the classifier was fitted on. There is no
# held-out split, so any error figure derived from y_pred is a training error.
y_pred = model.predict(bag_of_words_tf)

In [153]:
# Report how many training rows received a label different from their true
# one. The labels are converted to an array first so the `!=` is an explicit
# element-wise comparison against the predictions.
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (
        len(bag_of_words_tf),
        (np.asarray(csv["target"].tolist()) != y_pred).sum(),
    )
)

Number of mislabeled points out of a total 160 points : 50


## Pipeline

In [178]:
# The count -> TF-IDF -> Naive Bayes chain bundled into a single estimator
# that takes the cleaned text directly.
text_clf = Pipeline(steps=[
    (
        "vect",
        CountVectorizer(
            analyzer="word",
            tokenizer=None,
            preprocessor=None,
            stop_words=None,
            max_features=5000,
        ),
    ),
    ("tfidf", TfidfTransformer(use_idf=True)),
    ("clf", MultinomialNB()),
])

In [179]:
# Fit the whole pipeline on the cleaned documents and their "target" labels.
# As the last expression of the cell, the fitted pipeline's repr is displayed.
text_clf.fit(bag, csv["target"].tolist())

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=5000, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [180]:
# Predict on the same documents the pipeline was fitted on (no held-out data).
predicted = text_clf.predict(bag)

In [181]:
# Fraction of documents whose predicted label equals the true label. The
# predictions come from the rows the pipeline was trained on, so this is
# training accuracy, not an estimate of performance on unseen pages.
# NOTE(review): the step-by-step cells reported 50 of 160 mislabeled (0.6875
# accuracy) while this cell shows 0.68125, and the execution counts are out of
# order; the two figures may come from different versions of `bag`. Confirm
# with a Restart & Run All.
np.mean(predicted == csv["target"].tolist())   

0.68125