In [1]:
# coding: utf-8

import pandas as pd

from files.base import Base
from classifier.ClassifierService import ClassifierService

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on (NLTK reports when a
# package is already up to date).
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Portuguese stop words, kept as a set.
portuguese_stopwords = stopwords.words('portuguese')
stopWords = set(portuguese_stopwords)

# Project classifier service; the evaluation cell calls its accuracy() and
# confusionMatrix() methods.
classifierService = ClassifierService()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\josue\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\josue\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\josue\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [2]:
# Column labels for the semicolon-separated articles file. Because `names`
# is passed explicitly, pandas uses these labels rather than a header row.
base_columns = [
    'article',
    'type',
    'positive',
    'negative',
    'neutral',
    'link',
    'origin',
    'id',
    'length',
    'count_neutral_words',
]

df = pd.read_csv(Base.BASE, sep=";", names=base_columns)

def count_articles(frame, origin=None, article_type=None):
    """Count the non-null ``article`` values in ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with an ``article`` column, plus ``origin`` / ``type`` columns
        when the corresponding filter is used.
    origin : str, optional
        When given, only rows whose ``origin`` equals this value are counted.
    article_type : str, optional
        When given, only rows whose ``type`` equals this value are counted.

    Returns
    -------
    The result of ``Series.count()`` on the filtered ``article`` column.
    """
    subset = frame
    if origin is not None:
        subset = subset[subset.origin == origin]
    if article_type is not None:
        subset = subset[subset.type == article_type]
    return subset.article.count()


def percent_of(part, whole):
    """Return ``part`` as a percentage of ``whole``.

    Returns 0.0 when ``whole`` is zero, so a source with no articles does not
    break the summary with a division by zero.
    """
    return part / whole * 100 if whole else 0.0


def count_with_percent(count, percent):
    """Format a count and its rounded percentage as ``'N ( P% )'``."""
    return str(count) + " ( " + str(round(percent)) + "% )"


# Article totals: whole base, then one per news source.
total = count_articles(df)
totalDN = count_articles(df, origin="Diário do Nordeste")
totalG1 = count_articles(df, origin="G1 Ceará")
totalOP = count_articles(df, origin="O Povo")

# Articles labelled "impartial", overall and per source, with their share
# of the matching total.
impartial = count_articles(df, article_type="impartial")
pImpartial = percent_of(impartial, total)

impartialDN = count_articles(df, origin="Diário do Nordeste", article_type="impartial")
pImpartialDN = percent_of(impartialDN, totalDN)

impartialG1 = count_articles(df, origin="G1 Ceará", article_type="impartial")
pImpartialG1 = percent_of(impartialG1, totalG1)

impartialOP = count_articles(df, origin="O Povo", article_type="impartial")
pImpartialOP = percent_of(impartialOP, totalOP)

# Articles labelled "partial", overall and per source, with their share
# of the matching total.
partial = count_articles(df, article_type="partial")
pPartial = percent_of(partial, total)

partialDN = count_articles(df, origin="Diário do Nordeste", article_type="partial")
pPartialDN = percent_of(partialDN, totalDN)

partialG1 = count_articles(df, origin="G1 Ceará", article_type="partial")
pPartialG1 = percent_of(partialG1, totalG1)

partialOP = count_articles(df, origin="O Povo", article_type="partial")
pPartialOP = percent_of(partialOP, totalOP)

# Summary table: one row per source (plus the overall row), one column per
# statistic. The impartial/partial cells are pre-formatted strings.
analytics = pd.DataFrame({
    'total' : {
        'Total': total,
        'Diário do Nordeste': totalDN,
        'G1 Ceará': totalG1,
        'O Povo': totalOP
    },
    'impartial' : {
        'Total': count_with_percent(impartial, pImpartial),
        'Diário do Nordeste': count_with_percent(impartialDN, pImpartialDN),
        'G1 Ceará': count_with_percent(impartialG1, pImpartialG1),
        'O Povo': count_with_percent(impartialOP, pImpartialOP),
    },
    'partial' : {
        'Total': count_with_percent(partial, pPartial),
        'Diário do Nordeste': count_with_percent(partialDN, pPartialDN),
        'G1 Ceará': count_with_percent(partialG1, pPartialG1),
        'O Povo': count_with_percent(partialOP, pPartialOP),
    },
})

analytics

Unnamed: 0,total,impartial,partial
Total,12611,5357 ( 42% ),7254 ( 58% )
Diário do Nordeste,2374,749 ( 32% ),1625 ( 68% )
G1 Ceará,2309,773 ( 33% ),1536 ( 67% )
O Povo,7928,3835 ( 48% ),4093 ( 52% )


In [6]:
# The training files are read with two column labels; the ground-truth file
# is read with the same two plus three more.
training_columns = ['article', 'type']

df_trainig = pd.read_csv(Base.TRAINING, sep=";", names=training_columns)
df_trainig_nb = pd.read_csv(Base.TRAINING_NAIVE_BAYES, sep=";", names=training_columns)
df_ground_truth = pd.read_csv(
    Base.BASE_GROUND_TRUTH,
    sep=";",
    names=training_columns + ['id', 'length', 'count_neutral_words'],
)

# Row label -> frame, in the order the rows appear in the summary table.
bases = {
    'Total': df,
    'Base Training': df_trainig,
    'Base Training Naive Bayes': df_trainig_nb,
    'Base Ground Truth': df_ground_truth,
}

# Per base: number of non-null articles overall and for each label.
analytics_base = pd.DataFrame({
    'total' : {
        label: frame.article.count()
        for label, frame in bases.items()
    },
    'impartial' : {
        label: frame[frame.type == "impartial"].article.count()
        for label, frame in bases.items()
    },
    'partial' : {
        label: frame[frame.type == "partial"].article.count()
        for label, frame in bases.items()
    },
})

analytics_base

Unnamed: 0,total,impartial,partial
Total,12611,5357,7254
Base Training,12471,5283,7188
Base Training Naive Bayes,12471,6879,5592
Base Ground Truth,140,100,40


In [5]:
# Accuracy reported by the classifier service with groundTruth=True,
# scaled to a percentage (presumably measured against the ground-truth
# base - confirm in ClassifierService).
accuracy = classifierService.accuracy(groundTruth = True) * 100
accuracy_text = str(round(accuracy, 2)) + "%"
print("Acurácia do Modelo Gerado: " + accuracy_text)

# The heading is printed before the matrix is requested, so the output
# order stays the same even if confusionMatrix() itself writes to stdout.
print("\nMatriz de Confusão")
confusion_matrix = classifierService.confusionMatrix(groundTruth = True)
print( confusion_matrix )

Acurácia do Modelo Gerado: 62.86%

Matriz de Confusão
          |  i    |
          |  m    |
          |  p  p |
          |  a  a |
          |  r  r |
          |  t  t |
          |  i  i |
          |  a  a |
          |  l  l |
----------+-------+
impartial |<58>42 |
  partial | 10<30>|
----------+-------+
(row = reference; col = test)

