In [1]:
# coding: utf-8

import matplotlib.pyplot as plt
import pandas as pd

from files.base import Base
from classifier.ClassifierService import ClassifierService

# Single ClassifierService instance reused by the evaluation cells further down,
# which call its accuracy() and confusionMatrix() methods.
# NOTE(review): the nltk "rslp" download log printed by this cell presumably
# comes from importing or constructing ClassifierService — confirm in
# classifier/ClassifierService.
classifierService = ClassifierService()

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\josue\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [2]:
# Column labels for the semicolon-separated base file. Supplying them through
# `names` (with no `header` argument) makes pandas read every line as data.
BASE_COLUMNS = ['article', 'type', 'positive', 'negative', 'neutral', 'link', 'origin', 'id']

df = pd.read_csv(Base.BASE, names=BASE_COLUMNS, sep=";")
df.head(2)

Unnamed: 0,article,type,positive,negative,neutral,link,origin,id
0,STF derruba decisão que flexibiliza 'A voz do ...,impartial,2,-2,-1,https://www.opovo.com.br/noticias/politica/201...,O Povo,1
1,Hage rebate críticas à divulgação de salários ...,partial,2,-3,-1,https://www.opovo.com.br/noticias/politica/201...,O Povo,2


In [3]:
# News outlets (values of the `origin` column) and labels (values of the `type`
# column) that the summary table breaks the corpus down by.
ORIGINS = ("Diário do Nordeste", "G1 Ceará", "O Povo")
LABELS = ("impartial", "partial")


def _count_with_share(count, total):
    """Format ``count`` together with its rounded percentage of ``total``.

    Args:
        count: Number of articles in the subgroup.
        total: Number of articles in the enclosing group.

    Returns:
        A string such as ``"612 ( 29% )"``. When ``total`` is zero the share is
        reported as ``0%`` instead of dividing by zero.
    """
    share = count / total * 100 if total else 0
    return str(count) + " ( " + str(round(share)) + "% )"


# Layout is {column -> {row label -> value}}. The 'Total' row holds plain counts
# over the whole corpus; the per-outlet rows are filled in by the loop below.
# Insertion order of the dict keys is the display order of columns and rows.
summary = {'total': {'Total': df.article.count()}}
for label in LABELS:
    summary[label] = {'Total': df[df.type == label].article.count()}

for origin in ORIGINS:
    origin_articles = df[df.origin == origin]
    origin_total = origin_articles.article.count()
    summary['total'][origin] = origin_total
    for label in LABELS:
        # Count of this outlet's articles carrying the label, shown with its
        # share of the outlet's own total (not of the whole corpus).
        label_count = origin_articles[origin_articles.type == label].article.count()
        summary[label][origin] = _count_with_share(label_count, origin_total)

analytics = pd.DataFrame(summary)

analytics

Unnamed: 0,total,impartial,partial
Total,11616,4609,7007
Diário do Nordeste,2110,612 ( 29% ),1498 ( 71% )
G1 Ceará,2030,597 ( 29% ),1433 ( 71% )
O Povo,7476,3400 ( 45% ),4076 ( 55% )


In [4]:
# Load the three derived bases; each is semicolon-separated and gets its column
# labels from `names`.
df_training = pd.read_csv(Base.TRAINING, sep=";", names=['article', 'type'])
df_trainig = df_training  # misspelled legacy name, kept as an alias for existing references
df_test = pd.read_csv(Base.TEST, sep=";", names=['article', 'type'])
df_ground_truth = pd.read_csv(Base.BASE_GROUND_TRUTH, sep=";", names=['article', 'type', 'id'])


def _label_counts(frame):
    """Count the articles in ``frame``: all of them, and per ``type`` label.

    Args:
        frame: DataFrame with at least ``article`` and ``type`` columns.

    Returns:
        dict with keys ``'total'``, ``'impartial'`` and ``'partial'``, each the
        number of non-null ``article`` values in the matching rows.
    """
    return {
        'total': frame.article.count(),
        'impartial': frame[frame.type == "impartial"].article.count(),
        'partial': frame[frame.type == "partial"].article.count(),
    }


# Row label -> counts, listed in display order.
counts_by_base = {
    'Total': _label_counts(df),
    'Base Training': _label_counts(df_training),
    'Base Test': _label_counts(df_test),
    'Base Ground Truth': _label_counts(df_ground_truth),
}

# pd.DataFrame reads a dict of dicts as {column -> {row -> value}}, so pivot the
# per-base dicts into that shape.
analytics_base = pd.DataFrame({
    column: {base: counts[column] for base, counts in counts_by_base.items()}
    for column in ('total', 'impartial', 'partial')
})

analytics_base

Unnamed: 0,total,impartial,partial
Total,11616,4609,7007
Base Training,3475,1378,2097
Base Test,8111,3217,4894
Base Ground Truth,30,18,12


In [5]:
# Classifier accuracy scaled to a percentage and shown with two decimals.
# NOTE(review): which base accuracy() scores against by default is not visible
# here — confirm in ClassifierService.
accuracy = classifierService.accuracy() * 100
accuracy_text = f"{round(accuracy, 2)!s}%"
print("Acurácia do Modelo Gerado: " + accuracy_text)

Acurácia do Modelo Gerado: 67.88%


In [6]:
# Print the heading first, then build and print the classifier's confusion matrix.
print("\nMatriz de Confusão")
confusion_matrix = classifierService.confusionMatrix()
print(confusion_matrix)


Matriz de Confusão
          |    i      |
          |    m      |
          |    p    p |
          |    a    a |
          |    r    r |
          |    t    t |
          |    i    i |
          |    a    a |
          |    l    l |
----------+-----------+
impartial |<2444> 727 |
  partial | 1838<2976>|
----------+-----------+
(row = reference; col = test)



In [6]:
# Same accuracy / confusion-matrix report, requested with groundTruth=True.
# NOTE(review): presumably this scores the model against the ground-truth base
# instead of the default one — confirm in ClassifierService.
accuracy = classifierService.accuracy(groundTruth=True) * 100
accuracy_text = f"{round(accuracy, 2)!s}%"
print("Acurácia do Modelo Gerado: " + accuracy_text)

print("\nMatriz de Confusão")
confusion_matrix = classifierService.confusionMatrix(groundTruth=True)
print(confusion_matrix)

Acurácia do Modelo Gerado: 53.33%

Matriz de Confusão
          |  i    |
          |  m    |
          |  p  p |
          |  a  a |
          |  r  r |
          |  t  t |
          |  i  i |
          |  a  a |
          |  l  l |
----------+-------+
impartial | <6>12 |
  partial |  2<10>|
----------+-------+
(row = reference; col = test)

