In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV and zip files used by later cells are read from under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!unzip "/content/drive/MyDrive/curso_word2vec/data/cbow_s300.zip"

Archive:  /content/drive/MyDrive/curso_word2vec/data/cbow_s300.zip
replace cbow_s300.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
import pandas as pd

# Read only the title and category columns of the train and test splits (train first, then test).
article_train, article_test = (
    pd.read_csv(csv_path, usecols=['title', 'category'])
    for csv_path in (
        '/content/drive/MyDrive/curso_word2vec/data/treino.csv',
        '/content/drive/MyDrive/curso_word2vec/data/teste.csv',
    )
)

# One shape per line: train, then test.
print(article_train.shape, article_test.shape, sep='\n')

(90000, 2)
(20513, 2)


In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
article_train.info()
article_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     90000 non-null  object
 1   category  90000 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB
None


Unnamed: 0,title,category
0,"Após polêmica, Marine Le Pen diz que abomina n...",mundo
1,"Macron e Le Pen vão ao 2º turno na França, em ...",mundo
2,"Apesar de larga vitória nas legislativas, Macr...",mundo
3,"Governo antecipa balanço, e Alckmin anuncia qu...",cotidiano
4,"Após queda em maio, a atividade econômica sobe...",mercado


In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
article_test.info()
article_test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20513 entries, 0 to 20512
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     20513 non-null  object
 1   category  20513 non-null  object
dtypes: object(2)
memory usage: 320.6+ KB
None


Unnamed: 0,title,category
0,Grandes irmãos,colunas
1,Haddad congela orçamento e suspende emendas de...,colunas
2,Proposta de reforma da Fifa tem a divulgação d...,esporte
3,"Mercado incipiente, internet das coisas conect...",mercado
4,"Mortes: Psicanalista, estudou o autismo em cri...",cotidiano


In [None]:
from gensim.models import KeyedVectors

# Load the CBOW embeddings from the extracted text-format file (binary=False).
# The result is bound to the global `word2vec_model`, which combine_vectors_by_sum reads.
word2vec_model = KeyedVectors.load_word2vec_format('/content/cbow_s300.txt', binary=False)

In [None]:
# Sanity check of the loaded embeddings: list the nearest neighbours of 'china' with their similarity scores.
word2vec_model.most_similar('china')

[('rússia', 0.7320704460144043),
 ('índia', 0.7241617441177368),
 ('tailândia', 0.701935887336731),
 ('indonésia', 0.6860769987106323),
 ('turquia', 0.6741335988044739),
 ('malásia', 0.6665689945220947),
 ('mongólia', 0.6593616008758545),
 ('manchúria', 0.6581847667694092),
 ('urss', 0.6581669449806213),
 ('grã-bretanha', 0.6568098068237305)]

## Vetorização dos títulos dos artigos

In [None]:
from string import punctuation, digits
import numpy as np


def custom_tokenizer(series):
  """Tokenize a Series of texts into arrays of normalized word tokens.

  Each entry is lower-cased, stripped of ASCII punctuation (string.punctuation),
  has every digit replaced by '0', and is then split on whitespace.

  Args:
    series: pandas Series of texts. Missing entries (None/NaN) are treated as
      empty text; non-string entries are converted with str().

  Returns:
    A NumPy object array with one element per input entry; each element is a
    NumPy array holding that entry's tokens (an empty array when the entry
    yields no tokens).
  """
  punctuation_translator = str.maketrans('', '', punctuation)
  digits_translator = str.maketrans(digits, '0' * len(digits))

  # Without this normalization a missing title stays NaN through the whole
  # .str pipeline and ends up as a 0-d array that callers cannot iterate over.
  texts = series.fillna('').astype(str)

  # NOTE(review): '-' is part of string.punctuation, so hyphenated words are
  # glued together (e.g. 'grã-bretanha' -> 'grãbretanha') — confirm this is
  # the intended normalization for the embedding vocabulary.
  tokens = (
      texts.str.lower()
      .str.translate(punctuation_translator)
      .str.translate(digits_translator)
      .str.split()
      .map(np.array)
  )

  return tokens.to_numpy()


def combine_vectors_by_sum(__input, model=None, unknown_token='unknown'):
  """Sum the embedding vectors of a sequence of tokens.

  Args:
    __input: iterable of word tokens.
    model: object exposing `get_vector(word)` and `vector_size` (e.g. a gensim
      KeyedVectors). Defaults to the module-level `word2vec_model`.
    unknown_token: vocabulary entry whose vector stands in for every token
      that is not in the model's vocabulary.

  Returns:
    A 1-D float array of length `model.vector_size`; all zeros when `__input`
    is empty.

  Raises:
    KeyError: if a token is out of vocabulary and `unknown_token` is not in
      the vocabulary either.
  """
  if model is None:
    # Resolved at call time, so rebinding the global swaps the embeddings used.
    model = word2vec_model

  # Size the accumulator from the model instead of assuming 300 dimensions.
  result_vector = np.zeros(model.vector_size)

  for word in __input:
    try:
      result_vector += model.get_vector(word)
    except KeyError:
      # Out-of-vocabulary token: fall back to the placeholder token's vector.
      result_vector += model.get_vector(unknown_token)

  return result_vector

In [None]:
# Small hand-written sample used to eyeball what the tokenizer produces.
phrases_test = pd.Series([
    'Olá, Meu nome é Lucaszz 12',
    'Essa é mais uma frase interessante 1230',
    'Olha! um outro array 2',
])
print(custom_tokenizer(phrases_test))

[array(['olá', 'meu', 'nome', 'é', 'lucaszz', '00'], dtype='<U7')
 array(['essa', 'é', 'mais', 'uma', 'frase', 'interessante', '0000'],
       dtype='<U12')
 array(['olha', 'um', 'outro', 'array', '0'], dtype='<U5')]


In [None]:
# One summed embedding per sample phrase, stacked into a single array.
result = np.array([
    combine_vectors_by_sum(tokens) for tokens in custom_tokenizer(phrases_test)
])
result.shape

(3, 300)

In [None]:
def get_vectorized_data(series):
  """Tokenize every entry of `series` and return an array with one summed embedding per entry."""
  token_arrays = custom_tokenizer(series)
  sentence_vectors = [combine_vectors_by_sum(tokens) for tokens in token_arrays]
  return np.array(sentence_vectors)

In [None]:
# Features: one summed embedding per title (train split first, then test split).
X_train, X_test = (
    get_vectorized_data(frame.title) for frame in (article_train, article_test)
)
# Targets: the category column of each split.
Y_train, Y_test = article_train.category, article_test.category

# One shape per line, in the order X_train, X_test, Y_train, Y_test.
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, sep='\n')

(90000, 300)
(20513, 300)
(90000,)
(20513,)


In [None]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained;
# the trailing bare name keeps the fitted estimator as the cell's displayed value.
logistic_regression = LogisticRegression(max_iter=200).fit(X_train, Y_train)
logistic_regression

LogisticRegression(max_iter=200)

In [None]:
# Score the fitted classifier on the test split (for classifiers, score() is the mean accuracy).
logistic_regression.score(X_test, Y_test)

0.7976405206454443

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / f1 of the logistic regression on the test split.
predicted = logistic_regression.predict(X_test)
CR = classification_report(Y_test, predicted)
print(CR)

              precision    recall  f1-score   support

     colunas       0.86      0.72      0.78      6103
   cotidiano       0.61      0.79      0.69      1698
     esporte       0.92      0.89      0.90      4663
   ilustrada       0.14      0.89      0.24       131
     mercado       0.84      0.79      0.81      5867
       mundo       0.73      0.86      0.79      2051

    accuracy                           0.80     20513
   macro avg       0.68      0.82      0.70     20513
weighted avg       0.83      0.80      0.81     20513



In [None]:
from sklearn.dummy import DummyClassifier

# Baseline that ignores the features. The 'stratified' strategy draws random predictions,
# so a fixed random_state is needed for the baseline report to be reproducible across runs.
dummy_classifier = DummyClassifier(strategy='stratified', random_state=42)
dummy_classifier.fit(X_train, Y_train)

DummyClassifier(strategy='stratified')

In [None]:
# Per-class report of the dummy baseline on the test split, for comparison with the trained classifier.
# Note: this rebinds the globals `predicted` and `CR`.
predicted = dummy_classifier.predict(X_test)
CR = classification_report(Y_test, predicted)
print(CR)

              precision    recall  f1-score   support

     colunas       0.31      0.17      0.22      6103
   cotidiano       0.07      0.14      0.09      1698
     esporte       0.22      0.17      0.19      4663
   ilustrada       0.01      0.17      0.01       131
     mercado       0.29      0.17      0.21      5867
       mundo       0.10      0.16      0.12      2051

    accuracy                           0.17     20513
   macro avg       0.17      0.16      0.14     20513
weighted avg       0.24      0.17      0.19     20513



In [None]:
!unzip "/content/drive/MyDrive/curso_word2vec/data/skip_s300.zip"

Archive:  /content/drive/MyDrive/curso_word2vec/data/skip_s300.zip
  inflating: skip_s300.txt           


In [None]:
# Load the embeddings from skip_s300.txt (text format, binary=False).
# This REBINDS the global `word2vec_model`: combine_vectors_by_sum reads that global, so every
# vectorization run after this cell uses these embeddings instead of the previously loaded ones.
word2vec_model = KeyedVectors.load_word2vec_format('/content/skip_s300.txt', binary=False)

In [None]:
# Recompute the features with whichever embeddings `word2vec_model` currently points to
# (train split first, then test split).
X_train, X_test = (
    get_vectorized_data(frame.title) for frame in (article_train, article_test)
)
# Targets: the category column of each split.
Y_train, Y_test = article_train.category, article_test.category

In [None]:
# Train a fresh logistic regression on the current X_train / Y_train and print its per-class test report.
# NOTE(review): max_iter is 300 here but 200 in the earlier logistic regression cell — confirm the
# difference is intentional, since it affects how comparable the two runs are.
logistic_regression = LogisticRegression(max_iter=300)
logistic_regression.fit(X_train, Y_train)

predicted = logistic_regression.predict(X_test)
CR = classification_report(Y_test, predicted)
print(CR)

              precision    recall  f1-score   support

     colunas       0.86      0.72      0.78      6103
   cotidiano       0.62      0.80      0.70      1698
     esporte       0.93      0.90      0.91      4663
   ilustrada       0.15      0.92      0.26       131
     mercado       0.85      0.81      0.83      5867
       mundo       0.76      0.86      0.80      2051

    accuracy                           0.81     20513
   macro avg       0.70      0.84      0.72     20513
weighted avg       0.84      0.81      0.82     20513



In [None]:
# Baseline that ignores the features. The 'stratified' strategy draws random predictions,
# so a fixed random_state is needed for the baseline report to be reproducible across runs.
dummy_classifier = DummyClassifier(strategy='stratified', random_state=42)
dummy_classifier.fit(X_train, Y_train)

predicted = dummy_classifier.predict(X_test)
CR = classification_report(Y_test, predicted)
print(CR)

              precision    recall  f1-score   support

     colunas       0.29      0.16      0.21      6103
   cotidiano       0.09      0.17      0.12      1698
     esporte       0.22      0.17      0.19      4663
   ilustrada       0.01      0.15      0.01       131
     mercado       0.28      0.17      0.21      5867
       mundo       0.09      0.16      0.12      2051

    accuracy                           0.17     20513
   macro avg       0.16      0.16      0.14     20513
weighted avg       0.24      0.17      0.19     20513

