In [None]:
# !pip install tiktoken
# !pip install transformers[torch]
# !pip install scikit-learn

# **Importações**

In [68]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tiktoken # type: ignore
import nltk
from nltk.tokenize import word_tokenize
import ast
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# **Criação de arquivo tokenizado**

## **Leitura dos dados**

In [None]:
news = pd.read_csv(r'..\AI\preprocessed\pre-processed.csv')
news = news.drop(columns=['index'])

In [15]:
news.head()

Unnamed: 0,label,preprocessed_news
0,fake,katia abreu diz vai colocar expulsao moldura n...
1,fake,ray peita bolsonaro conservador fake entrevist...
2,fake,reinaldo azevedo desmascarado policia federal ...
3,fake,relatorio assustador bndes mostra dinheiro pub...
4,fake,radialista americano fala sobre pt vendem ilus...


In [16]:
news['class'] = news['label'] == 'true'
news['class'] = news['class'].astype(int)

In [17]:
news.head()

Unnamed: 0,label,preprocessed_news,class
0,fake,katia abreu diz vai colocar expulsao moldura n...,0
1,fake,ray peita bolsonaro conservador fake entrevist...,0
2,fake,reinaldo azevedo desmascarado policia federal ...,0
3,fake,relatorio assustador bndes mostra dinheiro pub...,0
4,fake,radialista americano fala sobre pt vendem ilus...,0


## **Tokenização**

### com tiktoken

In [47]:
enconding = tiktoken.get_encoding('o200k_base')

In [19]:
first_news_token = enconding.encode(news['preprocessed_news'][0])
print(first_news_token)
print(enconding.decode(first_news_token))

[14408, 535, 692, 40962, 17259, 12156, 47242, 106492, 3458, 24575, 2705, 46625, 167645, 6385, 11323, 9015, 535, 692, 40962, 17699, 106492, 3458, 10074, 3687, 21905, 1375, 3458, 11531, 3419, 26044, 190748, 19475, 21530, 3857, 939, 36427, 21527, 11257, 277, 24575, 2705, 131023, 1194, 106492, 3458, 9693, 831, 365, 16505, 81815, 540, 37886, 31954, 4664, 7010, 10495, 40182, 14665, 106492, 12641, 46625, 2096, 347, 30321, 2096, 347, 30321, 58488, 13053, 2453, 17699, 9015, 535, 692, 40962, 118426, 106492, 3458, 13967, 36992, 19208, 10495, 40182, 73374, 58282, 83302, 9015, 535, 61815, 39531, 105384, 9015, 535, 46625, 11821, 18961, 30967, 3165, 84082, 17795, 29297, 24766, 18961, 27391, 1093, 9015, 535, 49726, 285, 5538, 344, 7072, 258, 926, 138568, 15908, 10074, 3687, 322, 116530, 277, 47249, 55968, 9015, 535, 35342, 58282, 1975, 17055, 172879, 6925, 12538, 365, 11564, 28397, 12102, 18961, 46625, 179719, 283, 9015, 535, 692, 40962, 1974, 17331, 4581, 16693, 2506, 292, 1578, 117299, 13812, 809]
k

In [20]:
news['tokens'] = news['preprocessed_news'].apply(lambda x: enconding.encode(x))

In [21]:
news['num_tokens'] = news['tokens'].apply(len)

In [22]:
news.head()

Unnamed: 0,label,preprocessed_news,class,tokens,num_tokens
0,fake,katia abreu diz vai colocar expulsao moldura n...,0,"[14408, 535, 692, 40962, 17259, 12156, 47242, ...",154
1,fake,ray peita bolsonaro conservador fake entrevist...,0,"[25430, 1045, 2580, 7271, 81783, 13953, 3723, ...",210
2,fake,reinaldo azevedo desmascarado policia federal ...,0,"[264, 1028, 2408, 130965, 191981, 731, 4227, 6...",271
3,fake,relatorio assustador bndes mostra dinheiro pub...,0,"[6004, 31554, 180151, 3723, 287, 301, 268, 453...",509
4,fake,radialista americano fala sobre pt vendem ilus...,0,"[19027, 563, 3857, 94277, 60486, 4844, 18961, ...",97


In [25]:
news.to_csv('tokenized.csv', sep='\t', index=False)

# **Processo com o arquivo tokenizado**

In [23]:
news = pd.read_csv(r'..\AI\preprocessed\tokenized.csv', sep='\t')
news['tokens'] = news['tokens'].apply(ast.literal_eval)
news.head()

Unnamed: 0,label,preprocessed_news,class,tokens,num_tokens
0,fake,katia abreu diz vai colocar expulsao moldura n...,0,"[14408, 535, 692, 40962, 17259, 12156, 47242, ...",154
1,fake,ray peita bolsonaro conservador fake entrevist...,0,"[25430, 1045, 2580, 7271, 81783, 13953, 3723, ...",210
2,fake,reinaldo azevedo desmascarado policia federal ...,0,"[264, 1028, 2408, 130965, 191981, 731, 4227, 6...",271
3,fake,relatorio assustador bndes mostra dinheiro pub...,0,"[6004, 31554, 180151, 3723, 287, 301, 268, 453...",509
4,fake,radialista americano fala sobre pt vendem ilus...,0,"[19027, 563, 3857, 94277, 60486, 4844, 18961, ...",97


## **Vetorização**

In [24]:
def slice_tokens(token, max_len):
    if len(token) > max_len:
        token = token[:max_len]
    elif len(token) < max_len:
        token = token + [0]*(max_len-len(token))
    
    return token

### com 128 tokens

In [26]:
max_len = 128
news['limited_tokens'] = news['tokens'].apply(lambda token: slice_tokens(token, max_len))

In [27]:
news['limited_tokens']

0       [14408, 535, 692, 40962, 17259, 12156, 47242, ...
1       [25430, 1045, 2580, 7271, 81783, 13953, 3723, ...
2       [264, 1028, 2408, 130965, 191981, 731, 4227, 6...
3       [6004, 31554, 180151, 3723, 287, 301, 268, 453...
4       [19027, 563, 3857, 94277, 60486, 4844, 18961, ...
                              ...                        
7195    [73, 192284, 26091, 270, 2424, 1375, 3458, 103...
7196    [1133, 259, 17259, 139730, 283, 27291, 33140, ...
7197    [630, 302, 359, 23205, 5086, 6302, 1589, 259, ...
7198    [977, 18786, 31954, 38122, 18977, 893, 3458, 4...
7199    [59835, 1190, 91935, 17259, 59965, 81089, 6768...
Name: limited_tokens, Length: 7200, dtype: object

## **Treinamento**

In [37]:
x = np.array(news['limited_tokens'].tolist())
y = np.array(news['class'])

In [52]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

In [56]:
for data in [x_train, x_test, y_train, y_test]:
    print(data.shape)

(5760, 128)
(1440, 128)
(5760,)
(1440,)


### Regressão Logistica

In [61]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(x_train,y_train)

In [65]:
y_predict = lr.predict(x_test)

In [66]:
lr.score(x_test,y_test)

0.6409722222222223

In [70]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.64      0.59      0.61       701
           1       0.64      0.69      0.66       739

    accuracy                           0.64      1440
   macro avg       0.64      0.64      0.64      1440
weighted avg       0.64      0.64      0.64      1440

