<a href="https://colab.research.google.com/github/JohnBortotti/Bayes-Sklearn-test/blob/main/Bayes_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import spacy

import string
import math
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# NLP resources setup: NLTK corpora, Portuguese stopword list and the spaCy
# Portuguese pipeline used for tokenization/lemmatization below.
import subprocess
import sys

nltk.download('stopwords')
nltk.download('punkt')
pt_stopwords = stopwords.words('portuguese')

# Download the spaCy model only when it is not installed yet. The command is
# run with the current interpreter (sys.executable) and without a shell, and
# check=True surfaces a failed download here instead of as a confusing
# spacy.load() error on the next statement.
if not spacy.util.is_package('pt_core_news_lg'):
  subprocess.run(
      [sys.executable, '-m', 'spacy', 'download', 'pt_core_news_lg'],
      check=True,
  )

# Every listed component is switched off when loading; the lemmatizer is not
# in the list, so it stays enabled (token.lemma_ is what the notebook reads).
spacy_nlp_model = spacy.load('pt_core_news_lg', disable=[
    "tagger", "parser", "ner", "textcat", "entity_linker",
    "attribute_ruler", "entity_ruler", "morphologizer",
    "senter", "tok2vec", "transformer"
])

In [None]:
# Mount Google Drive at /content/drive (Colab-only; google.colab is not
# available outside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the raw dataset from the mounted Drive folder.
# Adjust the path and the field separator here when switching datasets.
csv = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/kaggle_databases/tweet_emotions.csv',
    sep=';',
)

In [None]:
# Dataset configuration: name of the text column and of the label column.
csv_comment_column = 'content'
csv_label_column = 'sentiment'

# Keep only the two columns the pipeline uses, text first and label second.
csv = csv.loc[:, [csv_comment_column, csv_label_column]]

In [None]:
# text processing (tokenization, lemmatization, stopword/punctuation removal)
#
# Each comment is lower-cased and run through the spaCy pipeline. Lemmas that
# are stopwords or punctuation are dropped, and the remaining lemmas are
# joined with single spaces. The separator matters: without it every document
# collapses into one long token and CountVectorizer cannot recover the words.
# The result is a list of (cleaned_text, label) tuples; rows whose cleaned
# text is empty are skipped.
train_dataset = []

bad_words = set(pt_stopwords + list(string.punctuation))

for _, row in csv.iterrows():
  comment = row[csv_comment_column]
  # Missing cells are read by pandas as NaN (a float), which has no .lower().
  if not isinstance(comment, str):
    continue
  row_tokens = spacy_nlp_model(comment.lower())
  kept_lemmas = [
      token.lemma_
      for token in row_tokens
      # Whitespace-only lemmas carry no information once words are
      # space-separated, so they are filtered along with the bad words.
      if token.lemma_.strip() and token.lemma_ not in bad_words
  ]
  treated_string = " ".join(kept_lemmas)
  if treated_string != "":
    train_dataset.append((treated_string, row[csv_label_column]))

In [None]:
# Build a two-column frame (0 = cleaned text, 1 = label) from the processed
# tuples and shuffle its rows; the index is rebuilt so it runs 0..n-1 again.
train_dataframe = (
    pd.DataFrame(train_dataset)
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# Keep at most the first 20000 (already shuffled) rows to avoid exhausting
# RAM on Google Colab.
train_dataframe = train_dataframe.iloc[:20000]

In [None]:
# slice dataframe (train and validation): the first 70% of the rows are used
# for training and the remaining 30% for validation.
total_len = len(train_dataframe)
train_len = math.floor(total_len * 7 / 10)

# Both slices share the same boundary, so every row ends up in exactly one of
# the two sets (starting validation at train_len + 1 would drop one row).
validation_dataframe = train_dataframe[train_len:]
train_dataframe = train_dataframe[:train_len]

In [None]:
# labels encoding
label_encoder = preprocessing.LabelEncoder()
train_labels=label_encoder.fit_transform(train_dataframe[1])

# input encoding
vectorizer = CountVectorizer()
train_inputs = vectorizer.fit_transform(train_dataframe[0])

In [None]:
# training step
# The count matrix is passed as-is: MultinomialNB.fit accepts sparse input,
# and converting it to a dense array first would allocate
# n_documents x vocabulary_size cells for no benefit.
model = MultinomialNB()
model.fit(train_inputs, train_labels)

In [None]:
# validation
# Encode the validation texts with the vocabulary learned on the training set
# and predict their labels. The matrices are kept sparse; row counts are read
# from .shape[0] because len() is not defined for sparse matrices.
validation_encoded = vectorizer.transform(validation_dataframe[0])
validation_predict = model.predict(validation_encoded)
validation_labels = label_encoder.transform(validation_dataframe[1])

print('Accuracy:', accuracy_score(validation_labels, validation_predict))
# This line reports the size of the training matrix, so it is labelled as such.
print('Train dataset:', train_inputs.shape[0])
print('Validation dataset:', validation_encoded.shape[0])

In [None]:
# report
# Per-class precision/recall/F1 on the validation set. `labels` lists every
# encoded class id explicitly so it always lines up with `target_names`, even
# when some class does not occur in the validation split or the predictions.
print(classification_report(
    label_encoder.transform(validation_dataframe[1]),
    validation_predict,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))