In [1]:
# Instalando os pacotes necessários
from google.colab import drive
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Conectando ao Google Drive e efetuando a leitura do Dataset
drive.mount('/content/drive')

dataframe = pd.read_excel("/content/drive/MyDrive/M8/ModeloSprint2/LATAM-Data.xlsx")
dataframe.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0.1,Unnamed: 0,Supplier Name,Normalized Supplier Name,Parent Supplier Name,Region,Country Name,Strategic Region,Requestor Name,Preparer Name,Level 1,...,GL Desc (Level 6),Invoice ID,Invoice Number,Invoice Source,GL Description,Product,Project,"Month, Day, Year of Payment Date",PO Number,Amount USD
0,,20 TABELIAO DE NOTAS DA CAPITAL,20 TABELIAO DE NOTAS DA CAPITAL,20 TABELIAO DE NOTAS DA CAPITAL,LATAM,Brazil,LATAM,Daniela Fechio,Cindy Eurie,Uncategorized,...,Operating Expenses w/o Allocations,300002608576539,504851,LETTERBOX,Postage and courier,Default Product,31505 - Sao Paulo Birmann 32,2023-02-10,70000600000.0,6
1,,20 TABELIAO DE NOTAS DA CAPITAL,20 TABELIAO DE NOTAS DA CAPITAL,20 TABELIAO DE NOTAS DA CAPITAL,LATAM,Brazil,LATAM,Daniela Fechio,Cindy Eurie,Uncategorized,...,Operating Expenses w/o Allocations,300002647480228,505438,LETTERBOX,Postage and courier,Default Product,31505 - Sao Paulo Birmann 32,2023-03-08,70000600000.0,2
2,,20 TABELIAO DE NOTAS DA CAPITAL,20 TABELIAO DE NOTAS DA CAPITAL,20 TABELIAO DE NOTAS DA CAPITAL,LATAM,Brazil,LATAM,Daniela Fechio,Cindy Eurie,Uncategorized,...,Operating Expenses w/o Allocations,300002705372803,505806,LETTERBOX,Postage and courier,Default Product,31505 - Sao Paulo Birmann 32,2023-04-12,70000600000.0,6
3,,20 TABELIAO DE NOTAS DA CAPITAL,20 TABELIAO DE NOTAS DA CAPITAL,20 TABELIAO DE NOTAS DA CAPITAL,LATAM,Brazil,LATAM,Daniela Fechio,Cindy Eurie,Uncategorized,...,Operating Expenses w/o Allocations,300002712153834,506089,LETTERBOX,Postage and courier,Default Product,31505 - Sao Paulo Birmann 32,2023-04-14,70000600000.0,8
4,,20 TABELIAO DE NOTAS DA CAPITAL,20 TABELIAO DE NOTAS DA CAPITAL,20 TABELIAO DE NOTAS DA CAPITAL,LATAM,Brazil,LATAM,Daniela Fechio,Cindy Eurie,Uncategorized,...,Operating Expenses w/o Allocations,300002746687642,506689,LETTERBOX,Postage and courier,Default Product,31505 - Sao Paulo Birmann 32,2023-05-10,70000600000.0,6


In [5]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Definindo método de pré-processamento de texto
def preprocess_text(text):
    tokens = word_tokenize(text)  # Tokenização
    tokens = [word for word in tokens if word.isalnum()]  # Removendo caracteres não alfanuméricos
    tokens = [word.lower() for word in tokens]  # Convertendo para minúsculas
    # print("Tokenização: ", tokens)

    tokens = [word for word in tokens if word not in stop_words]  # Removendo Stopwords
    # print("Remoção das Stopwords: ", tokens)

    stemmed_tokens = [stemmer.stem(word) for word in tokens]  # Aplicando stemming
    # print("Aplicação do Stemming: ", stemmed_tokens)

    return ' '.join(stemmed_tokens)

# Aplicando o método na coluna de interesse
dataframe['GL Description'] = dataframe['GL Description'].apply(preprocess_text)

# Dividindo os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(dataframe['GL Description'], dataframe['Level 1'], test_size=0.2, random_state=42)

# Vetorizando o texto ponderando pela frequência de ocorrência
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [6]:
# Treinando modelo Multinomial Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Prevendo os valores do conjunto de teste
y_pred = nb_classifier.predict(X_test_tfidf)

# Avaliando o modelo
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Acurácia do modelo: {accuracy}')
print(f'Relatório de classificação:\n{report}')

Acurácia do modelo: 0.8277851753217932
Relatório de classificação:
                                                       precision    recall  f1-score   support

                                   Energy & Utilities       0.00      0.00      0.00         5
                                      Human Resources       0.68      0.62      0.65        82
                                            Logistics       0.45      0.88      0.60        16
                                        Manufacturing       0.00      0.00      0.00        14
                                Professional Services       0.95      0.92      0.94      1093
R&D Equipment (incl. Equipment Services and Supplies)       0.00      0.00      0.00         9
                             Real Estate & Facilities       0.69      0.89      0.78       379
                            Sales, Marketing & Events       0.50      0.54      0.52        80
                                   Technology/Telecom       0.83      0.91   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Testando o modelo com um novo input
new_description = ["Accrued Fees Ads"]
new_description = [preprocess_text(new_description[0])]
new_description_tfidf = tfidf_vectorizer.transform(new_description)

predicted_category = nb_classifier.predict(new_description_tfidf)
print(f'A categoria prevista para a nova descrição é: {predicted_category[0]}')

A categoria prevista para a nova descrição é: Sales, Marketing & Events
