# NLU - LawHunter


## Pipeline

1. **NLP Pre-processing**
   - Lowercase standardization
   - Stop words removal
   - Entity identification
   - Normalization (slang, etc.)
   - Stemming and lemmatization

2. **Word Embedding**
   - word2vec

3. **Model Training**
   - Naive Bayes (not suitable with negative values)
   - Logistic Regression
   - Decision Tree
   - Random Forest
   - SVM
   - Neural Network

   3.1 Metrics  
   3.2 Model Selection

DataFrame structure:
   |example sentence | intent_example

4. **Testing: input and output of the model: intent identification.**

## Imports and downloads

In [None]:
!pip install spacy



In [None]:
# Download the Portuguese spaCy pipeline 'pt_core_news_sm', which TextProcessor loads
# with spacy.load() further down in this notebook.
!python -m spacy download pt_core_news_sm

Collecting pt-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Imports and Downloads
import nltk
import gdown
import string
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import ne_chunk, pos_tag
from nltk.corpus import wordnet
import numpy as np

# Download necessary packages
# NOTE(review): the cells visible in this notebook do all text processing with
# spaCy and never reference these NLTK resources — confirm they are still needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Pre-processing

In [None]:
# Class for processing texts
class TextProcessor:
    """Pre-processing pipeline for Portuguese text built on a spaCy model.

    The pipeline extracts named entities from the raw text, strips punctuation,
    lowercases, tokenizes, expands slang/abbreviations, removes stop words and
    finally lemmatizes the remaining tokens.
    """

    def __init__(self):
        # Load the SpaCy model for Portuguese language processing
        self.nlp = spacy.load('pt_core_news_sm')

        # Dictionary to map slang/abbreviations to their full forms.
        # Keys are lowercase because normalization runs after lowercasing.
        self.slang_dict = {
            "vc": "você", "blz": "beleza", "pq": "porque", "tb": "também",
            "msg": "mensagem", "dps": "depois", "n": "não", "q": "que",
            "kd": "cadê", "vdd": "verdade", "aki": "aqui", "flw": "falou",
            "td": "tudo", "tks": "obrigado", "bjs": "beijos", "pls": "por favor",
            "obg": "obrigado", "vlw": "valeu", "cmg": "comigo", "qdo": "quando",
            "pvc": "pode ver", "pvt": "privado"
        }

        # String of punctuation characters to be removed from the text
        self.punctuation = string.punctuation

    def lowercase_standardization(self, text):
        """Convert the entire text to lowercase.

        :param text: Input string.
        :return: The lowercased string.
        """
        return text.lower()

    def tokenize_text(self, text):
        """Tokenize the text into words using SpaCy.

        Whitespace-only tokens are discarded: spaCy emits them for runs of
        consecutive spaces, which appear once punctuation surrounded by spaces
        has been stripped from the text.

        :param text: Input string.
        :return: List of token strings without whitespace-only tokens.
        """
        doc = self.nlp(text)
        return [token.text for token in doc if not token.is_space]

    def remove_stop_words(self, tokens):
        """Remove stop words from the tokenized list.

        :param tokens: List of token strings.
        :return: The tokens whose spaCy lexeme is not flagged as a stop word.
        """
        return [token for token in tokens if not self.nlp.vocab[token].is_stop]

    def entity_identification(self, text):
        """Identify named entities in the text using SpaCy.

        :param text: Input string (original casing and punctuation).
        :return: List of ``(entity_text, entity_label)`` tuples.
        """
        doc = self.nlp(text)
        return [(ent.text, ent.label_) for ent in doc.ents]

    def normalize_text(self, tokens):
        """Replace slang and abbreviations with their full forms.

        Expansions made of several words (e.g. ``"pls"`` -> ``"por favor"``) are
        emitted as separate tokens, so the following stop-word filter sees each
        word individually instead of one token containing a space.

        :param tokens: List of (lowercased) token strings.
        :return: New list of tokens with slang expanded.
        """
        normalized = []
        for word in tokens:
            expansion = self.slang_dict.get(word)
            if expansion is None:
                # Not slang: keep the token exactly as it came in
                normalized.append(word)
            else:
                normalized.extend(expansion.split())
        return normalized

    def lemmatize(self, tokens):
        """Lemmatize the tokens using SpaCy to get the base form of words.

        The tokens are joined with spaces and re-parsed by the spaCy pipeline.

        :param tokens: List of token strings.
        :return: List of lemma strings.
        """
        doc = self.nlp(" ".join(tokens))
        return [token.lemma_ for token in doc]

    def remove_punctuation(self, text):
        """Remove punctuation from the text.

        :param text: Input string.
        :return: The string without any character listed in ``self.punctuation``.
        """
        return text.translate(str.maketrans('', '', self.punctuation))

    def process(self, text):
        """Execute the complete text processing pipeline.

        :param text: Raw input sentence.
        :return: Dict with ``"tokens"`` (normalized tokens without stop words),
                 ``"entities"`` (named entities found in the raw text) and
                 ``"lemmatized"`` (lemmas of the kept tokens).
        """
        # Identify named entities on the raw text, before casing and
        # punctuation are altered
        entities = self.entity_identification(text)

        # Remove punctuation from the text
        text = self.remove_punctuation(text)

        # Convert the text to lowercase
        text = self.lowercase_standardization(text)

        # Tokenize the text into words
        tokens = self.tokenize_text(text)

        # Normalize tokens by replacing slang/abbreviations
        tokens = self.normalize_text(tokens)

        # Remove stop words from the tokenized list
        tokens = self.remove_stop_words(tokens)

        # Lemmatize the tokens to their base forms
        lemmatized = self.lemmatize(tokens)

        # Return the processed data as a dictionary
        return {
            "tokens": tokens,
            "entities": entities,
            "lemmatized": lemmatized,
        }

In [None]:
# Smoke test: run one sample sentence through the whole TextProcessor pipeline
text_processor = TextProcessor()
text = "Preciso saber se houve alterações nas normas do Banco Central sobre o controle da dívida externa."
result = text_processor.process(text)

# Show each part of the result under its label
for label, key in (("Tokens:", 'tokens'), ("Entidades:", 'entities'), ("Lematização:", 'lemmatized')):
    print(label, result[key])


Tokens: ['preciso', 'houve', 'alterações', 'normas', 'banco', 'central', 'controle', 'dívida', 'externa']
Entidades: [('Banco Central', 'ORG')]
Lematização: ['preciso', 'haver', 'alteração', 'norma', 'banco', 'central', 'controle', 'dívida', 'externo']


## Intentions dataset

Based on the files and phrases provided by the partner "Bank of America", the LawHunter developers created this intent dataset. It is needed to train the algorithm that identifies what the user wants to search for.

In [None]:
import pandas as pd

# CSV export URL of the Google Sheets document that holds the intent dataset
csv_url = 'https://docs.google.com/spreadsheets/d/1i0dgQwPX8yc-Bin2fSufYohVFVv-jg5MMLxW9bOA4SQ/export?format=csv'

# Read the CSV straight from the URL into a DataFrame (requires network access)
df = pd.read_csv(csv_url)

# Preview the first rows
df.head()

Unnamed: 0,Phrases,Intention
0,Quais são as novas diretrizes sobre a regulaçã...,intent_investment
1,Me informe sobre as mudanças regulatórias envo...,intent_investment
2,Existe alguma atualização nas normas de CAPEX ...,intent_investment
3,Qual é a última resolução que trata do impacto...,intent_investment
4,Quais regulamentações recentes afetam o comérc...,intent_investment


This dataset contains 3 intentions and 50 different phrases for them.

In [None]:
# List the distinct intent labels found in the 'Intention' column
df['Intention'].unique()

array(['intent_investment', 'intent_sustainability',
       'intent_macroeconomics'], dtype=object)

## Word Embedding

In [None]:
# Download the pre-trained word embedding file from Google Drive and save it
# locally as 'slip_s300.txt'
gdown.download('https://drive.google.com/uc?id=1htT3BRNWNPBkX965rgGNuvz7jShrebfH', 'slip_s300.txt', quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1htT3BRNWNPBkX965rgGNuvz7jShrebfH
From (redirected): https://drive.google.com/uc?id=1htT3BRNWNPBkX965rgGNuvz7jShrebfH&confirm=t&uuid=fe81b809-f3e1-4f39-9775-bde170badd8a
To: /content/slip_s300.txt
100%|██████████| 2.66G/2.66G [00:46<00:00, 56.7MB/s]


'slip_s300.txt'

In [None]:
# Load the downloaded embeddings (word2vec text format) as gensim KeyedVectors.
# NOTE(review): `model` is not referenced by any later cell shown in this notebook,
# and WordVectorizer loads the same file again on construction — this cell looks
# redundant and keeps a second copy of the vectors in memory; confirm before removing.
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('slip_s300.txt', binary=False)

In [None]:
from gensim.models import KeyedVectors

class WordVectorizer:
    """Turns tokens into Word2Vec vectors and token lists into mean vectors."""

    def __init__(self, model_path):
        """
        Initializes the WordVectorizer by loading the Word2Vec model.

        :param model_path: Path to the Word2Vec model file (text format).
        """
        self.model = KeyedVectors.load_word2vec_format(model_path, binary=False)

    def vectorize_token(self, token):
        """
        Returns the vector for a single token if it is in the vocabulary.

        :param token: Word or token to be vectorized.
        :return: Word2Vec vector for the token, or None if not in vocabulary.
        """
        if token in self.model:
            return self.model[token]
        return None

    def average_vector(self, tokens):
        """
        Computes the average vector for a list of tokens, ignoring those not in the vocabulary.

        :param tokens: List of words (tokens) to compute the average vector.
        :return: Average vector of the list of tokens, or a zero vector of
                 length ``self.model.vector_size`` when no token is in the vocabulary.
        """
        # Look each token up exactly once and keep only in-vocabulary vectors
        valid_vectors = []
        for token in tokens:
            vector = self.vectorize_token(token)
            if vector is not None:
                valid_vectors.append(vector)

        if valid_vectors:
            # Compute the element-wise mean of the vectors
            return np.mean(valid_vectors, axis=0)

        # Returns a zero vector if no words are found
        return np.zeros(self.model.vector_size)


In [None]:
# Row-level helper used to pre-process the dataset
def apply_preprocessing(row):
    """
    Runs the text-processing pipeline on the phrase stored in a DataFrame row.

    :param row: A row from the DataFrame containing a 'Phrases' column.
    :return: A Pandas Series with the 'Tokens', 'Entities' and 'Lemmatized' values for the phrase.
    """
    processed = text_processor.process(row['Phrases'])

    # Map each key of the pipeline result onto its DataFrame column name
    column_by_key = (
        ('tokens', 'Tokens'),
        ('entities', 'Entities'),
        ('lemmatized', 'Lemmatized'),
    )
    return pd.Series({column: processed[key] for key, column in column_by_key})

# Apply the preprocessing function to each row in the DataFrame
df_processed = df.apply(apply_preprocessing, axis=1)

# Concatenate the original DataFrame with the processed data.
# Columns left over from a previous run of this cell are dropped first, so
# re-running the cell replaces them instead of appending duplicate columns.
df = pd.concat(
    [df[[col for col in df.columns if col not in df_processed.columns]], df_processed],
    axis=1,
)

df.head()

Unnamed: 0,Phrases,Intention,Tokens,Entities,Lemmatized
0,Quais são as novas diretrizes sobre a regulaçã...,intent_investment,"[diretrizes, regulação, taxa, selic, divulgada...","[(Quais, ORG), (Selic, LOC), (Banco Central, O...","[diretriz, regulação, taxa, selic, divulgar, b..."
1,Me informe sobre as mudanças regulatórias envo...,intent_investment,"[informe, mudanças, regulatórias, envolvendo, ...",[],"[informe, mudança, regulatório, envolver, comm..."
2,Existe alguma atualização nas normas de CAPEX ...,intent_investment,"[existe, alguma, atualização, normas, capex, e...","[(CAPEX, MISC)]","[existir, algum, atualização, norma, capex, em..."
3,Qual é a última resolução que trata do impacto...,intent_investment,"[última, resolução, trata, impacto, ipca, cart...","[(IPCA, ORG)]","[último, resolução, tratar, impacto, ipca, car..."
4,Quais regulamentações recentes afetam o comérc...,intent_investment,"[regulamentações, recentes, afetam, comércio, ...",[],"[regulamentação, recente, afetar, comércio, pe..."


In [None]:
# Creating an instance of WordVectorizer (make sure the model path is correct).
# 'slip_s300.txt' is the file name the gdown download cell saved the embeddings under;
# constructing the WordVectorizer loads that file.
model_path = 'slip_s300.txt'  # Replace with the correct path to your model
word_vectorizer = WordVectorizer(model_path)

# Row-level helper that turns a row's lemmas into a single embedding
def apply_vectorizing(row):
    """
    Averages the word vectors of the lemmatized tokens stored in a row.

    :param row: A row from the DataFrame containing a 'Lemmatized' column.
    :return: A Pandas Series whose only entry, 'Mean_Vector', is the mean vector.
    """
    return pd.Series({'Mean_Vector': word_vectorizer.average_vector(row['Lemmatized'])})

# Compute the mean embedding for every row
df_vectors = df.apply(apply_vectorizing, axis=1)

# Concatenates the original DataFrame with the vectorized data.
# Columns left over from a previous run of this cell are dropped first, so
# re-running the cell replaces 'Mean_Vector' instead of appending a duplicate.
df = pd.concat(
    [df[[col for col in df.columns if col not in df_vectors.columns]], df_vectors],
    axis=1,
)

# Displays the updated DataFrame
df.head()


Unnamed: 0,Phrases,Intention,Tokens,Entities,Lemmatized,Mean_Vector
0,Quais são as novas diretrizes sobre a regulaçã...,intent_investment,"[diretrizes, regulação, taxa, selic, divulgada...","[(Quais, ORG), (Selic, LOC), (Banco Central, O...","[diretriz, regulação, taxa, selic, divulgar, b...","[0.22229652, -0.1165195, -0.118158996, -0.0583..."
1,Me informe sobre as mudanças regulatórias envo...,intent_investment,"[informe, mudanças, regulatórias, envolvendo, ...",[],"[informe, mudança, regulatório, envolver, comm...","[-0.006954856, -0.07061014, -0.003372709, 0.09..."
2,Existe alguma atualização nas normas de CAPEX ...,intent_investment,"[existe, alguma, atualização, normas, capex, e...","[(CAPEX, MISC)]","[existir, algum, atualização, norma, capex, em...","[0.06514925, -0.07583188, -0.068316, -0.013809..."
3,Qual é a última resolução que trata do impacto...,intent_investment,"[última, resolução, trata, impacto, ipca, cart...","[(IPCA, ORG)]","[último, resolução, tratar, impacto, ipca, car...","[0.17783813, -0.080594, -0.08606913, 0.0522530..."
4,Quais regulamentações recentes afetam o comérc...,intent_investment,"[regulamentações, recentes, afetam, comércio, ...",[],"[regulamentação, recente, afetar, comércio, pe...","[-0.024623662, -0.1450585, -0.043318, -0.03440..."


## Model Training and Selection

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# 'Mean_Vector' contains the mean vector of each phrase and 'Intention' contains the target labels.
# The per-row vectors are stacked into a single NumPy array used as the feature matrix.
X = np.array(df['Mean_Vector'].tolist())
y = df['Intention']

# Split the data into training and testing sets (20% held out for testing);
# random_state is fixed so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Logistic Regression

In [None]:
# Logistic Regression
# The fitted instance is bound to the name `LogisticRegression` because the
# prediction cell below uses that name. Binding it shadows the imported class,
# so the class is imported here under an alias; without it, re-running this
# cell would try to call the instance and fail.
from sklearn.linear_model import LogisticRegression as LogisticRegressionClassifier

LogisticRegression = LogisticRegressionClassifier()

LogisticRegression.fit(X_train, y_train)

In [None]:
# Score the fitted logistic-regression model on the held-out split
y_pred = LogisticRegression.predict(X_test)

# Accuracy and the confusion matrix need no averaging option
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Precision, recall and F1 all use the same 'weighted' averaging across classes
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the metrics, one per line, followed by the confusion matrix
print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    f"Confusion Matrix:\n{conf_matrix}",
    sep="\n",
)

Accuracy: 0.84
Precision: 0.84
Recall: 0.84
F1 Score: 0.83
Confusion Matrix:
[[ 7  2  2]
 [ 1  8  0]
 [ 0  0 11]]


### SVM

In [None]:
# SVM: support vector classifier with a linear kernel.
# This fitted model is the one passed to predict_user_input and pickled to
# 'model.pkl' in the final cells of the notebook.
SVM = SVC(kernel='linear')

SVM.fit(X_train, y_train)

In [None]:
# Score the fitted SVM on the held-out split
y_pred = SVM.predict(X_test)

# Accuracy and the confusion matrix need no averaging option
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Precision, recall and F1 all use the same 'weighted' averaging across classes
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the metrics, one per line, followed by the confusion matrix
print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    f"Confusion Matrix:\n{conf_matrix}",
    sep="\n",
)

Accuracy: 0.90
Precision: 0.92
Recall: 0.90
F1 Score: 0.90
Confusion Matrix:
[[ 8  1  2]
 [ 0  9  0]
 [ 0  0 11]]


### Decision Tree

In [None]:
# Initialize the DecisionTree model.
# random_state is fixed (same seed as the train/test split) so that the fitted
# tree, and therefore the metrics below, are reproducible across runs.
DecisionTree = DecisionTreeClassifier(random_state=42)

# Train the model
DecisionTree.fit(X_train, y_train)

In [None]:
# Score the fitted decision tree on the held-out split
y_pred = DecisionTree.predict(X_test)

# Accuracy and the confusion matrix need no averaging option
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Precision, recall and F1 all use the same 'weighted' averaging across classes
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the metrics, one per line, followed by the confusion matrix
print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    f"Confusion Matrix:\n{conf_matrix}",
    sep="\n",
)

Accuracy: 0.68
Precision: 0.68
Recall: 0.68
F1 Score: 0.66
Confusion Matrix:
[[ 5  2  4]
 [ 2  6  1]
 [ 1  0 10]]


### Random Forest

In [None]:
# Initialize the RandomForest model.
# random_state is fixed (same seed as the train/test split) so that the fitted
# forest, and therefore the metrics below, are reproducible across runs.
RandomForest = RandomForestClassifier(random_state=42)

# Train the model
RandomForest.fit(X_train, y_train)

In [None]:
# Score the fitted random forest on the held-out split
y_pred = RandomForest.predict(X_test)

# Accuracy and the confusion matrix need no averaging option
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Precision, recall and F1 all use the same 'weighted' averaging across classes
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the metrics, one per line, followed by the confusion matrix
print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    f"Confusion Matrix:\n{conf_matrix}",
    sep="\n",
)

Accuracy: 0.84
Precision: 0.85
Recall: 0.84
F1 Score: 0.83
Confusion Matrix:
[[ 7  3  1]
 [ 1  8  0]
 [ 0  0 11]]


### MLP - Neural Network

In [None]:
# Initialize the neural network (multi-layer perceptron).
# random_state is fixed (same seed as the train/test split) so that weight
# initialization, and therefore the metrics below, are reproducible across runs.
MLP = MLPClassifier(random_state=42)

# Train the model
MLP.fit(X_train, y_train)



In [None]:
# Score the fitted MLP on the held-out split
y_pred = MLP.predict(X_test)

# Accuracy and the confusion matrix need no averaging option
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Precision, recall and F1 all use the same 'weighted' averaging across classes
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the metrics, one per line, followed by the confusion matrix
print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    f"Confusion Matrix:\n{conf_matrix}",
    sep="\n",
)

Accuracy: 0.87
Precision: 0.89
Recall: 0.87
F1 Score: 0.86
Confusion Matrix:
[[ 7  1  3]
 [ 0  9  0]
 [ 0  0 11]]


## Final Tests


In [None]:
def predict_user_input(user_input, text_processor, word_vectorizer, SVM):
    """
    Predicts the intent class of a raw user sentence.

    The sentence is pre-processed, its lemmas are averaged into one embedding,
    and that embedding is passed to the classifier as a single-row matrix.

    :param user_input: A string containing the user's input.
    :param text_processor: Object whose .process(text) returns a dict with a 'lemmatized' entry.
    :param word_vectorizer: Object whose .average_vector(tokens) returns a NumPy vector.
    :param SVM: A trained model exposing a .predict method.
    :return: The predicted class for the user input (first element of the model's prediction).
    """
    # Pre-process and keep only the lemmatized tokens
    lemmas = text_processor.process(user_input)['lemmatized']

    # Average the token vectors and shape the result as one sample (1 x n_features)
    features = word_vectorizer.average_vector(lemmas).reshape(1, -1)

    # The model returns one prediction per row; there is exactly one row
    return SVM.predict(features)[0]

In [None]:
# Example input
user_input = "Quais atualizações legais recentes afetam os investimentos de renda fixa com base no IPCA?"

# Assume you have already created and trained the following:
# - text_processor: instance of TextProcessor
# - word_vectorizer: instance of WordVectorizer
# - SVM: the fitted SVC classifier (any trained model with a .predict method can be passed instead)

# Predict the class
predicted_class = predict_user_input(user_input, text_processor, word_vectorizer, SVM)

print(f"The predicted class is: {predicted_class}")


The predicted class is: intent_investment


In [None]:
import pickle

In [None]:
# Persist the trained SVM classifier by pickling it into 'model.pkl'
with open('model.pkl', 'wb') as model_file:
    pickle.dump(SVM, model_file)


In [None]:
from google.colab import files

# Download the .pkl file through the Colab `files` helper
files.download('model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>