In [1]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
#!python -m spacy download pt_core_news_sm

import asyncio
import os

from motor.motor_asyncio import AsyncIOMotorClient
from pymongo import MongoClient


# Connection settings. Each value can be overridden through an environment
# variable of the same name; the literal is the fallback used when that
# variable is unset.
DB_HOST_DEV = os.environ.get("DB_HOST_DEV", "mongo")
# NOTE(review): "example" looks like a development placeholder -- supply real
# credentials through the environment rather than committing them here.
DB_PASSWORD_DEV = os.environ.get("DB_PASSWORD_DEV", "example")
DB_PORT_DEV = os.environ.get("DB_PORT_DEV", "27017")  # kept as a string
DB_DATABASE_DEV = os.environ.get("DB_DATABASE_DEV", "teste_data")

# Or the IP address / host name of the MongoDB server
MONGODB_HOST = os.environ.get("MONGODB_HOST", "127.0.0.1")
# Or whichever port MongoDB is configured to listen on (kept as an int)
MONGODB_PORT = int(os.environ.get("MONGODB_PORT", "27017"))

In [2]:
# Connection URI, written on a single line: a multi-line string literal would
# embed a newline in the middle of the URI.
# NOTE(review): MONGODB_HOST sits in the user-name position of the URI, which
# looks unintended -- confirm which user name belongs here.
DB_URL = f"mongodb://{MONGODB_HOST}:{DB_PASSWORD_DEV}@{DB_HOST_DEV}:{DB_PORT_DEV}/"

# Connect with the configured host, port and database name instead of
# repeating them as literals.
client = MongoClient(MONGODB_HOST, MONGODB_PORT)
database = client[DB_DATABASE_DEV]

# Handles for the collections used in this notebook.
script_collection = database["script"]
description_collection = database["description"]
# Misspelled alias kept so cells that still reference it keep working.
description_colletion = description_collection
users_collection = database["users"]
conversations_collection = database["bot"]

In [15]:
# Fetch a single document from the "description" collection (the empty filter
# matches any document).
data = description_colletion.find_one({})
if data is None:
    # find_one returns None when nothing matches; fail here with a clear
    # message rather than with a TypeError when the result is indexed later.
    raise LookupError('No document found in the "description" collection.')

In [16]:
# Turn the "descriptions" entry of the fetched document into a DataFrame with
# a fresh 0..n-1 index and lower-case the "description" column.
# The whole transformation is one expression, so `data` is only rebound once
# every step has succeeded; `.str.lower()` is the vectorised string method.
data = (
    pd.DataFrame(data["descriptions"])
    .reset_index(drop=True)
    .assign(description=lambda frame: frame["description"].str.lower())
)

In [17]:
# Name of the spaCy pipeline used for tokenisation and stop-word flags.
SPACY_MODEL_NAME = "pt_core_news_sm"
nlp = spacy.load(SPACY_MODEL_NAME)



In [18]:
def remove_stopwords(text):
    """Return ``text`` with its stop-word tokens dropped.

    The text is processed with the module-level ``nlp`` pipeline. Tokens whose
    ``is_stop`` flag is set are discarded, and the text of every remaining
    token is joined with single spaces.
    """
    kept_tokens = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_tokens.append(token.text)
    return " ".join(kept_tokens)

In [19]:
# Strip stop words from every description, element by element.
data["description"] = data["description"].map(remove_stopwords)

In [20]:
# Instantiate the TF-IDF vectorizer (lower-cases text and strips accents).
vectorizer = TfidfVectorizer(lowercase=True, strip_accents="unicode")

# Fit on the descriptions and build their TF-IDF matrix in a single pass.
# A separate fit() beforehand is redundant, because fit_transform() fits again.
tfidf_matrix = vectorizer.fit_transform(data["description"])

# Feature names, one per column of the matrix
feature_names = vectorizer.get_feature_names_out()

# Dense DataFrame view of the TF-IDF features, one column per feature
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

In [21]:
# Persist the fitted vectorizer to disk with joblib.
modelo_file = "modelo_vectorizer.joblib"
joblib.dump(value=vectorizer, filename=modelo_file)


['modelo_vectorizer.joblib']

In [22]:
# Example user message, with stop words removed the same way as the
# descriptions.
user_input = remove_stopwords("Estou com alguns problemas com meu seguro.")

In [23]:
# Reload the vectorizer from the file written with joblib.dump.
# NOTE(review): joblib files are pickle-based -- only load files you trust.
modelo_carregado = joblib.load(filename=modelo_file)

In [39]:
# TF-IDF matrix of the stored descriptions, from the reloaded vectorizer
description_matrix = modelo_carregado.transform(data["description"])

# Vectorise the user input (a one-element list, hence a single row)
input_vector = modelo_carregado.transform([user_input])

# Cosine similarity between the user input and each description
similarity_scores = cosine_similarity(
    X=input_vector.toarray(),
    Y=description_matrix,
)

In [42]:
# Index of the highest score in the flattened similarity matrix; the matrix
# has one row (a single user input), so this is also the description position.
most_similar_index = similarity_scores.argmax(axis=None)

In [43]:
# Show the similarity matrix and the position of the best match.
similarity_report = f"Matriz de similaridade {similarity_scores}\n"
index_report = f"Indice do vetor: {most_similar_index}"
print(similarity_report, index_report)

Matriz de similaridade [[0.12919409 0.         0.11114526 0.12393859 0.26820331 0.13189763]]
 Indice do vetor: 4


In [None]:
# Result: look up the description whose index label equals the best-match
# index.
most_similar_description = data.loc[most_similar_index, "description"]
print("Descrição mais similar:", most_similar_description)

In [None]:
# Display the processed descriptions DataFrame as the cell output.
data

# Script

In [None]:
# Location of the script JSON file. It can be overridden with the
# DATA_SCRIPT_PATH environment variable; the fallback is a user-specific
# absolute path, so set the variable when running on another machine.
DATA_SCRIPT_PATH = os.environ.get(
    "DATA_SCRIPT_PATH",
    "/home/maksonvinicio/Documents/GitLab-GitHub/Customer-Care-AI/ml_model/data/data_script.json",
)

# Load the scripts and renumber the rows 0..n-1.
data_script = pd.read_json(DATA_SCRIPT_PATH).reset_index(drop=True)

In [None]:
# Select the "script" values of the rows whose "id" equals the best-match
# index, as an {index label: script} dict.
# NOTE(review): presumably script ids line up with description row positions
# -- verify against the data.
matching_rows = data_script["id"] == most_similar_index
response = data_script.loc[matching_rows, "script"].to_dict()

In [None]:

# class Bot:
#     @router.post("/bot")
#     async def message(connection_string: Depends(DB_URL)):
#         message_history = MongoDBChatMessageHistory(
#             connection_string=connection_string, session_id="test-session"
#         )

#         message_history.add_user_message("hi!")

#         message_history.add_ai_message("whats up?")