# Import libraries

In [1]:
import json
import os
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import uvicorn
from fastapi import FastAPI, Form
from fastapi.responses import JSONResponse
import nest_asyncio

# Loading dataset

In [2]:
DATA_DIR = "./data"
with open(os.path.join(DATA_DIR, "PT_Ecommerce_FAQ_Chatbot_dataset.json"), 'r', encoding='utf-8') as file:
    data = json.load(file)
df = pd.DataFrame(data['questions'])
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,question,answer
0,Como é que posso criar uma conta?,"Para criar uma conta, clique no botão 'Regista..."
1,Que métodos de pagamento é que aceitam?,"Aceitamos cartões de crédito, cartões de débit..."
2,Como posso rastrear a minha encomenda?,Pode rastrear a sua encomenda ao iniciar sessã...
3,Qual é a vossa política de devolução?,A nossa política de devolução permite que devo...
4,Posso cancelar a minha encomenda?,Pode cancelar a sua encomenda se esta ainda nã...


# Tokenizing questions

In [3]:
# Tokenization, Removing Stopwords, and Special Characters
nltk.download('stopwords')
nltk.download('punkt')
stopwords_pt = set(nltk.corpus.stopwords.words('portuguese'))
def tokenize_and_clean(text):
    ps = nltk.stem.PorterStemmer()
    tokens = nltk.word_tokenize(text.lower())
    tokens = [ps.stem(token) for token in tokens if token.isalpha() and token not in stopwords_pt]
    return ' '.join(tokens)
df['tokenized_question'] = df['question'].apply(tokenize_and_clean)
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leona\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leona\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,question,answer,tokenized_question
0,Como é que posso criar uma conta?,"Para criar uma conta, clique no botão 'Regista...",posso criar conta
1,Que métodos de pagamento é que aceitam?,"Aceitamos cartões de crédito, cartões de débit...",método pagamento aceitam
2,Como posso rastrear a minha encomenda?,Pode rastrear a sua encomenda ao iniciar sessã...,posso rastrear encomenda
3,Qual é a vossa política de devolução?,A nossa política de devolução permite que devo...,vossa política devolução
4,Posso cancelar a minha encomenda?,Pode cancelar a sua encomenda se esta ainda nã...,posso cancelar encomenda


# Vectorizing questions

In [4]:
# Vetorização usando TF-IDF
vectorizer = TfidfVectorizer()
vectorized_question = vectorizer.fit_transform(df['tokenized_question'])
vectorized_question

<49x96 sparse matrix of type '<class 'numpy.float64'>'
	with 224 stored elements in Compressed Sparse Row format>

# Creating chatbot using cosine_similarities to obtain most probable answer

In [5]:
def find_most_similar_question(query, vectorizer, model, df, vector_question):
    query = tokenize_and_clean(query)
    query_vectorized = vectorizer.transform([query])
    similarities = model(query_vectorized, vector_question)[0]
    print(similarities.max())
    if similarities.max() < 0.2:
        most_similar_question = "NA"
        most_similar_answer = "Devido a ser um chatbot desenvolvido para uma plataforma de E-commerce não consigo responder a essa questão. Por favor reformule a questão."
    else:
        most_similar_index = similarities.argmax()
        most_similar_question = df.iloc[most_similar_index]['question']
        most_similar_answer = df.iloc[most_similar_index]['answer']
    return most_similar_question, most_similar_answer

In [6]:
example_question = "pagamento"
most_similar_question, most_similar_answer = find_most_similar_question(example_question, vectorizer, cosine_similarity, df, vectorized_question)
print(f"Pergunta de exemplo: {example_question}")
print(f"Pergunta mais similar: {most_similar_question}")
print(f"Resposta correspondente: {most_similar_answer}")

0.5385446685308316
Pergunta de exemplo: pagamento
Pergunta mais similar: Que métodos de pagamento é que aceitam?
Resposta correspondente: Aceitamos cartões de crédito, cartões de débito e PayPal como métodos de pagamento para encomendas online.


# Running the server

In [7]:
DATA_DIR = "./data"
DATASET_NAME = "PT_Ecommerce_FAQ_Chatbot_dataset.json"
LOG_DIR = "./log"
nltk.download('stopwords')
nltk.download('punkt')
stopwords_pt = set(nltk.corpus.stopwords.words('portuguese'))
if not os.path.exists(LOG_DIR):
    os.mkdir(LOG_DIR)

#Read data
def read_data():
    with open(os.path.join(DATA_DIR, DATASET_NAME), 'r', encoding='utf-8') as file:
        data = json.load(file)
    df = pd.DataFrame(data['questions'])
    df.reset_index(drop=True, inplace=True)
    return df

#tokenize sentences
def tokenize_and_clean(text):
    ps = nltk.stem.PorterStemmer()
    tokens = nltk.word_tokenize(text.lower())
    tokens = [ps.stem(token) for token in tokens if token.isalpha() and token not in stopwords_pt]
    return ' '.join(tokens)

#preprocess the dataframe
def preprocess_text(df):
    df_ = df.copy()
    df_['tokenized_question'] = df_['question'].apply(tokenize_and_clean)
    vectorizer = TfidfVectorizer()
    vectorized_question = vectorizer.fit_transform(df_['tokenized_question'])
    return df_, vectorizer, vectorized_question

def find_most_similar_question(query, vectorizer, model, df, vector_question):
    query = tokenize_and_clean(query)
    query_vectorized = vectorizer.transform([query])
    similarities = model(query_vectorized, vector_question)[0]
    if similarities.max() < 0.2:
        most_similar_question = "NA"
        most_similar_answer = "Devido a ser um chatbot desenvolvido para uma plataforma de E-commerce não consigo responder a essa questão. Por favor reformule a questão."
    else:
        most_similar_index = similarities.argmax()
        most_similar_question = df.iloc[most_similar_index]['question']
        most_similar_answer = df.iloc[most_similar_index]['answer']
    return most_similar_question, most_similar_answer

app = FastAPI(title='Deploying a chatbot with FastAPI')

df = read_data()
preprocessed_df, vectorizer, vectorized_question = preprocess_text(df)

@app.get("/")
def home():
    return "API is working as expected. Now head over to http://localhost:8000/docs."

@app.post("/chat") 
def chat(query:str= Form(...)):

    most_similar_question, most_similar_answer = find_most_similar_question(query, vectorizer, cosine_similarity, preprocessed_df, vectorized_question)
    stream_response = {
        'YOU': query,
        'most_similar_question': most_similar_question,
        'CHATBOT': most_similar_answer
    }
    with open(os.path.join(LOG_DIR, "log.txt"), 'a', encoding='utf-8') as log:
        log.write(json.dumps(stream_response) + '\n')

    return JSONResponse(stream_response)

nest_asyncio.apply()
uvicorn.run(app, host= "127.0.0.1", port=8000)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leona\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leona\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:     Started server process [18888]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:57407 - "POST /chat HTTP/1.1" 200 OK
INFO:     127.0.0.1:57408 - "POST /chat HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [18888]


#### Go to http://127.0.0.1:8000/docs or run the client.ipynb notebook. Don't stop this notebook as it will be the server