<a href="https://colab.research.google.com/github/MRPatrick8/NLP/blob/main/Hackathon2/Rene_Patrick_Muyizere_2nd_Hackathon_File.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rene Patrick MUYIZERE

# Data Extraction (Web scraping)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Download the RURA landing page. A timeout avoids hanging the kernel on a
# stalled connection, and raise_for_status() stops an HTTP error page from
# being scraped as if it were the real site.
response = requests.get('https://rura.rw/', timeout=30)
response.raise_for_status()
content = response.content
#Preview of the Content
len(content)

34266

In [None]:
# Parse the downloaded HTML bytes with the 'html.parser' backend so the
# page can be queried for tags in the following cells.
page = BeautifulSoup(content, 'html.parser')

In [None]:
# Collect every <a> element found on the downloaded landing page; the cells
# below read each element's href attribute and link text.
links = page.find_all('a')

In [None]:
# Spot-check one anchor (the 16th from the end) to see what a scraped
# href / link-text pair looks like.
index = -16
anchor = links[index]
link, title = anchor['href'], anchor.text
print('Link:', link, 'Described as:', title)

Link: index.php?id=173 Described as: Sub-Sectors and Services


In [None]:
# Prefix used to turn site-relative "index.php?..." links into absolute URLs.
BASE_URL = 'https://rura.rw/'

list_of_links = []
list_of_titles = []

for item in links: #loop over the list of links
  href = item.get('href')
  if href is None:
    # An <a> without an href has no destination; indexing item['href']
    # would raise KeyError, so skip it and keep both lists aligned.
    continue

  if str(href).startswith('index.php'):
    lin = BASE_URL + href
  else:
    lin = href

  list_of_links.append(lin)
  # Anchors with empty text get a None title so dropna() removes them below.
  list_of_titles.append(item.text if len(item.text) > 0 else None)

# Build the frame in one step from the two aligned lists.
df = pd.DataFrame({'title': list_of_titles, 'link': list_of_links})

df = df.dropna(subset=['title'])

df

Unnamed: 0,title,link
0,bernd markert dissertation buying term papers ...,#
1,bernd markert dissertation,http://rura.rw/?bernd-markert-dissertation
2,buying term papers online wrong,http://rura.rw/?buying-term-papers-online-wrong
9,Home,https://rura.rw/index.php?id=23
10,About RURA,https://rura.rw/index.php?id=25
...,...,...
170,Regulatory instruments,https://rura.rw/index.php?id=179
171,Sector Reports,https://rura.rw/index.php?id=185
172,Statistics,https://rura.rw/index.php?id=86
173,FAQs,https://rura.rw/index.php?id=88


In [None]:
#Preview the dataframe
# NOTE: this value is only rendered when it is the last expression of a cell.
df.describe()

#Check whether the link is a fully qualified (http/https) link
is_full_link = df.link.str.startswith('http')

#Check whether the link points to a PDF document.
# regex=False matches the literal text ".pdf"; with the default regex mode the
# "." would match any character. case=False also catches ".PDF".
is_pdf = df.link.str.contains('.pdf', case=False, regex=False)

df['is_full_link'] = is_full_link #Create a new col
df['is_pdf'] = is_pdf #create a new col

#Keep only absolute links that are not PDF documents
df = df[df['is_full_link'] & ~df['is_pdf']]

df

Unnamed: 0,title,link,is_full_link,is_pdf
1,bernd markert dissertation,http://rura.rw/?bernd-markert-dissertation,True,False
2,buying term papers online wrong,http://rura.rw/?buying-term-papers-online-wrong,True,False
9,Home,https://rura.rw/index.php?id=23,True,False
10,About RURA,https://rura.rw/index.php?id=25,True,False
11,Background,https://rura.rw/index.php?id=44,True,False
...,...,...,...,...
170,Regulatory instruments,https://rura.rw/index.php?id=179,True,False
171,Sector Reports,https://rura.rw/index.php?id=185,True,False
172,Statistics,https://rura.rw/index.php?id=86,True,False
173,FAQs,https://rura.rw/index.php?id=88,True,False


In [None]:
import re 

def preprocessing(titles : list):
  """Normalise a collection of title strings.

  Each title is lower-cased, stripped of URLs and HTML-like tags, and every
  run of characters other than word characters, apostrophes and whitespace
  is replaced by a single space.

  Args:
    titles: iterable of strings to clean.

  Returns:
    A new list with one cleaned string per input title, in the same order.
  """
  strip_urls = re.compile(r'https?://\S+|www\.\S+')
  strip_tags = re.compile('<.*?>')
  strip_symbols = re.compile(r"[^\w\d'\s]+")

  def _clean(raw):
    # URLs go first so their punctuation is not turned into spaces.
    text = strip_urls.sub('', raw.lower())
    text = strip_tags.sub('', text)
    return strip_symbols.sub(' ', text)

  return [_clean(entry) for entry in titles]


# Replace the scraped titles with their cleaned, lower-cased form; these
# cleaned titles are used as the intent tags in the chatbot section below.
df['title'] = preprocessing(df['title'])

# Building Chatbot

In [None]:
# Question templates, one per line; "{}" is filled with a page title.
question = """Guide me to {}?
Where do you get {}?
What is the link to the {}?
How do you get {}?
How does {} works?
How can i get to {}?
What is the {}?
Show me {}
How long did it take to have {}
How do i apply for {}?
Who is in Charge of {}?"""

# NOTE(review): `answers` is not used anywhere in the cells shown; the
# response text is built inline below.
answers = """Here is the link {}.
You can use this link for more information.  {}"""

# Split the templates once instead of on every loop iteration.
question_templates = question.split('\n')

# One intent per scraped page: the cleaned title is the tag, every template
# filled with that title is a training pattern, and the page link is the reply.
intents = []
for row in df.itertuples(index=False):
  intents.append({
    'tag': row.title,
    'patterns': [template.format(row.title) for template in question_templates],
    'responses': [f'You can use this link for more information {row.link}'],
  })

file = {'intents': intents}


In [None]:
# Clone the companion repository. Later cells write intents.json into
# /content/chatbot-deployment and change into that directory before importing
# nltk_utils, model and chat.
# NOTE(review): re-running this cell in the same runtime presumably fails
# because the target directory already exists — confirm.
!git clone https://github.com/MRPatrick8/chatbot-deployment.git

Cloning into 'chatbot-deployment'...
remote: Enumerating objects: 157, done.[K
remote: Counting objects: 100% (119/119), done.[K
remote: Compressing objects: 100% (86/86), done.[K
remote: Total 157 (delta 65), reused 54 (delta 29), pack-reused 38[K
Receiving objects: 100% (157/157), 103.09 KiB | 748.00 KiB/s, done.
Resolving deltas: 100% (70/70), done.


In [None]:
import json

# Mode 'w' already truncates the file on open, so no explicit seek/truncate
# is needed before dumping the generated intents.
# NOTE(review): the training cell reads '/content/chatbot-deployment/intent.json'
# (no trailing "s"), so the file written here is not the one it loads —
# confirm which filename is intended.
with open('/content/chatbot-deployment/intents.json', 'w') as intent_file:
  json.dump(file, intent_file)

In [None]:
# Work from inside the cloned repository for the remaining cells (presumably
# so its local modules can be imported and data.pth is saved there).
%cd /content/chatbot-deployment

/content/chatbot-deployment


In [None]:
import nltk 
# Download NLTK's 'punkt' tokenizer data
# (presumably required by nltk_utils.tokenize — confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Training the chatbot

In [None]:
import numpy as np
import random
import json

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from nltk_utils import bag_of_words, tokenize, stem
from model import NeuralNet

# NOTE(review): this reads 'intent.json', while the earlier export cell writes
# 'intents.json' — confirm which file is meant to be trained on.
with open('/content/chatbot-deployment/intent.json', 'r') as f:
    intents = json.load(f)

all_words = []
tags = []
xy = []
# Walk every pattern of every intent, collecting the tokens and the
# (tokens, tag) training pairs.
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))

# Stem every token that is not in the ignore list, then de-duplicate and sort.
# NOTE(review): '/n' looks like a typo for '\n' — confirm before changing,
# since it affects which tokens are dropped.
ignore_words = ['?', '.', '!','/n']
all_words = sorted({stem(w) for w in all_words if w not in ignore_words})
tags = sorted(set(tags))

# Map each tag to its position in the sorted tag list once, instead of calling
# tags.index() (a linear scan) for every training pair.
tag_to_label = {name: position for position, name in enumerate(tags)}

# X: one bag_of_words(...) result per pattern.
# y: integer class labels (CrossEntropyLoss takes class indices, not one-hot).
X_train = np.array([bag_of_words(pattern_sentence, all_words) for pattern_sentence, _ in xy])
y_train = np.array([tag_to_label[pair_tag] for _, pair_tag in xy])

# Training configuration
num_epochs, batch_size, learning_rate = 1000, 8, 0.001
hidden_size = 8

# Layer widths are taken from the prepared data: the length of one feature
# row and the number of distinct tags.
input_size = len(X_train[0])
output_size = len(tags)
print(input_size, output_size)

class ChatDataset(Dataset):
    """Dataset exposing the module-level ``X_train`` / ``y_train`` arrays."""

    def __init__(self):
        # Keep references to the prepared arrays and cache their length.
        self.x_data, self.y_data = X_train, y_train
        self.n_samples = len(self.x_data)

    def __len__(self):
        """Return the number of samples so ``len(dataset)`` works."""
        return self.n_samples

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.x_data[index]
        label = self.y_data[index]
        return features, label

# Seed the RNGs before the model and loader are created so weight
# initialisation and batch shuffling are repeatable across
# Restart & Run All (GPU kernels may still be nondeterministic).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

dataset = ChatDataset()
train_loader = DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = NeuralNet(input_size, hidden_size, output_size).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model.
# The reported loss is the per-sample mean over the whole epoch rather than
# the loss of whichever mini-batch happened to come last, which is noisy.
epoch_loss = float('nan')  # stays NaN only if the loader yields no batches
for epoch in range(num_epochs):
    running_loss = 0.0
    samples_seen = 0
    for (words, labels) in train_loader:
        words = words.to(device)
        labels = labels.to(dtype=torch.long).to(device)

        # Forward pass
        outputs = model(words)
        # if y would be one-hot, we must apply
        # labels = torch.max(labels, 1)[1]
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion averages over the batch, so weight by batch size to get
        # a correct per-sample mean even when the last batch is smaller.
        batch_n = labels.size(0)
        running_loss += loss.item() * batch_n
        samples_seen += batch_n

    if samples_seen:
        epoch_loss = running_loss / samples_seen

    if (epoch+1) % 100 == 0:
        print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')


print(f'final loss: {epoch_loss:.4f}')

# Bundle the trained weights with the sizes, vocabulary and tag list defined
# above, then write everything to a single file.
FILE = "data.pth"

data = dict(
    model_state=model.state_dict(),
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    all_words=all_words,
    tags=tags,
)
torch.save(data, FILE)

print(f'training complete. file saved to {FILE}')

193 98
Epoch [100/1000], Loss: 0.0880
Epoch [200/1000], Loss: 0.1256
Epoch [300/1000], Loss: 0.3168
Epoch [400/1000], Loss: 0.3798
Epoch [500/1000], Loss: 0.0000
Epoch [600/1000], Loss: 0.2361
Epoch [700/1000], Loss: 0.0460
Epoch [800/1000], Loss: 0.2062
Epoch [900/1000], Loss: 0.0915
Epoch [1000/1000], Loss: 0.0644
final loss: 0.0644
training complete. file saved to data.pth


In [None]:
# !python chat.py

# Displaying Result

In [None]:
!pip install fastapi pyngrok uvicorn nest-asyncio

In [None]:
!ngrok authtoken 2IwhVMeBradWUnsUY1ulqIBEkvS_2U88vFz2cqDxDjCMRYdrx

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [None]:
!pip install flask
!pip install flask-ngrok

In [None]:
!pip install flask_cors
!pip install detectlanguage

In [None]:
from flask import Flask, render_template, request,jsonify
from flask_ngrok import run_with_ngrok
from flask_cors import CORS
from chat import get_response
import re
import requests
import json
import detectlanguage
from detectlanguage import simple_detect # import the translator


# Flask application object; the "/" and "/predict" routes below are
# registered on it.
app = Flask(__name__)
# Enable CORS handling on the app.
# NOTE(review): called without options, which presumably allows requests from
# any origin — confirm this is intended for a publicly tunnelled server.
CORS(app)


class translator:
    """Small client for the translate.googleapis.com ``translate_a/single`` endpoint.

    Used as a namespace: call ``translator.translate(...)`` on the class.
    """
    api_url = "https://translate.googleapis.com/translate_a/single"
    # Legacy pre-built query fragments, kept so existing references keep
    # working; translate() now sends the same parameters through ``params=``.
    client = "?client=gtx&dt=t"
    dt = "&dt=t"

    @staticmethod
    def _extract_text(payload):
        """Join the translated text of every segment in a decoded response.

        ``payload[0]`` is read as a list of segments whose first element is the
        translated text (presumably one segment per sentence — confirm against
        live responses). Missing or empty segments contribute nothing.
        """
        segments = payload[0] or []
        return "".join(seg[0] for seg in segments if seg and seg[0])

    @staticmethod
    def translate(text : str , target_lang : str, source_lang : str):
        """Translate ``text`` from ``source_lang`` to ``target_lang``.

        Args:
            text: text to translate; an empty string is returned unchanged
                without calling the service.
            target_lang: language code to translate into.
            source_lang: language code of ``text``.

        Returns:
            The translated text, with all returned segments concatenated.

        Raises:
            requests.HTTPError: if the service answers with an error status.
            requests.RequestException: on connection problems or timeout.
        """
        if not text:
            return text
        # Passing the query through ``params`` URL-encodes it, so characters
        # such as "&", "#" or "+" in the user's text cannot corrupt the request.
        params = {
            "client": "gtx",
            "dt": "t",
            "sl": source_lang,
            "tl": target_lang,
            "q": text,
        }
        r = requests.get(translator.api_url, params=params, timeout=10)
        r.raise_for_status()
        return translator._extract_text(r.json())

# use this link to get your api key https://detectlanguage.com/
# The key is read from the environment (or prompted for without echo) instead
# of being committed to the notebook. The key that was previously hardcoded
# here is public and should be rotated.
import os
from getpass import getpass

detectlanguage.configuration.api_key = (
    os.environ.get("DETECTLANGUAGE_API_KEY") or getpass("detectlanguage API key: ")
)
detectlanguage.configuration.secure = True

def process_question(text : str):
  """Detect the language of ``text`` and translate it into English.

  Returns:
    A ``(english_text, detected_language)`` tuple.
  """
  detected_lang = simple_detect(text)
  english_text = translator.translate(text=text, target_lang='en', source_lang=detected_lang)
  return english_text, detected_lang
def process_answer(text : str, source_lang):
  """Translate an English answer ``text`` into ``source_lang`` and return it."""
  return translator.translate(text=text, target_lang=source_lang, source_lang='en')

# create two routes

def preprocessing(text):
    """Normalise a single string.

    Lower-cases ``text``, removes URLs and HTML-like tags, and replaces every
    run of characters other than word characters, apostrophes and whitespace
    with a single space.
    """
    # NOTE(review): this rebinds the name of the list-based
    # preprocessing(titles) defined earlier in the notebook, and it is not
    # called anywhere in the cells shown.
    lowered = text.lower()
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', lowered)
    without_tags = re.sub('<.*?>', '', without_urls)
    return re.sub(r"[^\w\d'\s]+", ' ', without_tags)
# Transcript of the session: Q[i] is the i-th question as asked and R[i] the
# answer returned for it. The report cell builds a DataFrame from both lists.
Q = []
R = []
def process(QUESTION: str):
    """Answer ``QUESTION`` in the language it was asked in.

    The question is translated to English, passed to the chatbot, and the
    chatbot's reply is translated back into the detected source language.

    Returns:
        The reply translated into the question's language.
    """
    USER_QUERY, SL = process_question(QUESTION) #Translate the original question into English and keep the source language
    RESPONSE = get_response(USER_QUERY) #Ask the chatbot the question
    ORIGINAL_RESPONSE = process_answer(RESPONSE, SL)
    # Record the pair only once the answer exists, so a failure in any step
    # above cannot leave Q and R with different lengths.
    Q.append(QUESTION)
    R.append(ORIGINAL_RESPONSE)
    return ORIGINAL_RESPONSE


@app.route("/",  methods=["GET"])
def index_get():
    """Render and return the ``base.html`` template for ``GET /``."""
    page = render_template("base.html")
    return page

@app.route("/predict",methods=["POST"])
def predict():
    """Answer a chat message posted as JSON ``{"message": "<text>"}``.

    Returns:
        JSON ``{"answer": "<reply>"}``. A missing, non-JSON or blank message
        gets the same shape with HTTP status 400 instead of an unhandled
        exception.
    """
    # silent=True yields None instead of raising when the body is not JSON.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        payload = {}
    text = payload.get("message")
    if not isinstance(text, str) or not text.strip():
        return jsonify({"answer": "Please type a message."}), 400
    response = process(text)
    # we jsonify our response
    message = {"answer":response}
    print(message)
    return jsonify(message)


In [None]:
# Wrap the app so that starting it also opens an ngrok tunnel (the recorded
# output shows a public *.ngrok.io URL next to the local address).
run_with_ngrok(app)
# Start the Flask server; presumably this blocks the cell until interrupted.
app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://da3f-34-143-155-41.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


INFO:werkzeug:127.0.0.1 - - [18/Dec/2022 21:14:07] "[37mGET / HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Dec/2022 21:14:08] "[37mGET /static/style.css HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Dec/2022 21:14:12] "[37mGET / HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Dec/2022 21:14:12] "[37mGET /static/app.js HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Dec/2022 21:14:12] "[37mGET /static/images/chatbox-icon.svg HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Dec/2022 21:14:26] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [18/Dec/2022 21:15:21] "[37mPOST /predict HTTP/1.1[0m" 200 -


{'answer': 'Bonjour comment puis-je vous aider?'}


INFO:werkzeug:127.0.0.1 - - [18/Dec/2022 21:15:50] "[37mPOST /predict HTTP/1.1[0m" 200 -


{'answer': 'You can use this link https://rura.rw/index.php?id=251 for more information'}


INFO:werkzeug:127.0.0.1 - - [18/Dec/2022 21:16:21] "[37mPOST /predict HTTP/1.1[0m" 200 -


{'answer': 'Você é bem vindo!'}


In [None]:
!pkill ngrok
# saving all questions and answers in a df as report
chat = pd.DataFrame()
chat['Asked Q']=Q
chat['answers'] = R
chat


Unnamed: 0,Asked Q,answers
0,bonjour,Bonjour comment puis-je vous aider?
1,Where can i get tweets aout RURA?,You can use this link https://rura.rw/index.ph...
2,obrigado,Você é bem vindo!
