<a href="https://colab.research.google.com/github/MaikarfiJesse/chatBot/blob/main/ClimateChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelBinarizer

# The tokenizer and the encoder are both loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'

# Tokenizer that turns raw text into the model's input tensors
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Pretrained BERT encoder used to embed the questions.
# NOTE(review): the name `model` is rebound to the Keras classifier in the training cell.
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [19]:
import numpy as np
import pandas as pd
import re
import random
import transformers
import matplotlib.pyplot as plt
import json
import pickle
import torch

import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation,Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import metrics

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
# Location of the question/answer/tag CSV (absolute Colab path).
climate_csv_path = "/content/sample_data/climate.csv"
climate_df = pd.read_csv(climate_csv_path)
# Preview the first rows as the cell output.
climate_df.head()

Unnamed: 0,Questions,Answers,Tags
0,What is climate change?,Climate change refers to long-term changes in ...,Definition
1,What are the main causes of climate change?,The main causes of climate change are human ac...,Causes
2,How does deforestation contribute to climate c...,Deforestation reduces the number of trees that...,Causes
3,What are greenhouse gases?,Greenhouse gases are gases that trap heat in t...,Greenhouse Gases
4,How do greenhouse gases affect the climate?,Greenhouse gases trap heat from the sun in the...,Greenhouse Gases


In [22]:
# Build the intents structure that the chat cell reloads from disk: one entry
# per CSV row holding that row's tag and answer text.
# The earlier version also lower-cased and split every question into sentences,
# but that result was never stored (the 'patterns' key was commented out), so
# the dead per-row work and the unused `questions` placeholder are removed.
# Iterating the columns directly also avoids chained label-based indexing.
dict_ = {
    "intents": [
        {'tags': tag, 'responses': answer}
        for tag, answer in zip(climate_df['Tags'], climate_df['Answers'])
    ]
}

# Sanity check: show the first two intents.
print(dict_["intents"][:2])

# Persist the intents as JSON; json.dump streams straight to the file handle.
with open("climate_change.json", "w") as f:
    json.dump(dict_, f)

[{'tags': 'Definition', 'responses': 'Climate change refers to long-term changes in temperature, precipitation, and other atmospheric conditions on Earth.'}, {'tags': 'Causes', 'responses': 'The main causes of climate change are human activities, such as burning fossil fuels, deforestation, and industrial processes, leading to greenhouse gas emissions.'}]


In [23]:
words=[]
classes=[]
documents=[]
ignore_letters=['?','!','.',',']

# The training cell rebinds the global name `model` to the Keras classifier, so
# re-running this cell afterwards would call the wrong object. Keep a dedicated
# handle on the BERT encoder: reuse `model` while it still is the encoder,
# otherwise reload the pretrained weights.
bert_model = model if isinstance(model, BertModel) else BertModel.from_pretrained('bert-base-uncased')

# Embed every question with BERT and remember its tag.
for _, row in climate_df.iterrows():
    question = row['Questions']
    tag = row['Tags']
    # Tokenize the question into model-ready tensors
    inputs = tokenizer(question, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = bert_model(**inputs)
    # Average last_hidden_state over dim 1 to get one vector per question
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()

    documents.append((embeddings, tag))
    words.extend(tokenizer.tokenize(question))  # Add tokenized words to the word list

    if tag not in classes:
        classes.append(tag)

lemmatizer=WordNetLemmatizer()

# Vocabulary: lemmatized, lower-cased tokens without punctuation, de-duplicated and sorted.
words = [lemmatizer.lemmatize(word.lower()) for word in words if word not in ignore_letters]
words = sorted(set(words))
# Sorted class list; its order defines the one-hot / softmax output index of each tag.
classes = sorted(set(classes))

# Persist vocabulary and class order; context managers make sure the files are
# flushed and closed (the bare open(...) handles were never closed before).
with open('cl_words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('cl_classes.pkl', 'wb') as classes_file:
    pickle.dump(classes, classes_file)

# Prepare training data: pair each embedding with a one-hot row for its tag
training = []
output_empty = [0] * len(classes)

for embeddings, tag in documents:
    output_row = list(output_empty)
    output_row[classes.index(tag)] = 1
    training.append([embeddings, output_row])

random.shuffle(training)
training = np.array(training, dtype=object)

# Stack the per-question embeddings into one feature matrix and the one-hot rows into the label matrix.
train_x = np.vstack(training[:, 0])
train_y = np.array(training[:, 1].tolist())

In [24]:
from tensorflow.keras.optimizers.schedules import ExponentialDecay

# Feed-forward classifier on top of the sentence embeddings:
# two ReLU hidden layers with dropout, softmax over the tag classes.
model=Sequential()
model.add(Dense(128,input_shape=(len(train_x[0]),),activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(train_y[0]),activation='softmax'))


# Define learning rate schedule
lr_schedule = ExponentialDecay(
    initial_learning_rate=0.0001,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True
)

# SGD optimizer with the learning rate schedule.
# NOTE(review): `sgd` is built here but compile() below passes optimizer='adam',
# so neither `sgd` nor `lr_schedule` affects training. Confirm which optimizer is intended.
sgd = SGD(learning_rate=lr_schedule, momentum=0.9, nesterov=True)


model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy','Precision','Recall'])
# train_x / train_y are already NumPy arrays, so they are passed without re-wrapping.
hist = model.fit(train_x,train_y,epochs=50,batch_size=12,verbose=1)
# Model.save's second positional parameter is `overwrite`; the History object was
# previously passed there by mistake. Only the target path is needed.
model.save('cl_chatbotmodel.h5')
print('Training Done')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Training Done


  saving_api.save_model(


In [25]:
from tensorflow.keras.models import load_model

# Reload the artifacts written by the earlier cells.
with open('climate_change.json') as json_file:
    intents = json.load(json_file)

# pickle.load can execute arbitrary code: only load files produced by this notebook.
# Context managers close the handles (the bare open(...) calls never did).
with open('cl_words.pkl','rb') as words_file:
    words = pickle.load(words_file)
print(words[:100], len(words))

with open('cl_classes.pkl','rb') as classes_file:
    classes = pickle.load(classes_file)
print(classes[:5], len(classes))

# Trained Keras classifier saved by the training cell
model=load_model('cl_chatbotmodel.h5')

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embedding(sentence):
    """Embed `sentence` with the module-level BERT tokenizer and encoder.

    The text is tokenized into PyTorch tensors, run through `bert_model`
    with gradients disabled, and the encoder's `last_hidden_state` is
    averaged over dim 1.

    Returns:
        The averaged hidden states as a NumPy array.
    """
    encoded = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.detach().numpy()


def predict_class(sentence, error_threshold=0.25):
    """Classify `sentence` and return the candidate intents, best first.

    Args:
        sentence: Text to classify.
        error_threshold: Predictions whose score is not strictly greater
            than this value are discarded. Defaults to 0.25, the value that
            used to be hard-coded here.

    Returns:
        A list of ``{'intent': <class name>, 'probability': <score as str>}``
        dicts sorted by descending score; empty when no score passes the
        threshold.
    """
    embedding = get_bert_embedding(sentence)
    # verbose=0 keeps Keras' per-call progress bar out of the chat transcript.
    scores = model.predict(embedding, verbose=0)[0]
    ranked = sorted(
        ((idx, score) for idx, score in enumerate(scores) if score > error_threshold),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [{'intent': classes[idx], 'probability': str(score)} for idx, score in ranked]

def get_response(intents_list, intents_json):
    """Return the stored response for the top-ranked predicted intent.

    Args:
        intents_list: Predictions ordered best first; each item is a dict
            with an 'intent' key (as produced by `predict_class`).
        intents_json: Dict with an 'intents' list whose entries carry
            'tags' and 'responses' keys.

    Returns:
        The 'responses' value of the first intent whose tag equals the
        top prediction, or None when `intents_list` is empty or no intent
        has that tag.
    """
    # Nothing predicted: there is no tag to look up.
    if not intents_list:
        return None
    top_prediction = intents_list[0]
    print(top_prediction)
    tag = top_prediction['intent']
    for intent in intents_json['intents']:
        # Tags are plain strings, so `in` would be a substring test
        # (e.g. 'Effects' would match 'Health Effects'); compare for equality.
        if tag == intent['tags']:
            return intent['responses']
    return None

print("GO! BOT IS RUNNING")

# Reply used when no intent clears the threshold or no stored response matches,
# so the user always gets an answer instead of silence or "Bot: None".
FALLBACK_REPLY = "Sorry, I don't have an answer for that. Please try rephrasing your question."

# Interactive loop: read a message, classify it, print the matching response.
while True:
    message = input("").strip()
    if message.lower() in ['quit', 'exit']:
        print("Goodbye")
        break
    if not message:
        # Nothing to classify on a blank line; prompt again.
        continue
    ints = predict_class(message)
    res = get_response(ints, intents) if len(ints) > 0 else None
    print(f"Bot: {res if res is not None else FALLBACK_REPLY}")

['##ence', '##iga', '##ili', '##ing', '##orestation', '##s', '##ting', '##tion', '(', ')', 'activity', 'adapt', 'adaptation', 'addressing', 'affect', 'agreement', 'agriculture', 'and', 'are', 'biodiversity', 'burning', 'can', 'cap', 'capture', 'carbon', 'cause', 'cc', 'change', 'city', 'climate', 'combat', 'contribute', 'coral', 'cycle', 'def', 'do', 'doe', 'economy', 'ecosystem', 'education', 'effect', 'emission', 'energy', 'event', 'example', 'extreme', 'food', 'footprint', 'forest', 'fossil', 'freshwater', 'fuel', 'gas', 'global', 'greenhouse', 'health', 'help', 'how', 'human', 'i', 'ice', 'impact', 'in', 'international', 'is', 'level', 'linked', 'main', 'melting', 'mental', 'mit', 'my', 'ocean', 'of', 'offset', 'on', 'paris', 'pattern', 'polar', 're', 'reduce', 'reef', 'refugee', 'region', 'renewable', 'resource', 'rise', 'role', 'sea', 'security', 'solution', 'some', 'source', 'storage', 'strategy', 'technology', 'the', 'their', 'to', 'transportation'] 105
['Carbon Footprint', 'Ca