# Chemical Element Reference Chatbot
# Melek Mizher - November 2022

In [None]:
import requests
from bs4 import BeautifulSoup
import wikipedia
import pandas as pd
import nltk
import random
import regex as re
import sys
import numpy as np

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

import gensim
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from transformers import pipeline
from sentence_transformers import SentenceTransformer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#!pip install beautifulsoup4
#!pip install wikipedia

In [None]:
# Fetch the NLTK resources used by this notebook; quiet=True suppresses the progress output.
for resource in ('stopwords', 'wordnet', 'punkt', 'omw-1.4'):
    nltk.download(resource, quiet=True)

### Suppress warning messages

In [None]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (silent stand-in for ``warnings.warn``)."""
    return None


# Swap the module attribute so calls made through ``warnings.warn`` emit nothing.
warnings.warn = warn

# Dataset Extraction from Wikipedia

In [None]:
# All 118 chemical elements, ordered by atomic number (position + 1 == atomic number).
# Each name is also used as a Wikipedia page title and as the lookup key for
# user input, so the spelling must be exact (element 114 is "Flerovium").

Elements = "Hydrogen,Helium,Lithium,Beryllium,Boron,Carbon,Nitrogen,Oxygen,Fluorine,Neon,Sodium,Magnesium,Aluminum,Silicon,Phosphorus,Sulfur,Chlorine,Argon,Potassium,Calcium,Scandium,Titanium,Vanadium,Chromium,Manganese,Iron,Cobalt,Nickel,Copper,Zinc,Gallium,Germanium,Arsenic,Selenium,Bromine,Krypton,Rubidium,Strontium,Yttrium,Zirconium,Niobium,Molybdenum,Technetium,Ruthenium,Rhodium,Palladium,Silver,Cadmium,Indium,Tin,Antimony,Tellurium,Iodine,Xenon,Cesium,Barium,Lanthanum,Cerium,Praseodymium,Neodymium,Promethium,Samarium,Europium,Gadolinium,Terbium,Dysprosium,Holmium,Erbium,Thulium,Ytterbium,Lutetium,Hafnium,Tantalum,Tungsten,Rhenium,Osmium,Iridium,Platinum,Gold,Mercury,Thallium,Lead,Bismuth,Polonium,Astatine,Radon,Francium,Radium,Actinium,Thorium,Protactinium,Uranium,Neptunium,Plutonium,Americium,Curium,Berkelium,Californium,Einsteinium,Fermium,Mendelevium,Nobelium,Lawrencium,Rutherfordium,Dubnium,Seaborgium,Bohrium,Hassium,Meitnerium,Darmstadtium,Roentgenium,Copernicium,Nihonium,Flerovium,Moscovium,Livermorium,Tennessine,Oganesson"

In [None]:
# Element symbols in atomic-number order, written one periodic-table period per
# line; adjacent string literals are joined into a single comma-separated string.
Elements_Symbols = (
    "H,He,"
    "Li,Be,B,C,N,O,F,Ne,"
    "Na,Mg,Al,Si,P,S,Cl,Ar,"
    "K,Ca,Sc,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Ga,Ge,As,Se,Br,Kr,"
    "Rb,Sr,Y,Zr,Nb,Mo,Tc,Ru,Rh,Pd,Ag,Cd,In,Sn,Sb,Te,I,Xe,"
    "Cs,Ba,La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu,Hf,Ta,W,Re,Os,Ir,Pt,Au,Hg,Tl,Pb,Bi,Po,At,Rn,"
    "Fr,Ra,Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr,Rf,Db,Sg,Bh,Hs,Mt,Ds,Rg,Cn,Nh,Fl,Mc,Lv,Ts,Og"
)

In [None]:
# Turn both comma-separated strings into parallel lists (same order in each).
Elements, Elements_Symbols = Elements.split(','), Elements_Symbols.split(',')

### Generate Chemical Element Dataset by extracting Wikipedia Articles

In [None]:
# Download each element's Wikipedia article, drop the trailing sections, and
# store the remaining text in a module-level variable named after the element.
for Element in Elements:
    # "Mercury" on its own hits a disambiguation problem, so ask for the element page.
    page_title = "Mercury_(element)" if Element == "Mercury" else Element
    article = wikipedia.page(page_title, auto_suggest=False).content

    # Keep only the text before the first occurrence of each heading, in this
    # order (References, then See also, then Notes).
    for heading in ('\n\n\n== References', '\n\n\n== See also', '\n\n\n== Notes'):
        article = article.split(sep=heading)[0]

    globals()[Element] = article

### Generate List from Variables

In [None]:
# Read each element's article text back from the module-level variable that
# carries the element's name, keeping the order of Elements.
Data = [globals()[Element] for Element in Elements]

In [None]:
# One row per element: name, symbol and article text (the three lists are parallel).
raw_corpus_df = pd.DataFrame({'Element':Elements, 'Symbol':Elements_Symbols, 'Data':Data})

In [None]:
# Rows follow atomic-number order and the default index starts at 0,
# so index + 1 is the atomic number.
raw_corpus_df["Atomic Number"] = raw_corpus_df.index+1

# Dataset has been created with Element name, symbol, wikipedia article text, and atomic number

In [None]:
# index=False keeps the row index out of the file.
raw_corpus_df.to_csv("Chemical_Element_Data.csv", index=False)

# Dataset is now saved as a CSV file for fast loading

### Load Dataset from CSV

In [None]:
# Reload the dataset written by the extraction step. This cell must run before
# any cell that uses corpus_df or final_corpus.
corpus_df = pd.read_csv("Chemical_Element_Data.csv")

In [None]:
corpus_df

# Dataset Preprocessing

In [None]:
def remove_punctuation(text):
    """Return ``str(text)`` with every character outside A-Z / a-z replaced by a space.

    Despite the name, digits, whitespace and non-ASCII letters are replaced
    too, one space per character, so the string length is unchanged.
    """
    as_string = str(text)
    return re.sub('[^a-zA-Z]', ' ', as_string)

def lower_case(text):
    """Return a lowercase copy of ``text`` (calls its ``lower()`` method)."""
    lowered = text.lower()
    return lowered

def remove_tags(text):
    """Replace every markup tag in ``text`` with the placeholder ``" <> "``.

    A tag is the shortest run that starts at ``<`` (optionally followed by
    ``/``) and ends at the next ``>`` on the same line.

    The pattern uses literal angle brackets. The HTML-escaped spelling
    (``&lt;`` / ``&gt;``) only matches text that is already escaped, so real
    tags such as ``<b>`` were never replaced.

    Args:
        text: String to clean.

    Returns:
        ``text`` with each tag replaced by ``" <> "``; unchanged when it
        contains no tags.
    """
    return re.sub(r"</?.*?>", " <> ", text)

def remove_special_chars_and_digits(text):
    """Collapse each run of digits and non-word characters in ``text`` into one space.

    Underscores count as word characters and are therefore kept.
    """
    pattern = r"(\d|\W)+"
    return re.sub(pattern, " ", text)

def tokenize(document, tokenize="word"):
    """Split ``document`` into tokens with NLTK.

    ``tokenize="word"`` (the default) uses ``nltk.word_tokenize``; any other
    value uses ``nltk.sent_tokenize``.
    """
    splitter = nltk.word_tokenize if tokenize == "word" else nltk.sent_tokenize
    return splitter(document)

In [2]:
def WalterProcess(document):
    """Clean ``document`` for the question-answering models and return it as one string.

    Applies, in order: remove_punctuation, lower_case, remove_tags and
    remove_special_chars_and_digits. Tokenization is not applied.
    """
    cleaning_steps = (
        remove_punctuation,
        lower_case,
        remove_tags,
        remove_special_chars_and_digits,
    )
    for step in cleaning_steps:
        document = step(document)

    return document

In [3]:
# Same table as corpus_df, but with every article text run through WalterProcess;
# assign() returns a new DataFrame and leaves corpus_df unchanged.
final_corpus = corpus_df.assign(Data=corpus_df['Data'].apply(WalterProcess))

NameError: name 'corpus_df' is not defined

In [4]:
final_corpus

NameError: name 'final_corpus' is not defined

In [5]:
# Save the cleaned corpus; index=False keeps the row index out of the file.
final_corpus.to_csv("Final_Chemical_Element_Data.csv", index=False)

NameError: name 'final_corpus' is not defined

# Debugging Section

In [26]:
#Ignore this Section

# Walter Models

In [38]:
from sentence_transformers import SentenceTransformer

In [52]:
# Checkpoints selectable through Walter's ``model`` argument:
# model id -> keyword arguments passed to transformers.pipeline().
_WALTER_MODELS = {
    1: {'model': 'distilbert-base-cased-distilled-squad', 'revision': '626af31'},
    2: {'model': 'deepset/roberta-base-squad2'},
    3: {'model': 'deepset/tinyroberta-squad2'},
    4: {'model': 'deepset/minilm-uncased-squad2'},
    5: {'model': 'mrm8488/longformer-base-4096-finetuned-squadv2'},
}

# Pipelines already built in this session, keyed by model id, so that each
# checkpoint is loaded once instead of on every question.
_WALTER_PIPELINES = {}


def Walter(model, question, context_index):
    """Answer ``question`` using the article stored at ``final_corpus.Data[context_index]``.

    Args:
        model: Model id from 1 to 5 (see ``_WALTER_MODELS``). A callable is
            also accepted and used directly as the question-answering pipeline.
        question: The user's question; it is cleaned with ``WalterProcess``
            before being sent to the model.
        context_index: Index into ``final_corpus.Data`` selecting the element
            whose article is used as context.

    Returns:
        The ``'answer'`` entry of the pipeline's response.

    Raises:
        ValueError: If ``model`` is neither a callable nor a known model id.
    """
    # Resolve the model first so a bad id fails fast with a clear message
    # rather than "'int' object is not callable" after the context lookup.
    if callable(model):
        qa_pipeline = model
    elif model in _WALTER_MODELS:
        qa_pipeline = _WALTER_PIPELINES.get(model)
        if qa_pipeline is None:
            qa_pipeline = pipeline(**_WALTER_MODELS[model])
            _WALTER_PIPELINES[model] = qa_pipeline
    else:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(_WALTER_MODELS)}"
        )

    # Selects Appropriate Chemical Element Dataset
    context = final_corpus.Data[context_index]

    # Process Question
    question = WalterProcess(question)

    # Generate Response
    response = qa_pipeline(question=question, context=context)

    return response['answer']

In [60]:
# Quick manual check: ask one model a single question about one element.
model=2
question = "Why can lithium explode?"
current_index = 2  # index into final_corpus.Data; presumably the Lithium row (third element in the list)

response = Walter(model, question, current_index)
print(response)

Downloading:   0%|          | 0.00/6.91k [00:00<?, ?B/s]

neutrons


In [58]:
# Manual Generation for Quick DataFrame Generation
# Runs the same question through every model id (1-5) and prints each id
# followed by that model's answer.

question = "What type of element is Iron?"
current_index = 25

for model in range(1, 6):
    response = Walter(model, question, current_index)
    print(model, response, sep="\n")

Downloading:   0%|          | 0.00/3.64k [00:00<?, ?B/s]

1
anionic complexes


Downloading:   0%|          | 0.00/6.91k [00:00<?, ?B/s]

2
chemical


Downloading:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

3
cell contains free iron


Downloading:   0%|          | 0.00/3.48k [00:00<?, ?B/s]

4
iron is the most abundant element on earth


Downloading:   0%|          | 0.00/3.50k [00:00<?, ?B/s]

5
free iron


# Create Chatbot Program with Basic Functions

In [None]:
if __name__ == "__main__":

    # Replies that end the conversation, and replies that count as "yes".
    quit_words = {'quit', 'cancel', 'exit', 'escape'}
    yes_words = {'yes', 'y', 'yep', 'correct', 'got it'}

    # Introduction
    print("Walter:  Hello, my name is Walter. I know information about all the known Chemical Elements.")
    print("Walter:  What should I call you?\n")
    username = input().strip()
    if username.lower() in quit_words:
        print(f"Walter:  Good bye!")
        sys.exit()

    # NOTE(review): this random pick is overwritten with model 1 just before
    # Walter is called below, so it currently has no effect. Confirm which of
    # the two is intended.
    model = random.randrange(1,6) # Random Model Selection from 1 to 5

    print(f"{username}:  {username}.\n")
    print(f"Walter:  Perfect, {username}. I would like to tell you that I am allergic to punctuation.")

    # Begin Loop: each pass handles one element and one question.
    while True:

        print("Walter:  What Element can I help you learn more about?\n")
        user_response = input()
        print(f"{username}:  {user_response}\n")
        # Normalise after echoing, so stray spaces or capitals don't break matching.
        user_response = user_response.strip().lower()
        if user_response in quit_words:
            print(f"Walter:  Good bye {username}!")
            break

        # Names and symbols are stored capitalised (e.g. "Iron", "Fe").
        current_element = user_response.capitalize()

        # Resolve the input to a row index, by element name first, then by symbol.
        if current_element in final_corpus['Element'].values:
            current_index = final_corpus[final_corpus['Element']==current_element].index[0]
        elif current_element in final_corpus['Symbol'].values:
            current_index = final_corpus[final_corpus['Symbol']==current_element].index[0]
            current_element = final_corpus['Element'][current_index]
        else:
            # Misunderstood user query.
            print(f"Walter:  Sorry {username}, I didn't quite understand you. Let's try again.")
            continue

        # Confirm interest in the given Element.
        print(f"Walter:  {username}, I see that you want to learn more about {current_element}, is that correct?\n")
        user_response = input()
        print(f"{username}:  {user_response}\n")
        user_response = user_response.strip().lower()

        # Misunderstanding: abort this round and ask for an element again.
        if user_response not in yes_words:
            print(f"Walter:  I'm sorry {username}. I must have misunderstood you. Let's try again.")
            continue

        # Proceed with the question.
        print(f"Walter:  Got it, {username}!, What would you like to learn about {current_element}?\n")
        user_query = input()
        print(f"{username}:  {user_query}\n")
        user_query = user_query.strip().lower()

        if 'summary' in user_query:
            # Give the Wikipedia summary; Mercury needs the explicit element page title.
            temp = "Mercury_(element)" if current_element == "Mercury" else current_element
            summary = wikipedia.summary(temp, sentences=4, auto_suggest=False)
            print(f"Walter:  {summary}\n")
        else:
            # This is where the magic happens
            model = 1
            print(f"Walter:  Give me a few seconds to think about it.\n")
            response = Walter(model, user_query, current_index)
            print(f"Walter:  {response}\n")

        # Check if user wants to ask another question.
        print(f"Walter:  Do you have any other questions?\n")
        user_response = input()
        print(f"{username}: {user_response}\n")
        user_response = user_response.strip().lower()
        if user_response not in yes_words:
            print(f"Walter:  Good bye {username}! It was a pleasure to help you.")
            break


In [None]:
# Word Analysis Leveraging Tendency of Elements Restricted Reasoning: WALTERR, or WALTER for short.

In [None]:
#response = requests.get(url="https://en.wikipedia.org/wiki/Hydrogen")
#print(response.status_code)

In [None]:
#soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# NOTE(review): `f` is not defined anywhere in the visible notebook; this cell
# presumably expects an already-open text file handle. Confirm before running.
raw=f.read()
raw=raw.lower()# converts to lowercase


sent_tokens = nltk.sent_tokenize(raw)# converts to list of sentences
word_tokens = nltk.word_tokenize(raw)# converts to list of words


import string  # needed for string.punctuation below

lemmer = nltk.stem.WordNetLemmatizer()
def LemTokens(tokens):
    """Return a list with the WordNet lemma of every token in ``tokens``."""
    return [lemmer.lemmatize(token) for token in tokens]
# Translation table for str.translate that deletes every ASCII punctuation character.
remove_punct_dict = {ord(punct): None for punct in string.punctuation}
def LemNormalize(text):
    """Lowercase ``text``, delete punctuation, word-tokenize it and lemmatize each token."""
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Generating response
def response(user_response):
    """Return the sentence in ``sent_tokens`` most similar to its last entry.

    ``user_response`` itself is not read: the query is taken to be the last
    element of ``sent_tokens``, so the caller must append it before calling.
    Similarity is the cosine similarity of TF-IDF vectors built with
    ``LemNormalize`` as tokenizer and English stop words removed.

    Returns the second-highest-scoring sentence, or a fixed apology string
    when that score is 0.
    """
    vectorizer = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = vectorizer.fit_transform(sent_tokens)
    similarities = cosine_similarity(tfidf[-1], tfidf)

    # The top score normally belongs to the query itself, so use the runner-up.
    runner_up = similarities.argsort()[0][-2]
    if similarities[0][runner_up] == 0:
        return "I am sorry! I don't understand you"
    return sent_tokens[runner_up]


# Console loop: read a line, answer it, and stop on 'bye', 'thanks' or 'thank you'.
# NOTE(review): `greeting` is not defined anywhere in the visible notebook.
# Confirm it exists before running this cell.
flag=True
print("ROBO: My name is Robo. I will answer your queries about Chatbots. If you want to exit, type Bye!")

while(flag==True):
    user_response = input()
    user_response=user_response.lower()
    if(user_response!='bye'):
        if(user_response=='thanks' or user_response=='thank you' ):
            flag=False
            print("ROBO: You are welcome..")
        else:
            if(greeting(user_response)!=None):
                print("ROBO: "+greeting(user_response))
            else:
                # Temporarily append the query: response() compares the last
                # entry of sent_tokens against all the others.
                sent_tokens.append(user_response)
                word_tokens=word_tokens+nltk.word_tokenize(user_response)
                # final_words is not read anywhere in the visible code.
                final_words=list(set(word_tokens))
                print("ROBO: ",end="")
                print(response(user_response))
                # Remove the query again so it does not become part of the corpus.
                sent_tokens.remove(user_response)
    else:
        flag=False
        print("ROBO: Bye! take care..")
