<a href="https://colab.research.google.com/github/Muhammad-Taufiq-Khan/TAUFIQ-NLP-Task-ML-Headless-Technologies-Limited/blob/main/_TAUFIQ_ML_Engineer_NLP_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Dependencies

In [None]:
import warnings
warnings.simplefilter("ignore")
import pandas as pd # data manipulation
from collections import Counter # count word frequency in a string
import re #for regular expression

## for data preprocessing
import nltk
# Download the following 3 (three) packages only 1 time during dependency setup
nltk.download('wordnet')
nltk.download('stopwords') 
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn
from nltk.stem.porter import PorterStemmer #for steming 
from nltk.stem import WordNetLemmatizer #for lemmatization

from nltk.corpus import stopwords 


## for similarity check
import spacy
! python -m spacy download en_core_web_lg  #Run only 1 time in terminal during dependency setup

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


2022-11-22 10:35:45.824359: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[K     |████████████████████████████████| 587.7 MB 14 kB/s 
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


# 2. User-defined functions to process FAQ data

In [None]:
# from dependencies import *

# Extra stopwords: after the author's own R&D these words were found
# necessary to treat as stopwords for this FAQ data.
new_stopwords = ['albert', 'einstein', 'date', 'name']

# English stopwords from NLTK followed by the custom additions [stpwrd]
stpwrd = [*nltk.corpus.stopwords.words('english'), *new_stopwords]

# spaCy pipeline used for the similarity checks below [nlp]
nlp = spacy.load("en_core_web_lg")


''' # FUNCTION 01: Preprocessing function'''
def preprocessing(length_of_df, feature_to_preprocess):
    """Clean, lowercase, de-stopword and lemmatize a sequence of records.

    Args:
        length_of_df: Number of leading records to process (at most this
            many are read from ``feature_to_preprocess``).
        feature_to_preprocess: Iterable of records, e.g. a DataFrame column
            or a list. Every record is passed through ``str()`` first, so
            non-string records (such as tuples) are accepted.

    Returns:
        list: One string per processed record, holding the lemmatized,
        non-stopword tokens joined by single spaces, in input order.
    """
    lemma = WordNetLemmatizer()
    non_letters = re.compile('[^a-zA-Z]')   # symbols, digits, punctuation -> space
    stop_set = set(stpwrd)                  # O(1) membership instead of scanning the list per word
    preprocessed_text_list = []
    # Walk the records positionally: zip() with range() takes the first
    # `length_of_df` items regardless of how the input is indexed, so a
    # Series with a non-default index no longer fails on label lookup.
    for _, record in zip(range(length_of_df), feature_to_preprocess):
        words = non_letters.sub(' ', str(record)).lower().split()
        kept = [lemma.lemmatize(word) for word in words if word not in stop_set]
        preprocessed_text_list.append(' '.join(kept))
    return preprocessed_text_list


'''# FUNCTION 02: Find hyponyms (specific words such as Hyponyms of Parent are father, mother) of each words in a question'''
def hyponyms(word):
    """Collect hyponyms (more specific terms) for each word of a text.

    The text is split on single spaces. For every token that WordNet knows,
    only the FIRST synset is used and the head word of each of its hyponym
    synsets is collected.

    Args:
        word: Space-separated text (one or more words).

    Returns:
        list: Hyponym head words in lookup order. Non-letter characters in
        a head word are replaced by spaces, so an entry may contain several
        space-separated words. Empty when nothing is found.
    """
    found = []
    for token in word.split(' '):
        synsets = wn.synsets(token)
        if not synsets:
            continue
        for synset in synsets[0].hyponyms():
            # A synset name looks like 'passing.n.02' (head.pos.sense).
            # Take the head from the name itself rather than slicing the
            # repr "Synset('passing.n.02')" with fixed offsets.
            head = synset.name().rsplit('.', 2)[0]
            found.append(re.sub('[^a-zA-Z]', ' ', head))
    return found


'''# FUNCTION 03: Find hypernym (generalized or abstract such as hypernym of father can be parent) of each words in a question'''
def hypernyms(word):
    """Collect hypernyms (more general terms) for each word of a text.

    The text is split on single spaces. For every token that WordNet knows,
    only the FIRST synset is used and the head word of each of its hypernym
    synsets is collected.

    Args:
        word: Space-separated text (one or more words).

    Returns:
        list: Hypernym head words in lookup order. Non-letter characters in
        a head word are replaced by spaces, so an entry may contain several
        space-separated words. Empty when nothing is found.
    """
    found = []
    for token in word.split(' '):
        synsets = wn.synsets(token)
        if not synsets:
            continue
        for synset in synsets[0].hypernyms():
            # A synset name looks like 'passing.n.02' (head.pos.sense).
            # Take the head from the name itself rather than slicing the
            # repr "Synset('passing.n.02')" with fixed offsets.
            head = synset.name().rsplit('.', 2)[0]
            found.append(re.sub('[^a-zA-Z]', ' ', head))
    return found


''' # FUNCTION 04: Attach hyponyms and hypernyms with qestion/answer'''
def attach_hyperhypo(preprocessed_texts: list, hyperhypo_similarity_threshold: float):
    """Expand each preprocessed text with similar hyponyms and hypernyms.

    For every text, the candidate words are the text's own words plus the
    single words of all its hyponyms and hypernyms. A candidate is kept for
    a base word when their spaCy similarity is strictly greater than the
    threshold. Duplicates are dropped, keeping first-appearance order.

    Args:
        preprocessed_texts: Space-separated preprocessed texts.
        hyperhypo_similarity_threshold: Similarity a candidate must exceed
            to be attached.

    Returns:
        list: One expanded, space-joined string per input text.
    """
    parsed = {}

    def _doc(token):
        # Parse every distinct token only once; the original re-ran the
        # pipeline on the same words inside the nested loop.
        if token not in parsed:
            parsed[token] = nlp(token)
        return parsed[token]

    text_with_hyperhypo_list = []
    for text in preprocessed_texts:
        base_words = text.split(" ")
        # Hyponym/hypernym entries may hold several words ('grand father'),
        # so join everything and split again to get single-word candidates.
        candidates = " ".join(base_words + hyponyms(text) + hypernyms(text)).split(" ")

        expanded = []
        for base_word in base_words:
            expanded.append(base_word)
            base_doc = _doc(base_word)
            expanded.extend(
                candidate
                for candidate in candidates
                if base_doc.similarity(_doc(candidate)) > hyperhypo_similarity_threshold
            )

        # dict.fromkeys keeps the first occurrence of each word, in order.
        text_with_hyperhypo_list.append(" ".join(dict.fromkeys(expanded)))
    return text_with_hyperhypo_list


''' # FUNCTION 05: Create Modified Dataset'''
def create_modified_dataset(base_df, new_df_name: str, preprocessed_FAQ: list, preprocessed_FAQ_Ans: list, FAQ_with_hyperhypo: list, FAQ_Ans_with_hyperhypo: list):
    """Build the hybrid dataset and write it to CSV.

    A copy of ``base_df`` receives the four given text columns plus three
    combined "corpus" columns, then is saved to ``new_df_name``. A
    confirmation line is printed; nothing is returned.

    Args:
        base_df: Source DataFrame; it is copied, not modified.
        new_df_name: Destination path passed to ``DataFrame.to_csv``.
        preprocessed_FAQ: Preprocessed questions.
        preprocessed_FAQ_Ans: Preprocessed answers.
        FAQ_with_hyperhypo: Questions expanded with hypernyms/hyponyms.
        FAQ_Ans_with_hyperhypo: Answers expanded with hypernyms/hyponyms.
    """
    row_count = len(base_df)

    def _combine(questions, answers):
        # Pair each question with its answer and run the pair through
        # preprocessing(), which stringifies the tuple and cleans it.
        return preprocessing(row_count, list(zip(questions, answers)))

    corpus_plain = _combine(preprocessed_FAQ, preprocessed_FAQ_Ans)
    corpus_question_expanded = _combine(FAQ_with_hyperhypo, preprocessed_FAQ_Ans)
    corpus_both_expanded = _combine(FAQ_with_hyperhypo, FAQ_Ans_with_hyperhypo)

    df_new = base_df.copy()
    df_new['preprocessed_FAQ'] = preprocessed_FAQ
    df_new['preprocessed_FAQ_Ans'] = preprocessed_FAQ_Ans
    df_new['FAQ_with_hyperhypo'] = FAQ_with_hyperhypo
    df_new['FAQ_Ans_with_hyperhypo'] = FAQ_Ans_with_hyperhypo
    df_new['Corpus'] = corpus_plain
    df_new['Corpus_new'] = corpus_question_expanded
    df_new['Corpus_with_hyperhypo'] = corpus_both_expanded

    df_new.to_csv(new_df_name)
    print('Hybrid dataset creation done')

# 3. User-defined functions to process a new test question / FAQ_test data

In [None]:
# Functions to process single test question

# from dependencies import *
# from functions import hypernyms, hyponyms, nlp, stpwrd


'''# FUNCTION 06: Function to preprocess test FAQ'''
def preprocessing_single_ques(single_ques_test):
    """Preprocess one question string.

    Non-letter characters become spaces, the text is lowercased and split
    on whitespace, stopwords are dropped and the remaining words are
    lemmatized.

    Args:
        single_ques_test: The raw question text.

    Returns:
        str: The surviving lemmatized words joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    letters_only = re.sub('[^a-zA-Z]', ' ', single_ques_test)
    kept = []
    for token in letters_only.lower().split():
        if token in stpwrd:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)


'''# FUNCTION 07: Attach hypernyms and hyponyms with preprocessed test FAQ text'''
def attach_hyperhypo_single(question, hyperhypo_similarity_threshold: float = 0.82):
    """Expand one preprocessed question with similar hyponyms and hypernyms.

    The candidate words are the question's own words plus the single words
    of all its hyponyms and hypernyms. A candidate is kept for a base word
    when their spaCy similarity is strictly greater than the threshold.
    Duplicates are dropped, keeping first-appearance order.

    Args:
        question: Space-separated preprocessed question text.
        hyperhypo_similarity_threshold: Similarity a candidate must exceed
            to be attached. Defaults to 0.82, the value that was
            previously hard-coded here.

    Returns:
        str: The expanded question as a space-joined string.
    """
    parsed = {}

    def _doc(token):
        # Parse every distinct token only once; the original re-ran the
        # pipeline on the same words inside the nested loop.
        if token not in parsed:
            parsed[token] = nlp(token)
        return parsed[token]

    base_words = question.split(" ")
    # Hyponym/hypernym entries may hold several words ('grand father'),
    # so join everything and split again to get single-word candidates.
    candidates = " ".join(base_words + hyponyms(question) + hypernyms(question)).split(" ")

    expanded = []
    for base_q in base_words:
        expanded.append(base_q)
        base_doc = _doc(base_q)
        expanded.extend(
            candidate
            for candidate in candidates
            if base_doc.similarity(_doc(candidate)) > hyperhypo_similarity_threshold
        )

    # dict.fromkeys keeps the first occurrence of each word, in order.
    return " ".join(dict.fromkeys(expanded))


''' # FUNCTION 08: Find most similar FAQ of test FAQ and answer it'''
def answer(question_test: str, base_df, base_feature: str ):
    """Answer a question with the answer of the best-matching stored FAQ.

    The question is preprocessed and expanded with hypernyms/hyponyms, then
    matched against the ``base_feature`` column of ``base_df``:

    1. If any stored FAQ shares at least one word with the question, the
       LAST such FAQ (in row order) is selected.
    2. Otherwise the FAQ with the highest spaCy similarity is selected, and
       every other FAQ whose score is within 0.1 of that maximum is treated
       as "almost similar".

    The asked question and the selected answer are printed, followed by
    the answers of any almost-similar FAQs on the same line.

    Args:
        question_test: The raw question asked by the user.
        base_df: DataFrame holding the ``base_feature`` column and an
            'Answer' column; rows are addressed by position.
        base_feature: Name of the column to match the question against.

    Returns:
        The 'Answer' value of the selected FAQ (almost-similar answers are
        printed only, not returned).

    Raises:
        ValueError: If ``base_df`` has no rows to match against.
    """
    question_text = attach_hyperhypo_single(preprocessing_single_ques(question_test))
    # split() without arguments drops empty tokens, so an empty question
    # can never "match" an empty FAQ text.
    question_tokens = set(question_text.split())

    # An empty text cell comes back from a CSV round-trip as NaN (a float);
    # treat any non-string cell as empty text instead of crashing on .split().
    faq_texts = [faq if isinstance(faq, str) else "" for faq in base_df[base_feature]]
    if not faq_texts:
        raise ValueError(f"No FAQ rows available in column {base_feature!r}")

    answers = base_df['Answer']

    # NOTE(review): every overlapping FAQ overwrites the key, so the last
    # FAQ sharing any word wins rather than the one with the best overlap.
    # Kept as-is to preserve existing results -- confirm this is intended.
    most_similar_FAQ_key = -1
    for num, faq in enumerate(faq_texts):
        if question_tokens.intersection(faq.split()):
            most_similar_FAQ_key = num

    similar_FAQ_key = []    # positions of FAQs scoring close to the best one
    if most_similar_FAQ_key == -1:
        # No shared word: fall back to vector similarity. Scores are only
        # needed on this path, so they are computed only here.
        question_doc = nlp(question_text)
        similarity_dict = {
            num: question_doc.similarity(nlp(faq)) for num, faq in enumerate(faq_texts)
        }
        most_similar_FAQ_score = max(similarity_dict.values())
        most_similar_FAQ_key = max(similarity_dict, key=similarity_dict.get)
        # threshold = 0.1: FAQs scoring no less than (best score - 0.1),
        # other than the best one itself, count as almost similar.
        similar_FAQ_key = [
            key
            for key, value in similarity_dict.items()
            if value >= (most_similar_FAQ_score - 0.1) and key != most_similar_FAQ_key
        ]

    # Keys are enumerate() positions, so look rows up by position (.iloc).
    most_similar_FAQ_Answer = answers.iloc[most_similar_FAQ_key]

    print(f"Asked Question: {question_test}")
    print(f"Answer: {most_similar_FAQ_Answer}", end = ' ')
    for key in similar_FAQ_key:
        print(answers.iloc[key], end = ' ')
    print()
    return most_similar_FAQ_Answer

# 4. Process FAQ dataset

In [None]:
# from functions import *

""" FETCH DATA """
# Source CSV with the 'Question' and 'Answer' columns used below.
FAQs_link = "https://raw.githubusercontent.com/Muhammad-Taufiq-Khan/TAUFIQ-NLP-Task-ML-Headless-Technologies-Limited/main/FAQs.csv"
df = pd.read_csv(FAQs_link)
print("Fetched data - FAQ")


""" PREPROCESSING """
df_len = df.shape[0]    # number of FAQ rows

preprocessed_FAQ = preprocessing(df_len, df['Question'])
print("Preprocessing done - FAQ")

preprocessed_FAQ_Ans = preprocessing(df_len, df['Answer'])
print("Preprocessing done - FAQ Ans")


""" ATTACHING HYPONYMS AND HYPERNYMS WITH PREPROCESSED TEXTS """
threshold = 0.6 # hyperhypo similarity threshold

FAQ_with_hyperhypo = attach_hyperhypo(preprocessed_FAQ, hyperhypo_similarity_threshold=threshold)
print('Attaching hyper-hypo done - FAQ')

FAQ_Ans_with_hyperhypo = attach_hyperhypo(preprocessed_FAQ_Ans, hyperhypo_similarity_threshold=threshold)
print('Attaching hyper-hypo done - FAQ Ans')


""" CREATING HYBRID DATASET """
# Writes 'hybrid_dataset.csv' and prints a confirmation line.
create_modified_dataset(
    df,
    'hybrid_dataset.csv',
    preprocessed_FAQ,
    preprocessed_FAQ_Ans,
    FAQ_with_hyperhypo,
    FAQ_Ans_with_hyperhypo,
)


Fetched data - FAQ
Preprocessing done - FAQ
Preprocessing done - FAQ Ans
Attaching hyper-hypo done - FAQ
Attaching hyper-hypo done - FAQ Ans
Hybrid dataset creation done


# 5. Test by asking new question or by the FAQ_test dataset

In [None]:
# from function_FAQ_test import *
# Read the hybrid dataset from the same relative path it was written to
# ('hybrid_dataset.csv'). The former absolute '/content/...' path only
# resolved when the working directory was Colab's /content.
modified_df_path = 'hybrid_dataset.csv'
modified_df = pd.read_csv(modified_df_path)
FAQs_test_link = "https://raw.githubusercontent.com/Muhammad-Taufiq-Khan/TAUFIQ-NLP-Task-ML-Headless-Technologies-Limited/main/FAQs_test.csv"
df_test = pd.read_csv(FAQs_test_link)


''' # Test by All FAQ-test with multiple base features '''
# Optional comparison run over every candidate base feature:
# base_feature_names = ['preprocessed_FAQ', 'FAQ_with_hyperhypo', 'Corpus', 'Corpus_new', 'Corpus_with_hyperhypo']
# for i, base_feature in enumerate(base_feature_names):
#     print(f"\n #{i+1}. Base Feature: {base_feature}")
#     for question in df_test['Question']:
#         Ans = answer(question, modified_df, base_feature); print()


''' # Test by all FAQ-test based on best base-feature '''
# answer() prints the question and its answer; the extra print() leaves a
# blank line between consecutive questions.
for question in df_test['Question']:
    Ans = answer(question, modified_df, 'FAQ_with_hyperhypo')
    print()

Asked Question: What is the date of his death?
Answer: He was married to Mileva Marić between 1903 and 1919. They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910). He married Elsa Löwenthal in 1919 and they lived together until her death in 1936. He died 18 April 1955 in Princeton, New Jersey, USA. His father was Hermann Einstein and his mother was Pauline Einstein (born Koch). He had one sister named Maja. 

Asked Question: Did Einstein have siblings?
Answer: He had one sister named Maja. 

Asked Question: Who was his wife?
Answer: He had one sister named Maja. He was married to Mileva Marić between 1903 and 1919. They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910). He married Elsa Löwenthal in 1919 and they lived together until her death in 1936. 

Asked Question: What was Einstein's father's name?
Answer: He had one sister named Maja. His father was Hermann Einstein and his mother was Pauline Einstein (b

In [None]:
''' # Test by single FAQ-test/ new question based on best base-feature '''
# Ask one ad-hoc question, matched against the 'FAQ_with_hyperhypo' column.
# answer() prints the question and the answer itself.
question = "Did Albert Einstein had any baby?"
Ans = answer(
    question_test=question,
    base_df=modified_df,
    base_feature='FAQ_with_hyperhypo',
)


Asked Question: Did Albert Einstein had any baby?
Answer: He was married to Mileva Marić between 1903 and 1919. They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910). He married Elsa Löwenthal in 1919 and they lived together until her death in 1936. His father was Hermann Einstein and his mother was Pauline Einstein (born Koch). 
