In [157]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import json
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

In [158]:
# Load the dialog pairs: a tab-separated text file whose two columns are
# named explicitly here via `names`.
data = pd.read_csv(
    'dialogs.txt',
    sep='\t',
    names=['Question', 'Answer'],
)

In [159]:
# Plain Python lists of the two columns; position i in one list pairs with
# position i in the other.
question_list, answer_list = (
    data[column].tolist() for column in ('Question', 'Answer')
)

In [160]:
# Display the loaded frame as the cell output (the rendered table below is
# truncated in the middle, showing only the first and last rows).
data

Unnamed: 0,Question,Answer
0,"hi,",hi how can i help you
1,"hi, how are you doing?",i'm fine. how about yourself?
2,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
3,i'm pretty good. thanks for asking.,no problem. so how have you been?
4,no problem. so how have you been?,i've been great. what about you?
...,...,...
3721,that's a good question. maybe it's not old age.,are you right-handed?
3722,are you right-handed?,yes. all my life.
3723,yes. all my life.,you're wearing out your right hand. stop using...
3724,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


In [161]:
def preprocess(text):
    """Normalize ``text`` for TF-IDF, dropping English stopwords.

    Steps, in order: remove every character that is neither a word character
    nor whitespace, lowercase, tokenize with ``nltk.word_tokenize``, discard
    tokens found in NLTK's English stopword list, lemmatize, then stem.

    Punctuation is stripped *before* tokenization, so hyphenated or
    apostrophized words are fused (e.g. "right-handed" becomes "righthanded"
    prior to stemming).

    Args:
        text: The raw string to normalize.

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    # Look the stopword list up once and keep it as a set: previously it was
    # re-fetched and scanned linearly for every single token.
    english_stopwords = set(stopwords.words('english'))
    text = re.sub(r'[^\w\s]', '' , text)
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in english_stopwords]
    lemmatize_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    stemmed_tokens = [stemmer.stem(token) for token in lemmatize_tokens]
    return ' '.join(stemmed_tokens)

In [162]:
def preprocess_with_stopwords(text):
    """Normalize ``text`` while keeping stopwords.

    Removes every character that is neither a word character nor whitespace,
    lowercases, tokenizes with ``nltk.word_tokenize``, then lemmatizes and
    stems each token.

    Args:
        text: The raw string to normalize.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    porter = PorterStemmer()
    without_punctuation = re.sub(r'[^\w\s]', '' , text)
    words = nltk.word_tokenize(without_punctuation.lower())
    # Lemmatize first, then stem the lemma, one token at a time.
    return ' '.join(porter.stem(wordnet.lemmatize(word)) for word in words)

In [163]:
# One TF-IDF model fitted over questions and answers together. X has one row
# per corpus entry: the question rows come first, the answer rows after them.
corpus = [*question_list, *answer_list]
vectorizer = TfidfVectorizer(tokenizer=nltk.word_tokenize)
X = vectorizer.fit_transform(list(map(preprocess, corpus)))



In [164]:
def get_response(text, threshold=0.6, verbose=True):
    """Return the stored answer whose question best matches ``text``.

    Relies on the notebook-level ``vectorizer``, ``X``, ``question_list`` and
    ``answer_list``. The query is scored against ``X`` with cosine similarity;
    questions scoring above ``threshold`` become candidates, the candidates
    are re-vectorized with stopwords kept, and the answer paired with the
    best-scoring candidate is returned.

    NOTE(review): the query is normalized with ``preprocess_with_stopwords``
    while ``X`` was built from ``preprocess`` output (stopwords removed) --
    confirm that this asymmetry is intended.

    Args:
        text: The user's question.
        threshold: Minimum cosine similarity (exclusive) for a stored
            question to count as a match.
        verbose: When true, print the intermediate values.

    Returns:
        The matching answer, or "I can't answer this Question" when no stored
        question scores above ``threshold``.
    """
    processed_text = preprocess_with_stopwords(text)
    vectorized_text = vectorizer.transform([processed_text])
    similarities = cosine_similarity(vectorized_text, X)

    # X may also contain rows for the answers after the question rows; only
    # the first len(question_list) rows line up with question_list. Scoring
    # just those rows keeps an answer-only match from passing the threshold
    # with no candidate question, which used to fail during ranking instead
    # of returning the fallback.
    question_similarities = similarities[0][:len(question_list)]
    max_similarities = question_similarities.max() if question_similarities.size else 0.0

    if verbose:
        print('processed_text:', processed_text)
        print('similarities:',similarities)
        print('max_similarities:',max_similarities)

    matched_rows = np.flatnonzero(question_similarities > threshold)
    if matched_rows.size == 0:
        return "I can't answer this Question"

    # Select by row index so that a question appearing several times keeps
    # the answer of its own row (a text lookup always hit the first one).
    high_similarities_questions = [question_list[row] for row in matched_rows]
    target_answers = [answer_list[row] for row in matched_rows]
    if verbose:
        print('high_similarities_questions:', high_similarities_questions)
        print(target_answers)
        print('processed_with_stopwords:',processed_text)

    # Use the same vectorizer for both input and high similarity questions.
    # The query vector computed above is reused: it was already produced by
    # preprocess_with_stopwords, so recomputing it gives the same vector.
    Z = vectorizer.transform([preprocess_with_stopwords(q) for q in high_similarities_questions])
    final_similarities = cosine_similarity(vectorized_text, Z)
    closet = np.argmax(final_similarities)
    return target_answers[closet]


In [165]:
# Smoke test with a question that appears verbatim in the displayed data; the
# recorded output below shows a top similarity of 1.0 and its paired answer.
get_response("are you right-handed?")

processed_text: are you righthand
similarities: [[0. 0. 0. ... 0. 0. 0.]]
max_similarities: 1.0
high_similarities_questions: ['are you right-handed?']
['yes. all my life.']
processed_with_stopwords: are you righthand


'yes. all my life.'

In [167]:
!pip install streamlit

Collecting streamlit
  Obtaining dependency information for streamlit from https://files.pythonhosted.org/packages/d6/1f/d3b33ca37a147a428581ec8b4834e63cb6f3e7116acf4e2e10f851f45a97/streamlit-1.27.1-py2.py3-none-any.whl.metadata
  Downloading streamlit-1.27.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Obtaining dependency information for altair<6,>=4.0 from https://files.pythonhosted.org/packages/f2/b4/02a0221bd1da91f6e6acdf0525528db24b4b326a670a9048da474dfe0667/altair-5.1.1-py3-none-any.whl.metadata
  Downloading altair-5.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Using cached blinker-1.6.2-py3-none-any.whl (13 kB)
Collecting importlib-metadata<7,>=1.4 (from streamlit)
  Obtaining dependency information for importlib-metadata<7,>=1.4 from https://files.pythonhosted.org/packages/cc/37/db7ba97e676af155f5fcb1a35466f446eadc9104e25b83366e8088c9c926/importlib_metadata-6.8.0-py3-none-any.whl.metadata
  Downl