In [1]:
import pandas as pd
df = pd.read_csv("test.csv", encoding='unicode_escape')

In [2]:
df.head()

Unnamed: 0,Questions,Answers
0,What is data analytics?,Data analytics is the process of examining raw...
1,What are the main methods used in data analytics?,Data analytics encompasses various methods suc...
2,How is data analytics used in business?,Data analytics is widely used in business to g...
3,What are the key steps in the data analytics p...,The data analytics process typically involves ...
4,What is the role of machine learning in data a...,Machine learning plays a crucial role in data ...


In [3]:
ques_list = df['Questions'].tolist()
ans_list = df['Answers'].tolist()

In [4]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
import nltk

In [6]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re

In [8]:
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    text = re.sub(r'[^\w\s]','',text)
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    stemmed_tokens = [stemmer.stem(token) for token in lemmatized_tokens]
    return ' '.join(stemmed_tokens)

In [9]:
def preprocess_with_stopwords(text):
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    text = re.sub(r'[^\w\s]','',text)
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    stemmed_tokens = [stemmer.stem(token) for token in lemmatized_tokens]
    return ' '.join(stemmed_tokens)

In [10]:
vectorizer = TfidfVectorizer(tokenizer = nltk.word_tokenize)

In [11]:
X = vectorizer.fit_transform([preprocess_text(q) for q in ques_list])

In [12]:
X

<8x18 sparse matrix of type '<class 'numpy.float64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [13]:
def get_responses(text):
    processed_text = preprocess_with_stopwords(text)
    print("Processed_Text : ",processed_text)
    vectorized_text = vectorizer.transform([processed_text])
    similarities = cosine_similarity(vectorized_text,X)
    print("similarities : ",similarities)
    max_similarity = np.max(similarities)
    print("max_similarity : ",max_similarity)
    
    if (max_similarity >0.6):
        high_similarity_ques =[q for q, s in zip(ques_list, similarities[0]) if s > 0.6]
        print("high_similarity_questions : ",high_similarity_ques)
        
        target_ans =[]
        for q in high_similarity_ques:
            q_index = ques_list.index(q)
            target_ans.append(ans_list[q_index])
        print(target_ans)
        
        
        Z = vectorizer.fit_transform([preprocess_with_stopwords(q) for q in high_similarity_ques])
        processed_text_with_stopwords = preprocess_with_stopwords(text)
        print("processed_text_with_stopwords : ",processed_text_with_stopwords)
        vectorized_text_with_stopwords = vectorizer.transform([processed_text_with_stopwords])
        final_similarities = cosine_similarity(vectorized_text_with_stopwords)
        closest = np.argmax(final_similarities)
        return target_ans[closest]
    
    else :
        return "Sorry I am unable to ans this question"
    

In [14]:
get_responses("Who is Sheikh Hasina?")

Processed_Text :  sheikh hasina
similarities :  [[0. 0. 0. 0. 0. 0. 0. 0.]]
max_similarity :  0.0


'Sorry I am unable to ans this question'

In [15]:
ques_list

['What is data analytics?',
 'What are the main methods used in data analytics?',
 'How is data analytics used in business?',
 'What are the key steps in the data analytics process?',
 'What is the role of machine learning in data analytics?',
 'What is the importance of data quality in data analytics?',
 'What are the ethical considerations in data analytics?',
 'What are some common challenges in data analytics?']

In [16]:
get_responses("What is  machine learning ")

Processed_Text :  machin learn
similarities :  [[0.         0.         0.         0.         0.77627227 0.
  0.         0.        ]]
max_similarity :  0.7762722680124386
high_similarity_questions :  ['What is the role of machine learning in data analytics?']
['Machine learning plays a crucial role in data analytics by enabling the development of algorithms that can automatically learn from data and make predictions or take actions without being explicitly programmed. It is used for tasks such as classification, regression, clustering, and anomaly detection.']
processed_text_with_stopwords :  machin learn


'Machine learning plays a crucial role in data analytics by enabling the development of algorithms that can automatically learn from data and make predictions or take actions without being explicitly programmed. It is used for tasks such as classification, regression, clustering, and anomaly detection.'

In [18]:
# from gingerit.gingerit import GingerIt

# text = 'What is Data Anlytics'

# parser = GingerIt()
# corrected_text = parser.parse(text)

# print(corrected_text['result'])