In [None]:
!pip install --upgrade scikit-learn
!pip install sentence_transformers
!pip install textblob

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m92.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.3.2
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
from textblob import TextBlob
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [None]:
# Download the NLTK data packages that the text preprocessing in this notebook
# relies on. Each call is a no-op apart from a log line when the package is
# already present in the local nltk_data directory.
import nltk
nltk.download('punkt')      # tokenizer models, needed by word_tokenize
nltk.download('stopwords')  # stop-word lists, read via stopwords.words("english")
nltk.download('wordnet')    # WordNet corpus, needed by WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
class TextPreprocessor:
    """Turns raw text into a space-separated string of lemmatized tokens."""

    def __init__(self):
        """Create the lemmatizer and load the English stop-word set."""
        self.lemmatizer = WordNetLemmatizer()
        english_stop_words = stopwords.words("english")
        self.stop_words = set(english_stop_words)

    def preprocess_text(self, text):
        """Lower-case, tokenize, filter and lemmatize ``text``.

        Tokens that are not purely alphanumeric (per ``str.isalnum``) or that
        appear in ``self.stop_words`` are discarded. The surviving tokens are
        lemmatized and joined with single spaces.

        Args:
            text: The raw input string.

        Returns:
            The cleaned text as a single string (empty when nothing survives).
        """
        kept_tokens = []
        for token in word_tokenize(text.lower()):
            # Drop punctuation and other non-alphanumeric tokens.
            if not token.isalnum():
                continue
            # Drop stop words before spending time on lemmatization.
            if token in self.stop_words:
                continue
            kept_tokens.append(self.lemmatizer.lemmatize(token))

        return " ".join(kept_tokens)

In [None]:
class GenerateKeyword:
    """Extracts the highest-weighted keywords of a paragraph via TF-IDF and LDA.

    The paragraph is cleaned with ``TextPreprocessor``, vectorized as a single
    document with ``TfidfVectorizer`` and fed to a
    ``LatentDirichletAllocation`` model; for each fitted topic the terms with
    the largest component weights are reported.

    Args:
        num_topics: Number of LDA topics (one keyword list is returned per topic).
        max_features: Vocabulary size cap passed to the TF-IDF vectorizer.
        random_state: Optional seed forwarded to the LDA model so that
            repeated runs give the same keywords. ``None`` keeps the
            library's default (unseeded) behavior.
    """

    def __init__(self, num_topics=1, max_features=1000, random_state=None):
        self.num_topics = num_topics
        self.max_features = max_features
        self.random_state = random_state
        self.tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
        self.lda_model = LatentDirichletAllocation(
            n_components=num_topics, random_state=random_state
        )
        self.text_preprocessor = TextPreprocessor()

    def generate_topics(self, paragraph, top_n=10):
        """Return the top keywords of ``paragraph``, one list per topic.

        Args:
            paragraph: Raw text to analyze.
            top_n: Maximum number of keywords to return per topic.

        Returns:
            A list with one keyword list per topic, or an empty list when the
            paragraph contains nothing after preprocessing.
        """
        preprocessed_paragraph = self.text_preprocessor.preprocess_text(paragraph)
        # The vectorizer cannot build a vocabulary from an empty document, so
        # report "no topics" instead of letting it raise.
        if not preprocessed_paragraph.strip():
            return []

        tfidf_matrix = self.calculate_tfidf(preprocessed_paragraph)
        return self.extract_topics(tfidf_matrix, top_n=top_n)

    def calculate_tfidf(self, text):
        """Fit the TF-IDF vectorizer on ``text`` (as one document) and return its matrix."""
        return self.tfidf_vectorizer.fit_transform([text])

    def extract_topics(self, tfidf_matrix, top_n=10):
        """Fit LDA on ``tfidf_matrix`` and list each topic's strongest terms.

        Must be called after ``calculate_tfidf`` so that the vectorizer's
        vocabulary corresponds to the columns of ``tfidf_matrix``.

        Args:
            tfidf_matrix: Document-term matrix produced by ``calculate_tfidf``.
            top_n: Maximum number of keywords per topic; values below zero
                are treated as zero.

        Returns:
            A list of keyword lists, ordered from highest to lowest weight.
        """
        top_n = max(int(top_n), 0)

        # Reuse the model configured in __init__; fitting again simply
        # replaces the previously learned components.
        self.lda_model.fit(tfidf_matrix)

        # Look the vocabulary up once instead of once per keyword.
        feature_names = self.tfidf_vectorizer.get_feature_names_out()

        topics = []
        for topic_weights in self.lda_model.components_:
            # argsort is ascending: reverse it, then keep the leading top_n.
            ranked_indices = topic_weights.argsort()[::-1][:top_n]
            topics.append([feature_names[i] for i in ranked_indices])

        return topics

In [None]:
class QuestionAnswer:
    """Answers a question by returning the paragraph sentence closest to it.

    The paragraph is split into sentences with TextBlob; the question and the
    sentences are embedded with a SentenceTransformer model and the sentence
    with the highest cosine similarity to the question is the answer.

    Args:
        paragraph: The text to search for an answer.
        user_request: The question to answer.
        model_name: Name of the SentenceTransformer model used for embeddings.
    """

    # Loaded embedding models keyed by model name. Shared by every instance
    # so the model is constructed once per process instead of on each
    # embed_sentences call.
    _model_cache = {}

    def __init__(self, paragraph, user_request, model_name='all-MiniLM-L6-v2'):
        self.question = user_request
        self.paragraph = paragraph
        self.model_name = model_name

        # The TF-IDF representation is not used by answer_question; it is kept
        # so that code reading these attributes keeps working.
        self.vectorizer = TfidfVectorizer()
        try:
            self.paragraph_vector = self.vectorizer.fit_transform([paragraph])
        except ValueError:
            # Raised when the paragraph yields no vocabulary (e.g. it is
            # empty); that must not prevent constructing the object.
            self.paragraph_vector = None

        self.sentences = self._split_sentences(paragraph)

    def _split_sentences(self, paragraph):
        """Split ``paragraph`` into a list of sentence strings using TextBlob."""
        return [str(sentence) for sentence in TextBlob(paragraph).sentences]

    def _get_model(self):
        """Return the embedding model for ``self.model_name``, loading it on first use."""
        model = self._model_cache.get(self.model_name)
        if model is None:
            model = SentenceTransformer(self.model_name)
            self._model_cache[self.model_name] = model
        return model

    def embed_sentences(self, sentences):
        """Encode ``sentences`` (a list of strings) into embedding vectors."""
        return self._get_model().encode(sentences)

    def answer_question(self):
        """Return the sentence most similar to the question.

        Returns:
            The best-matching sentence, or an empty string when the paragraph
            contains no sentences to choose from.
        """
        if not self.sentences:
            return ""

        question_vector = self.embed_sentences([self.question])
        sentence_vectors = self.embed_sentences(self.sentences)

        # similarities has one row (the question) and one column per sentence,
        # so the flat argmax is directly the sentence index.
        similarities = cosine_similarity(question_vector, sentence_vectors)
        best_index = int(similarities.argmax())

        return self.sentences[best_index]

In [None]:
class Summarization:
    """Thin wrapper around a Hugging Face summarization pipeline."""

    def __init__(self):
        """Build the summarization pipeline backed by Falconsai/text_summarization."""
        # Imported here so that transformers is only required when a
        # summarizer is actually created.
        from transformers import pipeline as build_pipeline

        self.model = build_pipeline("summarization", model="Falconsai/text_summarization")

    def summarize(self, paragraph):
        """Return the summary text the pipeline produces for ``paragraph``."""
        outputs = self.model(paragraph)
        first_result = outputs[0]
        return first_result['summary_text']

***SUMMARIZATION AND KEYWORD GENERATION*** = Advancements in artificial intelligence (AI) have significantly impacted various industries. Machine learning algorithms, a subset of AI, enable computers to learn and make decisions without explicit programming. In healthcare, AI is being employed for diagnostics and personalized treatment plans. The financial sector utilizes AI for fraud detection and risk management. Autonomous vehicles leverage AI to enhance navigation and safety. Natural language processing (NLP) allows computers to understand and generate human-like text, improving chatbots and language translation. Despite the benefits, ethical considerations, such as bias in algorithms, pose challenges for the widespread adoption of AI technologies.

***QUESTION ANSWER*** = I remember when I first arrived in the United States. Even before the plane landed, the little windows in the airplane revealed snow and ice-covered houses and buildings.
As I walked off the plane, cold air crept through the corrugated ramp that led to the airport terminal. Some people inside the airport were wearing big coats and hats, which I had seen on
television, but never up close.
I felt a little dizzy and needed to sit down, and then my cell phone rang. It was my Aunt Sophia. She was waiting for me outside in the passenger pick-up area, so I walked quickly to the exit,
forgetting all about my luggage.
When the sliding glass door opened to the outside, there was my aunt–a woman I hadn’t seen in over ten years–wearing a parka and waving her arms frantically in my direction.

In [None]:
class IntentRecognition:
    """Keyword-based intent detector for requests of the form: instruction + "paragraph".

    After ``recognize_intent`` (or ``extract_paragraph``) has run,
    ``input_paragraph`` holds the text found between the first pair of double
    quotes and ``user_request`` holds the rest of the input without the quotes.
    """

    def __init__(self):
        # Insertion order is the match priority: recognize_intent returns the
        # first intent with a matching keyword, so the generic question words
        # are listed last.
        self.intent_keywords = {
            "Summarization": ["summarize", "summary", "brief"],
            "Topic Generation": ["topic", "main idea", "key points"],
            "Keyword Generation": ["keywords"],
            "Question Answering": ["what", "why", "explain", "tell me about","who"]
        }

        self.input_paragraph = ""
        self.user_request = ""

    def extract_paragraph(self, user_input):
        """Split ``user_input`` into the quoted paragraph and the surrounding request.

        Sets ``self.input_paragraph`` to the text between the first pair of
        double quotes (which may span several lines) and ``self.user_request``
        to the text before and after it, joined by a single space. When no
        quoted paragraph is present, the paragraph is empty and the whole
        input is treated as the request.
        """
        # DOTALL lets the paragraph contain newlines.
        match = re.search(r'"(.*?)"', user_input, flags=re.DOTALL)

        if match is None:
            # Reset both fields so nothing from a previous call leaks through.
            self.input_paragraph = ""
            self.user_request = user_input.strip()
            return

        self.input_paragraph = match.group(1)
        # Slice around the whole match so the quote characters themselves
        # never end up in the request.
        before = user_input[:match.start()].strip()
        after = user_input[match.end():].strip()
        self.user_request = " ".join(part for part in (before, after) if part)

    def recognize_intent(self, user_input):
        """Return the intent named by the request part of ``user_input``.

        The request is matched case-insensitively against each intent's
        keywords (substring match) in the order of ``self.intent_keywords``;
        the first intent with a hit wins.

        Returns:
            One of the keys of ``self.intent_keywords``, or ``"Unknown"``.
        """
        self.extract_paragraph(user_input)
        request = self.user_request.lower()

        for intent, keywords in self.intent_keywords.items():
            if any(keyword.lower() in request for keyword in keywords):
                return intent

        return "Unknown"

if __name__ == "__main__":
    recognizer = IntentRecognition()

    user_input = input("Hey! My name is DistilBot. I can assist you with tasks related to Keyword Extraction and Summarization. I can also answer queries related to your paragraph. How can i help you today?")
    intent = recognizer.recognize_intent(user_input)
    paragraph = recognizer.input_paragraph
    user_request = recognizer.user_request

    print("Result: ")

    if intent == "Unknown":
        print("Please input correct task")
    elif not paragraph.strip():
        # Every task needs a paragraph to work on; don't run a model on nothing.
        print('Please include the paragraph in double quotes, e.g. Summarize "your text here"')
    elif intent == "Question Answering":
        qa = QuestionAnswer(paragraph, user_request)
        print(qa.answer_question())
    elif intent in ("Keyword Generation", "Topic Generation"):
        # Both intents are served by the same TF-IDF + LDA keyword extractor.
        topic_generator = GenerateKeyword()
        topics = topic_generator.generate_topics(paragraph)
        print(topics)
    elif intent == "Summarization":
        s = Summarization()
        print(s.summarize(paragraph))
    else:
        print("Please input correct task")